GNU Radio 3.4.0 C++ API
volk_16i_max_star_horizontal_16i_a16.h
Go to the documentation of this file.
00001 #ifndef INCLUDED_volk_16i_max_star_horizontal_16i_a16_H
00002 #define INCLUDED_volk_16i_max_star_horizontal_16i_a16_H
00003 
00004 
00005 #include<inttypes.h>
00006 #include<stdio.h>       
00007 
00008 
00009 #if LV_HAVE_SSSE3
00010 
00011 #include<xmmintrin.h>
00012 #include<emmintrin.h>
00013 #include<tmmintrin.h>
00014 
/*
 * SSSE3 pairwise maximum of adjacent 16-bit integers.
 *
 * For every complete pair (src0[2k], src0[2k+1]) the larger element is
 * written to target[k]. "Larger" is decided from the sign of the 16-bit
 * wrapped difference src0[2k] - src0[2k+1], exactly as in the scalar tail
 * below, so the result is only a true maximum when that difference fits in
 * an int16_t.
 *
 * target    - output buffer; receives one int16_t per complete input pair.
 *             Written with aligned 128-bit stores, so it must be 16-byte
 *             aligned.
 * src0      - input buffer; read with aligned 128-bit loads, so it must be
 *             16-byte aligned.
 * num_bytes - size of the input in bytes. A trailing element without a
 *             partner (odd number of int16_t values) is ignored.
 */
static inline void volk_16i_max_star_horizontal_16i_a16_ssse3(int16_t* target, int16_t* src0, unsigned int num_bytes) {

  /* Byte-shuffle patterns that gather the first element of each pair into
   * the low (shufmask0) or high (shufmask1) 8 bytes of the result; 0xff
   * makes _mm_shuffle_epi8 emit a zero byte. */
  static const uint8_t shufmask0[16] = {0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
  static const uint8_t shufmask1[16] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d};
  /* Adding 2 to a shuffle index selects the second element of the pair. */
  static const uint8_t andmask0[16] = {0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
  static const uint8_t andmask1[16] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02};

  /* The tables are plain byte arrays with no alignment guarantee, so they
   * are fetched with unaligned loads. */
  const __m128i shuf_lo = _mm_loadu_si128((const __m128i*)shufmask0);
  const __m128i shuf_hi = _mm_loadu_si128((const __m128i*)shufmask1);
  const __m128i bump_lo = _mm_loadu_si128((const __m128i*)andmask0);
  const __m128i bump_hi = _mm_loadu_si128((const __m128i*)andmask1);
  const __m128i zero = _mm_setzero_si128();

  const __m128i* p_src0 = (const __m128i*)src0;
  __m128i* p_target = (__m128i*)target;

  const unsigned int bound = num_bytes >> 5;               /* full 32-byte input chunks   */
  const unsigned int intermediate = (num_bytes >> 4) & 1;  /* one extra 16-byte chunk?    */
  const unsigned int leftovers = (num_bytes >> 1) & 7;     /* int16_t values left for C   */
  const unsigned int tail_start = (bound << 4) + (intermediate << 3);
  unsigned int i;

  /* 16 inputs -> 8 outputs per iteration. */
  for (i = 0; i < bound; ++i) {
    __m128i in0 = _mm_load_si128(p_src0);
    __m128i in1 = _mm_load_si128(p_src0 + 1);
    __m128i diff, second_wins, sel0, sel1;
    p_src0 += 2;

    /* diff holds first - second for the four pairs of in0 (low half) and
     * the four pairs of in1 (high half). */
    diff = _mm_hsub_epi16(in0, in1);
    /* All-ones in every 16-bit lane whose difference is negative. */
    second_wins = _mm_cmpgt_epi16(zero, diff);

    sel0 = _mm_add_epi8(_mm_and_si128(second_wins, bump_lo), shuf_lo);
    sel1 = _mm_add_epi8(_mm_and_si128(second_wins, bump_hi), shuf_hi);

    in0 = _mm_shuffle_epi8(in0, sel0);
    in1 = _mm_shuffle_epi8(in1, sel1);

    /* The two shuffles fill disjoint halves (the rest is zero), so the add
     * simply merges them. */
    _mm_store_si128(p_target, _mm_add_epi16(in0, in1));
    p_target += 1;
  }

  /* 8 inputs -> 4 outputs for a remaining 16-byte chunk. */
  if (intermediate) {
    __m128i in0 = _mm_load_si128(p_src0);
    /* Only the low four differences are used; pass in0 twice so that no
     * unrelated register is read. */
    __m128i diff = _mm_hsub_epi16(in0, in0);
    __m128i second_wins = _mm_cmpgt_epi16(zero, diff);
    __m128i sel0 = _mm_add_epi8(_mm_and_si128(second_wins, bump_lo), shuf_lo);

    in0 = _mm_shuffle_epi8(in0, sel0);

    /* Store just the low 8 bytes (four int16_t results). */
    _mm_storel_epi64(p_target, in0);
  }

  /* Scalar tail: only complete pairs are processed so src0 is never read
   * past num_bytes. */
  for (i = 0; i + 1 < leftovers; i += 2) {
    const int16_t first = src0[tail_start + i];
    const int16_t second = src0[tail_start + i + 1];
    target[(tail_start + i) >> 1] = ((int16_t)(first - second) > 0) ? first : second;
  }

}
00108  
00109 #endif /*LV_HAVE_SSSE3*/
00110 
00111 
00112 #if LV_HAVE_GENERIC
/*
 * Portable pairwise maximum of adjacent 16-bit integers.
 *
 * For every complete pair (src0[2k], src0[2k+1]) the larger element is
 * written to target[k]. The decision is taken from the sign of the
 * difference after it has been narrowed to int16_t, so when the true
 * difference does not fit in 16 bits the "smaller" element may be chosen.
 *
 * target    - output buffer; receives one int16_t per complete input pair.
 * src0      - input buffer holding num_bytes / 2 int16_t values.
 * num_bytes - size of the input in bytes. A trailing element without a
 *             partner (odd number of int16_t values) is ignored, so src0 is
 *             never read beyond num_bytes.
 */
static inline void volk_16i_max_star_horizontal_16i_a16_generic(int16_t* target, int16_t* src0, unsigned int num_bytes) {

  const unsigned int num_points = num_bytes >> 1;  /* int16_t values in src0 */
  const unsigned int num_pairs = num_points >> 1;  /* complete pairs only    */
  unsigned int k;

  for (k = 0; k < num_pairs; ++k) {
    const int16_t first = src0[2 * k];
    const int16_t second = src0[2 * k + 1];
    target[k] = ((int16_t)(first - second) > 0) ? first : second;
  }

}
00125 
00126 
00127 
00128 #endif /*LV_HAVE_GENERIC*/
00129 
00130 #endif /*INCLUDED_volk_16i_max_star_horizontal_16i_a16_H*/