Vector Optimized Library of Kernels  2.3
Architecture-tuned implementations of math kernels
volk_16i_convert_8i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
53 #ifndef INCLUDED_volk_16i_convert_8i_u_H
54 #define INCLUDED_volk_16i_convert_8i_u_H
55 
56 #include <inttypes.h>
57 #include <stdio.h>
58 
59 #ifdef LV_HAVE_AVX2
60 #include <immintrin.h>
61 
/*!
 * \brief Converts 16-bit samples to 8-bit samples by keeping the high byte of
 *        each input (AVX2 kernel using unaligned loads and stores).
 *
 * Every output element is (int8_t)(input >> 8). Complete groups of 32 samples
 * are processed by the vector loop; the remaining num_points % 32 samples are
 * converted by a scalar loop.
 *
 * \param outputVector destination buffer; num_points int8_t values are written
 * \param inputVector  source buffer; num_points int16_t values are read
 * \param num_points   number of samples to convert
 */
static inline void volk_16i_convert_8i_u_avx2(int8_t* outputVector,
                                              const int16_t* inputVector,
                                              unsigned int num_points)
{
    const unsigned int thirtysecondPoints = num_points / 32;

    int8_t* outputVectorPtr = outputVector;
    const int16_t* inputPtr = inputVector;
    unsigned int number;

    for (number = 0; number < thirtysecondPoints; number++) {
        // Load 32 values as two vectors of 16 (no alignment requirement)
        __m256i inputVal1 = _mm256_loadu_si256((const __m256i*)inputPtr);
        __m256i inputVal2 = _mm256_loadu_si256((const __m256i*)(inputPtr + 16));
        __m256i ret;
        inputPtr += 32;

        // Arithmetic shift moves the high byte down and keeps the sign, so
        // every 16-bit lane already fits in int8_t before packing
        inputVal1 = _mm256_srai_epi16(inputVal1, 8);
        inputVal2 = _mm256_srai_epi16(inputVal2, 8);

        // The pack interleaves the two sources per 128-bit lane; the permute
        // (quadword order 0, 2, 1, 3 == 0xD8) puts the bytes back in sample
        // order
        ret = _mm256_packs_epi16(inputVal1, inputVal2);
        ret = _mm256_permute4x64_epi64(ret, 0xD8);

        _mm256_storeu_si256((__m256i*)outputVectorPtr, ret);
        outputVectorPtr += 32;
    }

    // Scalar tail for the samples that do not fill a whole group of 32
    for (number = thirtysecondPoints * 32; number < num_points; number++) {
        outputVector[number] = (int8_t)(inputVector[number] >> 8);
    }
}
99 #endif /* LV_HAVE_AVX2 */
100 
101 
102 #ifdef LV_HAVE_SSE2
103 #include <emmintrin.h>
104 
/*!
 * \brief Converts 16-bit samples to 8-bit samples by keeping the high byte of
 *        each input (SSE2 kernel using unaligned loads and stores).
 *
 * Every output element is (int8_t)(input >> 8). Complete groups of 16 samples
 * are processed by the vector loop; the remaining num_points % 16 samples are
 * converted by a scalar loop.
 *
 * \param outputVector destination buffer; num_points int8_t values are written
 * \param inputVector  source buffer; num_points int16_t values are read
 * \param num_points   number of samples to convert
 */
static inline void volk_16i_convert_8i_u_sse2(int8_t* outputVector,
                                              const int16_t* inputVector,
                                              unsigned int num_points)
{
    const unsigned int sixteenthPoints = num_points / 16;

    int8_t* outputVectorPtr = outputVector;
    const int16_t* inputPtr = inputVector;
    unsigned int number;

    for (number = 0; number < sixteenthPoints; number++) {
        // Load 16 values as two vectors of 8 (no alignment requirement)
        __m128i inputVal1 = _mm_loadu_si128((const __m128i*)inputPtr);
        __m128i inputVal2 = _mm_loadu_si128((const __m128i*)(inputPtr + 8));
        __m128i ret;
        inputPtr += 16;

        // Arithmetic shift moves the high byte down and keeps the sign, so
        // every 16-bit lane already fits in int8_t before packing
        inputVal1 = _mm_srai_epi16(inputVal1, 8);
        inputVal2 = _mm_srai_epi16(inputVal2, 8);

        ret = _mm_packs_epi16(inputVal1, inputVal2);

        _mm_storeu_si128((__m128i*)outputVectorPtr, ret);
        outputVectorPtr += 16;
    }

    // Scalar tail for the samples that do not fill a whole group of 16
    for (number = sixteenthPoints * 16; number < num_points; number++) {
        outputVector[number] = (int8_t)(inputVector[number] >> 8);
    }
}
141 #endif /* LV_HAVE_SSE2 */
142 
143 
144 #ifdef LV_HAVE_GENERIC
145 
/*!
 * \brief Portable scalar conversion of 16-bit samples to 8-bit samples.
 *
 * Each output element is the input shifted right by 8 bits and cast to
 * int8_t, i.e. the high byte of the input sample.
 *
 * \param outputVector destination buffer; num_points int8_t values are written
 * \param inputVector  source buffer; num_points int16_t values are read
 * \param num_points   number of samples to convert
 */
static inline void volk_16i_convert_8i_generic(int8_t* outputVector,
                                               const int16_t* inputVector,
                                               unsigned int num_points)
{
    unsigned int i;

    for (i = 0; i < num_points; i++) {
        const int16_t sample = inputVector[i];
        outputVector[i] = (int8_t)(sample >> 8);
    }
}
158 #endif /* LV_HAVE_GENERIC */
159 
160 
161 #endif /* INCLUDED_volk_16i_convert_8i_u_H */
162 #ifndef INCLUDED_volk_16i_convert_8i_a_H
163 #define INCLUDED_volk_16i_convert_8i_a_H
164 
165 #include <inttypes.h>
166 #include <stdio.h>
167 
168 #ifdef LV_HAVE_AVX2
169 #include <immintrin.h>
170 
/*!
 * \brief Converts 16-bit samples to 8-bit samples by keeping the high byte of
 *        each input (AVX2 kernel using aligned loads and stores).
 *
 * Every output element is (int8_t)(input >> 8). Complete groups of 32 samples
 * are processed by the vector loop; the remaining num_points % 32 samples are
 * converted by a scalar loop.
 *
 * The vector loop uses _mm256_load_si256 / _mm256_store_si256, which require
 * 32-byte-aligned addresses, so both buffers must be 32-byte aligned.
 *
 * \param outputVector destination buffer; num_points int8_t values are written
 * \param inputVector  source buffer; num_points int16_t values are read
 * \param num_points   number of samples to convert
 */
static inline void volk_16i_convert_8i_a_avx2(int8_t* outputVector,
                                              const int16_t* inputVector,
                                              unsigned int num_points)
{
    const unsigned int thirtysecondPoints = num_points / 32;

    int8_t* outputVectorPtr = outputVector;
    const int16_t* inputPtr = inputVector;
    unsigned int number;

    for (number = 0; number < thirtysecondPoints; number++) {
        // Load 32 values as two vectors of 16 (aligned loads)
        __m256i inputVal1 = _mm256_load_si256((const __m256i*)inputPtr);
        __m256i inputVal2 = _mm256_load_si256((const __m256i*)(inputPtr + 16));
        __m256i ret;
        inputPtr += 32;

        // Arithmetic shift moves the high byte down and keeps the sign, so
        // every 16-bit lane already fits in int8_t before packing
        inputVal1 = _mm256_srai_epi16(inputVal1, 8);
        inputVal2 = _mm256_srai_epi16(inputVal2, 8);

        // The pack interleaves the two sources per 128-bit lane; the permute
        // (quadword order 0, 2, 1, 3 == 0xD8) puts the bytes back in sample
        // order
        ret = _mm256_packs_epi16(inputVal1, inputVal2);
        ret = _mm256_permute4x64_epi64(ret, 0xD8);

        _mm256_store_si256((__m256i*)outputVectorPtr, ret);
        outputVectorPtr += 32;
    }

    // Scalar tail for the samples that do not fill a whole group of 32
    for (number = thirtysecondPoints * 32; number < num_points; number++) {
        outputVector[number] = (int8_t)(inputVector[number] >> 8);
    }
}
208 #endif /* LV_HAVE_AVX2 */
209 
210 
211 #ifdef LV_HAVE_SSE2
212 #include <emmintrin.h>
213 
/*!
 * \brief Converts 16-bit samples to 8-bit samples by keeping the high byte of
 *        each input (SSE2 kernel using aligned loads and stores).
 *
 * Every output element is (int8_t)(input >> 8). Complete groups of 16 samples
 * are processed by the vector loop; the remaining num_points % 16 samples are
 * converted by a scalar loop.
 *
 * The vector loop uses _mm_load_si128 / _mm_store_si128, which require
 * 16-byte-aligned addresses, so both buffers must be 16-byte aligned.
 *
 * \param outputVector destination buffer; num_points int8_t values are written
 * \param inputVector  source buffer; num_points int16_t values are read
 * \param num_points   number of samples to convert
 */
static inline void volk_16i_convert_8i_a_sse2(int8_t* outputVector,
                                              const int16_t* inputVector,
                                              unsigned int num_points)
{
    const unsigned int sixteenthPoints = num_points / 16;

    int8_t* outputVectorPtr = outputVector;
    const int16_t* inputPtr = inputVector;
    unsigned int number;

    for (number = 0; number < sixteenthPoints; number++) {
        // Load 16 values as two vectors of 8 (aligned loads)
        __m128i inputVal1 = _mm_load_si128((const __m128i*)inputPtr);
        __m128i inputVal2 = _mm_load_si128((const __m128i*)(inputPtr + 8));
        __m128i ret;
        inputPtr += 16;

        // Arithmetic shift moves the high byte down and keeps the sign, so
        // every 16-bit lane already fits in int8_t before packing
        inputVal1 = _mm_srai_epi16(inputVal1, 8);
        inputVal2 = _mm_srai_epi16(inputVal2, 8);

        ret = _mm_packs_epi16(inputVal1, inputVal2);

        _mm_store_si128((__m128i*)outputVectorPtr, ret);
        outputVectorPtr += 16;
    }

    // Scalar tail for the samples that do not fill a whole group of 16
    for (number = sixteenthPoints * 16; number < num_points; number++) {
        outputVector[number] = (int8_t)(inputVector[number] >> 8);
    }
}
250 #endif /* LV_HAVE_SSE2 */
251 
252 
253 #ifdef LV_HAVE_NEON
254 #include <arm_neon.h>
255 
/*!
 * \brief Converts 16-bit samples to 8-bit samples by keeping the high byte of
 *        each input (ARM NEON kernel).
 *
 * Complete groups of 16 samples are handled with NEON intrinsics; the
 * remaining num_points % 16 samples are converted one at a time as
 * (int8_t)(input >> 8).
 *
 * \param outputVector destination buffer; num_points int8_t values are written
 * \param inputVector  source buffer; num_points int16_t values are read
 * \param num_points   number of samples to convert
 */
static inline void volk_16i_convert_8i_neon(int8_t* outputVector,
                                            const int16_t* inputVector,
                                            unsigned int num_points)
{
    const unsigned int vector_iterations = num_points / 16;
    const int16_t* src = inputVector;
    int8_t* dst = outputVector;
    unsigned int block;
    unsigned int idx;

    for (block = 0; block < vector_iterations; block++) {
        // read 2 x 8 samples, shift each right by 8 and narrow to 8 bits
        const int8x8_t low_half = vshrn_n_s16(vld1q_s16(src), 8);
        const int8x8_t high_half = vshrn_n_s16(vld1q_s16(src + 8), 8);

        // join both halves into one 16-byte vector and store it
        vst1q_s8(dst, vcombine_s8(low_half, high_half));

        src += 16;
        dst += 16;
    }

    // leftover samples continue from where the vector loop stopped
    for (idx = vector_iterations * 16; idx < num_points; idx++) {
        *dst++ = (int8_t)(*src++ >> 8);
    }
}
289 #endif /* LV_HAVE_NEON */
290 
291 
292 #ifdef LV_HAVE_GENERIC
293 
/*!
 * \brief Portable scalar conversion of 16-bit samples to 8-bit samples
 *        (entry of the aligned kernel set; the code itself performs plain
 *        element-wise accesses).
 *
 * Each output element is the input shifted right by 8 bits and cast to
 * int8_t, i.e. the high byte of the input sample.
 *
 * \param outputVector destination buffer; num_points int8_t values are written
 * \param inputVector  source buffer; num_points int16_t values are read
 * \param num_points   number of samples to convert
 */
static inline void volk_16i_convert_8i_a_generic(int8_t* outputVector,
                                                 const int16_t* inputVector,
                                                 unsigned int num_points)
{
    const int16_t* src = inputVector;
    int8_t* dst = outputVector;
    unsigned int remaining;

    for (remaining = num_points; remaining != 0; remaining--) {
        *dst = (int8_t)(*src >> 8);
        src++;
        dst++;
    }
}
306 #endif /* LV_HAVE_GENERIC */
307 
308 #endif /* INCLUDED_volk_16i_convert_8i_a_H */
volk_16i_convert_8i_neon
static void volk_16i_convert_8i_neon(int8_t *outputVector, const int16_t *inputVector, unsigned int num_points)
Definition: volk_16i_convert_8i.h:256
volk_16i_convert_8i_u_sse2
static void volk_16i_convert_8i_u_sse2(int8_t *outputVector, const int16_t *inputVector, unsigned int num_points)
Definition: volk_16i_convert_8i.h:105
volk_16i_convert_8i_a_generic
static void volk_16i_convert_8i_a_generic(int8_t *outputVector, const int16_t *inputVector, unsigned int num_points)
Definition: volk_16i_convert_8i.h:294
volk_16i_convert_8i_generic
static void volk_16i_convert_8i_generic(int8_t *outputVector, const int16_t *inputVector, unsigned int num_points)
Definition: volk_16i_convert_8i.h:146
volk_16i_convert_8i_a_sse2
static void volk_16i_convert_8i_a_sse2(int8_t *outputVector, const int16_t *inputVector, unsigned int num_points)
Definition: volk_16i_convert_8i.h:214