volk_32f_8u_polarbutterfly_32f.h
/* -*- c++ -*- */
/*
 * Copyright 2015 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

#ifndef VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLY_32F_H_
#define VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLY_32F_H_
#include <math.h>
#include <stdbool.h> /* for 'bool' in the generic kernel */
#include <string.h>  /* for memcpy() in the SIMD kernels */
#include <volk/volk_8u_x2_encodeframepolar_8u.h>

static inline float llr_odd(const float la, const float lb)
{
    const float ala = fabsf(la);
    const float alb = fabsf(lb);
    return copysignf(1.0f, la) * copysignf(1.0f, lb) * (ala > alb ? alb : ala);
}
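
/*
 * llr_odd() is the min-sum approximation of the polar decoder f-function:
 * instead of the exact f(la, lb) = 2 * atanh(tanh(la / 2) * tanh(lb / 2))
 * it computes sign(la) * sign(lb) * min(|la|, |lb|), trading a little
 * accuracy for a much cheaper update. Two spot checks with illustrative
 * values:
 *
 *   llr_odd(2.0f, -0.5f);  // == -0.5f: signs multiply, magnitude is the min
 *   llr_odd(-3.0f, -1.0f); // ==  1.0f
 */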

static inline void llr_odd_stages(
    float* llrs, int min_stage, const int depth, const int frame_size, const int row)
{
    int loop_stage = depth - 1;
    float* dst_llr_ptr;
    float* src_llr_ptr;
    int stage_size = 0x01 << loop_stage;

    int el;
    while (min_stage <= loop_stage) {
        dst_llr_ptr = llrs + loop_stage * frame_size + row;
        src_llr_ptr = dst_llr_ptr + frame_size;
        for (el = 0; el < stage_size; el++) {
            *dst_llr_ptr++ = llr_odd(*src_llr_ptr, *(src_llr_ptr + 1));
            src_llr_ptr += 2;
        }

        --loop_stage;
        stage_size >>= 1;
    }
}
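
/*
 * The indexing above implies a flat buffer layout: stage s occupies
 * llrs[s * frame_size .. s * frame_size + frame_size - 1], with the last
 * stage holding the received channel LLRs and stage 0 the decision LLR of
 * the current bit. Each pass of the while-loop pairs neighboring LLRs of
 * the next-higher stage, e.g. for stage_size == 4:
 *
 *   dst[0] = llr_odd(src[0], src[1]);
 *   dst[1] = llr_odd(src[2], src[3]);
 *   dst[2] = llr_odd(src[4], src[5]);
 *   dst[3] = llr_odd(src[6], src[7]);
 */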

static inline float llr_even(const float la, const float lb, const unsigned char f)
{
    switch (f) {
    case 0:
        return lb + la;
    default:
        return lb - la;
    }
}
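
/*
 * llr_even() is the polar decoder g-function
 * g(la, lb, u) = lb + (1 - 2u) * la for a partial-sum bit u in {0, 1}:
 * the already decoded bit u selects whether the upper-branch LLR la is
 * added to or subtracted from lb.
 */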

static inline void
even_u_values(unsigned char* u_even, const unsigned char* u, const int u_num)
{
    u++;
    int i;
    for (i = 1; i < u_num; i += 2) {
        *u_even++ = *u;
        u += 2;
    }
}

static inline void
odd_xor_even_values(unsigned char* u_xor, const unsigned char* u, const int u_num)
{
    int i;
    for (i = 1; i < u_num; i += 2) {
        *u_xor++ = *u ^ *(u + 1);
        u += 2;
    }
}
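
/*
 * Together these helpers perform one step of partial re-encoding for
 * successive cancellation decoding. For u = {u0, u1, u2, u3} and
 * u_num == 4, even_u_values() writes {u1, u3} and odd_xor_even_values()
 * writes {u0 ^ u1, u2 ^ u3}; the two half-length vectors become the u
 * inputs of the recursive calls in the generic butterfly below.
 */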

static inline int calculate_max_stage_depth_for_row(const int frame_exp, const int row)
{
    int max_stage_depth = 0;
    int half_stage_size = 0x01;
    int stage_size = half_stage_size << 1;
    while (max_stage_depth < (frame_exp - 1)) { // last stage holds received values.
        if (!(row % stage_size < half_stage_size)) {
            break;
        }
        half_stage_size <<= 1;
        stage_size <<= 1;
        max_stage_depth++;
    }
    return max_stage_depth;
}
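
/*
 * For row > 0 the loop above is equivalent to counting the trailing zero
 * bits of row, capped at frame_exp - 1; row == 0 always reaches the cap.
 * A sketch of that equivalence, assuming a GCC/Clang builtin:
 *
 *   int depth = row ? __builtin_ctz(row) : frame_exp - 1;
 *   if (depth > frame_exp - 1)
 *       depth = frame_exp - 1;
 *   // depth == calculate_max_stage_depth_for_row(frame_exp, row)
 */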

#ifdef LV_HAVE_GENERIC

static inline void volk_32f_8u_polarbutterfly_32f_generic(float* llrs,
                                                          unsigned char* u,
                                                          const int frame_exp,
                                                          const int stage,
                                                          const int u_num,
                                                          const int row)
{
    const int frame_size = 0x01 << frame_exp;
    const int next_stage = stage + 1;

    const int half_stage_size = 0x01 << stage;
    const int stage_size = half_stage_size << 1;

    const bool is_upper_stage_half = row % stage_size < half_stage_size;

    // this is a natural bit order implementation
    float* next_llrs = llrs + frame_size; // LLRs are stored in a consecutive array.
    float* call_row_llr = llrs + row;

    const int section = row - (row % stage_size);
    const int jump_size = ((row % half_stage_size) << 1) % stage_size;

    const int next_upper_row = section + jump_size;
    const int next_lower_row = next_upper_row + 1;

    const float* upper_right_llr_ptr = next_llrs + next_upper_row;
    const float* lower_right_llr_ptr = next_llrs + next_lower_row;

    if (!is_upper_stage_half) {
        const int u_pos = u_num >> stage;
        const unsigned char f = u[u_pos - 1];
        *call_row_llr = llr_even(*upper_right_llr_ptr, *lower_right_llr_ptr, f);
        return;
    }

    if (frame_exp > next_stage) {
        unsigned char* u_half = u + frame_size;
        odd_xor_even_values(u_half, u, u_num);
        volk_32f_8u_polarbutterfly_32f_generic(
            next_llrs, u_half, frame_exp, next_stage, u_num, next_upper_row);

        even_u_values(u_half, u, u_num);
        volk_32f_8u_polarbutterfly_32f_generic(
            next_llrs, u_half, frame_exp, next_stage, u_num, next_lower_row);
    }

    *call_row_llr = llr_odd(*upper_right_llr_ptr, *lower_right_llr_ptr);
}

#endif /* LV_HAVE_GENERIC */
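
/*
 * A minimal usage sketch for one successive cancellation decoding step,
 * called through the VOLK dispatcher. The buffer sizes are assumptions
 * inferred from the indexing in this file: llrs needs
 * (frame_exp + 1) * frame_size floats, with the received channel LLRs in
 * the last stage, and u needs headroom beyond the decoded bits because
 * the SIMD kernels use u + frame_size and u + 2 * frame_size as scratch.
 *
 *   #include <volk/volk.h>
 *
 *   const int frame_exp = 10;
 *   const int frame_size = 1 << frame_exp;
 *   float* llrs = (float*)volk_malloc(
 *       sizeof(float) * frame_size * (frame_exp + 1), volk_get_alignment());
 *   unsigned char* u = (unsigned char*)volk_malloc(
 *       sizeof(unsigned char) * frame_size * 3, volk_get_alignment());
 *   // fill llrs + frame_exp * frame_size with channel LLRs, then for each
 *   // bit index 'row' with 'u_num' bits known so far:
 *   volk_32f_8u_polarbutterfly_32f(llrs, u, frame_exp, 0, u_num, row);
 *   u[u_num] = llrs[row] < 0.0f; // hard decision on a non-frozen bit,
 *                                // assuming the usual LLR sign convention
 */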

#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

static inline void volk_32f_8u_polarbutterfly_32f_u_avx(float* llrs,
                                                        unsigned char* u,
                                                        const int frame_exp,
                                                        const int stage,
                                                        const int u_num,
                                                        const int row)
{
    const int frame_size = 0x01 << frame_exp;
    if (row % 2) { // for odd rows just do the only necessary calculation and return.
        const float* next_llrs = llrs + frame_size + row;
        *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]);
        return;
    }

    const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row);
    if (max_stage_depth < 3) { // vectorized version needs larger vectors.
        volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row);
        return;
    }

    int loop_stage = max_stage_depth;
    int stage_size = 0x01 << loop_stage;

    float* src_llr_ptr;
    float* dst_llr_ptr;

    __m256 src0, src1, dst;

    if (row) { // not necessary for row 0, i.e. the first bit to be decoded.
        // first do bit combination for all stages:
        // effectively encode some decoded bits again.
        unsigned char* u_target = u + frame_size;
        unsigned char* u_temp = u + 2 * frame_size;
        memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size);

        if (stage_size > 15) {
            _mm256_zeroupper();
            volk_8u_x2_encodeframepolar_8u_u_ssse3(u_target, u_temp, stage_size);
        } else {
            volk_8u_x2_encodeframepolar_8u_generic(u_target, u_temp, stage_size);
        }

        src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size;
        dst_llr_ptr = llrs + max_stage_depth * frame_size + row;

        __m128i fbits;

        int p;
        for (p = 0; p < stage_size; p += 8) {
            _mm256_zeroupper();
            fbits = _mm_loadu_si128((__m128i*)u_target);
            u_target += 8;

            src0 = _mm256_loadu_ps(src_llr_ptr);
            src1 = _mm256_loadu_ps(src_llr_ptr + 8);
            src_llr_ptr += 16;

            dst = _mm256_polar_fsign_add_llrs(src0, src1, fbits);

            _mm256_storeu_ps(dst_llr_ptr, dst);
            dst_llr_ptr += 8;
        }

        --loop_stage;
        stage_size >>= 1;
    }

    const int min_stage = stage > 2 ? stage : 2;

    _mm256_zeroall(); // zero the YMM registers to avoid SSE/AVX transition penalties.

    int el;
    while (min_stage < loop_stage) {
        dst_llr_ptr = llrs + loop_stage * frame_size + row;
        src_llr_ptr = dst_llr_ptr + frame_size;
        for (el = 0; el < stage_size; el += 8) {
            src0 = _mm256_loadu_ps(src_llr_ptr);
            src_llr_ptr += 8;
            src1 = _mm256_loadu_ps(src_llr_ptr);
            src_llr_ptr += 8;

            dst = _mm256_polar_minsum_llrs(src0, src1);

            _mm256_storeu_ps(dst_llr_ptr, dst);
            dst_llr_ptr += 8;
        }

        --loop_stage;
        stage_size >>= 1;
    }

    // for stages < 3, the vectors are too small.
    llr_odd_stages(llrs, stage, loop_stage + 1, frame_size, row);
}

#endif /* LV_HAVE_AVX */

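/*
 * A note on the variant below: it mirrors the AVX kernel above and differs
 * only in the f-bit step, which calls _mm256_polar_fsign_add_llrs_avx2()
 * from volk_avx2_intrinsics.h to exploit AVX2's 256-bit integer
 * instructions.
 */
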
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32f_8u_polarbutterfly_32f_u_avx2(float* llrs,
                                                         unsigned char* u,
                                                         const int frame_exp,
                                                         const int stage,
                                                         const int u_num,
                                                         const int row)
{
    const int frame_size = 0x01 << frame_exp;
    if (row % 2) { // for odd rows just do the only necessary calculation and return.
        const float* next_llrs = llrs + frame_size + row;
        *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]);
        return;
    }

    const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row);
    if (max_stage_depth < 3) { // vectorized version needs larger vectors.
        volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row);
        return;
    }

    int loop_stage = max_stage_depth;
    int stage_size = 0x01 << loop_stage;

    float* src_llr_ptr;
    float* dst_llr_ptr;

    __m256 src0, src1, dst;

    if (row) { // not necessary for row 0, i.e. the first bit to be decoded.
        // first do bit combination for all stages:
        // effectively encode some decoded bits again.
        unsigned char* u_target = u + frame_size;
        unsigned char* u_temp = u + 2 * frame_size;
        memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size);

        if (stage_size > 15) {
            _mm256_zeroupper();
            volk_8u_x2_encodeframepolar_8u_u_ssse3(u_target, u_temp, stage_size);
        } else {
            volk_8u_x2_encodeframepolar_8u_generic(u_target, u_temp, stage_size);
        }

        src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size;
        dst_llr_ptr = llrs + max_stage_depth * frame_size + row;

        __m128i fbits;

        int p;
        for (p = 0; p < stage_size; p += 8) {
            _mm256_zeroupper();
            fbits = _mm_loadu_si128((__m128i*)u_target);
            u_target += 8;

            src0 = _mm256_loadu_ps(src_llr_ptr);
            src1 = _mm256_loadu_ps(src_llr_ptr + 8);
            src_llr_ptr += 16;

            dst = _mm256_polar_fsign_add_llrs_avx2(src0, src1, fbits);

            _mm256_storeu_ps(dst_llr_ptr, dst);
            dst_llr_ptr += 8;
        }

        --loop_stage;
        stage_size >>= 1;
    }

    const int min_stage = stage > 2 ? stage : 2;

    _mm256_zeroall(); // zero the YMM registers to avoid SSE/AVX transition penalties.

    int el;
    while (min_stage < loop_stage) {
        dst_llr_ptr = llrs + loop_stage * frame_size + row;
        src_llr_ptr = dst_llr_ptr + frame_size;
        for (el = 0; el < stage_size; el += 8) {
            src0 = _mm256_loadu_ps(src_llr_ptr);
            src_llr_ptr += 8;
            src1 = _mm256_loadu_ps(src_llr_ptr);
            src_llr_ptr += 8;

            dst = _mm256_polar_minsum_llrs(src0, src1);

            _mm256_storeu_ps(dst_llr_ptr, dst);
            dst_llr_ptr += 8;
        }

        --loop_stage;
        stage_size >>= 1;
    }

    // for stages < 3, the vectors are too small.
    llr_odd_stages(llrs, stage, loop_stage + 1, frame_size, row);
}

#endif /* LV_HAVE_AVX2 */

#endif /* VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLY_32F_H_ */