libosmocore  0.10.2
Osmocom core library
conv_acc_sse_impl.h

/*
 * Copyright (C) 2013, 2014 Thomas Tsou <tom@tsou.cc>
 *
 * All Rights Reserved
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */

/* Some distributions (notably Alpine Linux) for some strange reason
 * don't have this #define */
#ifndef __always_inline
#define __always_inline inline __attribute__((always_inline))
#endif

extern int sse41_supported;

/* Octo-Viterbi butterfly
 * Compute an 8-wide butterfly generating 16 path decisions and 16 accumulated
 * sums. Inputs are packed 16-bit integers in three 128-bit XMM registers.
 * Two intermediate registers are used and results are set in the upper 4
 * registers.
 *
 * Input:
 * M0 - Path metrics 0 (packed 16-bit integers)
 * M1 - Path metrics 1 (packed 16-bit integers)
 * M2 - Branch metrics (packed 16-bit integers)
 *
 * Output:
 * M2 - Selected and accumulated path metrics 0
 * M4 - Selected and accumulated path metrics 1
 * M3 - Path selections 0
 * M1 - Path selections 1
 */
#define SSE_BUTTERFLY(M0, M1, M2, M3, M4) \
{ \
	M3 = _mm_adds_epi16(M0, M2); \
	M4 = _mm_subs_epi16(M1, M2); \
	M0 = _mm_subs_epi16(M0, M2); \
	M1 = _mm_adds_epi16(M1, M2); \
	M2 = _mm_max_epi16(M3, M4); \
	M3 = _mm_or_si128(_mm_cmpgt_epi16(M3, M4), _mm_cmpeq_epi16(M3, M4)); \
	M4 = _mm_max_epi16(M0, M1); \
	M1 = _mm_or_si128(_mm_cmpgt_epi16(M0, M1), _mm_cmpeq_epi16(M0, M1)); \
}
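
/* Scalar reference for a single butterfly lane (illustrative sketch added
 * for documentation; not part of the original file, names are hypothetical).
 * SSE_BUTTERFLY performs this add-compare-select step on 8 lanes at once,
 * using saturating arithmetic and all-ones/all-zeros compare masks. */
static inline void scalar_butterfly(int16_t pm0, int16_t pm1, int16_t bm,
				    int16_t *sum0, int16_t *sum1,
				    int16_t *sel0, int16_t *sel1)
{
	int16_t a = pm0 + bm;	/* candidates for output state 0 */
	int16_t b = pm1 - bm;
	int16_t c = pm0 - bm;	/* candidates for output state 1 */
	int16_t d = pm1 + bm;

	*sum0 = a > b ? a : b;		/* select and accumulate, metrics 0 */
	*sel0 = a >= b ? -1 : 0;	/* decision mask, as cmpgt|cmpeq yields */
	*sum1 = c > d ? c : d;		/* select and accumulate, metrics 1 */
	*sel1 = c >= d ? -1 : 0;
}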

/* Two lane deinterleaving K = 5:
 * Take 16 interleaved 16-bit integers and deinterleave them into 2 packed
 * 128-bit registers. The operation is summarized below. Four registers are
 * used, with the lower 2 as input and the upper 2 as output.
 *
 * In - 10101010 10101010 10101010 10101010
 * Out - 00000000 11111111 00000000 11111111
 *
 * Input:
 * M0:1 - Packed 16-bit integers
 *
 * Output:
 * M2:3 - Deinterleaved packed 16-bit integers
 */
#define _I8_SHUFFLE_MASK 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0

#define SSE_DEINTERLEAVE_K5(M0, M1, M2, M3) \
{ \
	M2 = _mm_set_epi8(_I8_SHUFFLE_MASK); \
	M0 = _mm_shuffle_epi8(M0, M2); \
	M1 = _mm_shuffle_epi8(M1, M2); \
	M2 = _mm_unpacklo_epi64(M0, M1); \
	M3 = _mm_unpackhi_epi64(M0, M1); \
}
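
/* Scalar reference for the two-lane deinterleave (illustrative sketch added
 * for documentation; not part of the original file). The shuffle/unpack
 * sequence above splits 16 interleaved metrics into their even-index and
 * odd-index halves, one output register each. */
static inline void scalar_deinterleave16(const int16_t *in,
					 int16_t *even, int16_t *odd)
{
	int i;

	for (i = 0; i < 8; i++) {
		even[i] = in[2 * i];
		odd[i] = in[2 * i + 1];
	}
}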

/* Two lane deinterleaving K = 7:
 * Take 64 interleaved 16-bit integers and deinterleave them into 8 packed
 * 128-bit registers. The operation is summarized below. 16 registers are
 * used, with the lower 8 as input and the upper 8 as output.
 *
 * In - 10101010 10101010 10101010 10101010 ...
 * Out - 00000000 11111111 00000000 11111111 ...
 *
 * Input:
 * M0:7 - Packed 16-bit integers
 *
 * Output:
 * M8:15 - Deinterleaved packed 16-bit integers
 */
#define SSE_DEINTERLEAVE_K7(M0, M1, M2, M3, M4, M5, M6, M7, \
			    M8, M9, M10, M11, M12, M13, M14, M15) \
{ \
	M8 = _mm_set_epi8(_I8_SHUFFLE_MASK); \
	M0 = _mm_shuffle_epi8(M0, M8); \
	M1 = _mm_shuffle_epi8(M1, M8); \
	M2 = _mm_shuffle_epi8(M2, M8); \
	M3 = _mm_shuffle_epi8(M3, M8); \
	M4 = _mm_shuffle_epi8(M4, M8); \
	M5 = _mm_shuffle_epi8(M5, M8); \
	M6 = _mm_shuffle_epi8(M6, M8); \
	M7 = _mm_shuffle_epi8(M7, M8); \
	M8 = _mm_unpacklo_epi64(M0, M1); \
	M9 = _mm_unpackhi_epi64(M0, M1); \
	M10 = _mm_unpacklo_epi64(M2, M3); \
	M11 = _mm_unpackhi_epi64(M2, M3); \
	M12 = _mm_unpacklo_epi64(M4, M5); \
	M13 = _mm_unpackhi_epi64(M4, M5); \
	M14 = _mm_unpacklo_epi64(M6, M7); \
	M15 = _mm_unpackhi_epi64(M6, M7); \
}

/* Generate branch metrics N = 2:
 * Compute 16 branch metrics from trellis outputs and input values.
 *
 * Input:
 * M0:3 - 16 x 2 packed 16-bit trellis outputs
 * M4 - Expanded and packed 16-bit input value
 *
 * Output:
 * M6:7 - 16 computed 16-bit branch metrics
 */
#define SSE_BRANCH_METRIC_N2(M0, M1, M2, M3, M4, M6, M7) \
{ \
	M0 = _mm_sign_epi16(M4, M0); \
	M1 = _mm_sign_epi16(M4, M1); \
	M2 = _mm_sign_epi16(M4, M2); \
	M3 = _mm_sign_epi16(M4, M3); \
	M6 = _mm_hadds_epi16(M0, M1); \
	M7 = _mm_hadds_epi16(M2, M3); \
}
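
/* Scalar reference for one rate-1/2 branch metric (illustrative sketch added
 * for documentation; not part of the original file). The _mm_sign_epi16 and
 * _mm_hadds_epi16 pair above correlates the received soft values with the
 * expected trellis output signs, 16 metrics at a time; sign() also zeroes a
 * term when the corresponding trellis output is zero. */
static inline int16_t scalar_branch_metric_n2(const int16_t *val,
					      const int16_t *out)
{
	int16_t b0 = out[0] < 0 ? -val[0] : (out[0] ? val[0] : 0);
	int16_t b1 = out[1] < 0 ? -val[1] : (out[1] ? val[1] : 0);

	return b0 + b1;
}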

/* Generate branch metrics N = 4:
 * Compute 8 branch metrics from trellis outputs and input values. This
 * macro is also reused for N less than 4, in which case the unused soft
 * input values are zero padded.
 *
 * Input:
 * M0:3 - 8 x 4 packed 16-bit trellis outputs
 * M4 - Expanded and packed 16-bit input value
 *
 * Output:
 * M5 - 8 computed 16-bit branch metrics
 */
#define SSE_BRANCH_METRIC_N4(M0, M1, M2, M3, M4, M5) \
{ \
	M0 = _mm_sign_epi16(M4, M0); \
	M1 = _mm_sign_epi16(M4, M1); \
	M2 = _mm_sign_epi16(M4, M2); \
	M3 = _mm_sign_epi16(M4, M3); \
	M0 = _mm_hadds_epi16(M0, M1); \
	M1 = _mm_hadds_epi16(M2, M3); \
	M5 = _mm_hadds_epi16(M0, M1); \
}

/* Horizontal minimum
 * Compute horizontal minimum of packed unsigned 16-bit integers and place
 * result in the low 16-bit element of the source register. Only SSE 4.1
 * has a dedicated minpos instruction. One intermediate register is used
 * if SSE 4.1 is not available. This is a destructive operation and the
 * source register is overwritten.
 *
 * Input:
 * M0 - Packed unsigned 16-bit integers
 *
 * Output:
 * M0 - Minimum value placed in low 16-bit element
 */
#if defined(HAVE_SSE4_1) || defined(HAVE_SSE41)
#define SSE_MINPOS(M0, M1) \
{ \
	if (sse41_supported) { \
		M0 = _mm_minpos_epu16(M0); \
	} else { \
		M1 = _mm_shuffle_epi32(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
		M0 = _mm_min_epi16(M0, M1); \
		M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
		M0 = _mm_min_epi16(M0, M1); \
		M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 0, 1)); \
		M0 = _mm_min_epi16(M0, M1); \
	} \
}
#else
#define SSE_MINPOS(M0, M1) \
{ \
	M1 = _mm_shuffle_epi32(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
	M0 = _mm_min_epi16(M0, M1); \
	M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
	M0 = _mm_min_epi16(M0, M1); \
	M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 0, 1)); \
	M0 = _mm_min_epi16(M0, M1); \
}
#endif
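
/* Scalar reference for the horizontal minimum (illustrative sketch added for
 * documentation; not part of the original file). The shuffle/min ladder in
 * the fallback above reduces 8 packed values to their minimum in the low
 * element; _mm_minpos_epu16 does the same in a single SSE 4.1 instruction. */
static inline int16_t scalar_hmin8(const int16_t *v)
{
	int16_t min = v[0];
	int i;

	for (i = 1; i < 8; i++) {
		if (v[i] < min)
			min = v[i];
	}

	return min;
}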

/* Normalize state metrics K = 5:
 * Compute 16-wide normalization by subtracting the smallest value from
 * all values. Inputs are 16 packed 16-bit integers across 2 XMM registers.
 * Two intermediate registers are used and normalized results are placed
 * in the originating locations.
 *
 * Input:
 * M0:1 - Path metrics 0:1 (packed 16-bit integers)
 *
 * Output:
 * M0:1 - Normalized path metrics 0:1
 */
#define SSE_NORMALIZE_K5(M0, M1, M2, M3) \
{ \
	M2 = _mm_min_epi16(M0, M1); \
	SSE_MINPOS(M2, M3) \
	SSE_BROADCAST(M2) \
	M0 = _mm_subs_epi16(M0, M2); \
	M1 = _mm_subs_epi16(M1, M2); \
}
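
/* Scalar reference for the normalization step (illustrative sketch added for
 * documentation; not part of the original file). Subtracting the smallest
 * accumulated sum from every sum, as SSE_NORMALIZE_K5 does for 16 states,
 * keeps the accumulated metrics from growing without bound. */
static inline void scalar_normalize16(int16_t *sums)
{
	int16_t min = sums[0];
	int i;

	for (i = 1; i < 16; i++) {
		if (sums[i] < min)
			min = sums[i];
	}

	for (i = 0; i < 16; i++)
		sums[i] -= min;
}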

/* Normalize state metrics K = 7:
 * Compute 64-wide normalization by subtracting the smallest value from
 * all values. Inputs are 8 registers of accumulated sums and 4 temporary
 * registers. Normalized results are returned in the originating locations.
 *
 * Input:
 * M0:7 - Path metrics 0:7 (packed 16-bit integers)
 *
 * Output:
 * M0:7 - Normalized path metrics 0:7
 */
#define SSE_NORMALIZE_K7(M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, M10, M11) \
{ \
	M8 = _mm_min_epi16(M0, M1); \
	M9 = _mm_min_epi16(M2, M3); \
	M10 = _mm_min_epi16(M4, M5); \
	M11 = _mm_min_epi16(M6, M7); \
	M8 = _mm_min_epi16(M8, M9); \
	M10 = _mm_min_epi16(M10, M11); \
	M8 = _mm_min_epi16(M8, M10); \
	SSE_MINPOS(M8, M9) \
	SSE_BROADCAST(M8) \
	M0 = _mm_subs_epi16(M0, M8); \
	M1 = _mm_subs_epi16(M1, M8); \
	M2 = _mm_subs_epi16(M2, M8); \
	M3 = _mm_subs_epi16(M3, M8); \
	M4 = _mm_subs_epi16(M4, M8); \
	M5 = _mm_subs_epi16(M5, M8); \
	M6 = _mm_subs_epi16(M6, M8); \
	M7 = _mm_subs_epi16(M7, M8); \
}

/* Combined BMU/PMU (K=5, N=2)
 * Compute branch metrics followed by path metrics for the half rate 16-state
 * trellis. 8 butterflies are computed. Accumulated path sums are not
 * preserved: they are read from and written back to the same memory
 * location. Sums are normalized if requested.
 */
__always_inline static void _sse_metrics_k5_n2(const int16_t *val,
	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	__m128i m0, m1, m2, m3, m4, m5, m6;

	/* (BMU) Load input sequence */
	m2 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

	/* (BMU) Load trellis outputs */
	m0 = _mm_load_si128((__m128i *) &out[0]);
	m1 = _mm_load_si128((__m128i *) &out[8]);

	/* (BMU) Compute branch metrics */
	m0 = _mm_sign_epi16(m2, m0);
	m1 = _mm_sign_epi16(m2, m1);
	m2 = _mm_hadds_epi16(m0, m1);

	/* (PMU) Load accumulated path metrics */
	m0 = _mm_load_si128((__m128i *) &sums[0]);
	m1 = _mm_load_si128((__m128i *) &sums[8]);

	SSE_DEINTERLEAVE_K5(m0, m1, m3, m4)

	/* (PMU) Butterflies: 0-7 */
	SSE_BUTTERFLY(m3, m4, m2, m5, m6)

	if (norm)
		SSE_NORMALIZE_K5(m2, m6, m0, m1)

	_mm_store_si128((__m128i *) &sums[0], m2);
	_mm_store_si128((__m128i *) &sums[8], m6);
	_mm_store_si128((__m128i *) &paths[0], m5);
	_mm_store_si128((__m128i *) &paths[8], m4);
}
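
/* Hypothetical caller sketch (added for documentation; not part of the
 * original file, the names below are placeholders). One trellis step of the
 * K=5, rate-1/2 kernel touches 16 packed trellis outputs, 16 accumulated
 * sums and 16 path decisions, all 16-byte aligned for the aligned SSE loads
 * and stores; the real decoder in conv_acc.c drives this kernel from its
 * generic trellis structures. */
static inline void example_k5_n2_step(const int16_t *step_input,
				      const int16_t *trellis_out,
				      int16_t *sums, int16_t *paths)
{
	/* step_input points at the expanded soft values for this step */
	_sse_metrics_k5_n2(step_input, trellis_out, sums, paths, 1);
}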

/* Combined BMU/PMU (K=5, N=3 and N=4)
 * Compute branch metrics followed by path metrics for the 16-state trellis
 * at rates up to 1/4. 8 butterflies are computed. The input sequence is read
 * four 16-bit values at a time, and extra values should be set to zero for
 * rates other than 1/4. Normally only rates 1/3 and 1/4 are used, as there
 * is a dedicated implementation for rate 1/2.
 */
__always_inline static void _sse_metrics_k5_n4(const int16_t *val,
	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	__m128i m0, m1, m2, m3, m4, m5, m6;

	/* (BMU) Load input sequence */
	m4 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

	/* (BMU) Load trellis outputs */
	m0 = _mm_load_si128((__m128i *) &out[0]);
	m1 = _mm_load_si128((__m128i *) &out[8]);
	m2 = _mm_load_si128((__m128i *) &out[16]);
	m3 = _mm_load_si128((__m128i *) &out[24]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m4, m2)

	/* (PMU) Load accumulated path metrics */
	m0 = _mm_load_si128((__m128i *) &sums[0]);
	m1 = _mm_load_si128((__m128i *) &sums[8]);

	SSE_DEINTERLEAVE_K5(m0, m1, m3, m4)

	/* (PMU) Butterflies: 0-7 */
	SSE_BUTTERFLY(m3, m4, m2, m5, m6)

	if (norm)
		SSE_NORMALIZE_K5(m2, m6, m0, m1)

	_mm_store_si128((__m128i *) &sums[0], m2);
	_mm_store_si128((__m128i *) &sums[8], m6);
	_mm_store_si128((__m128i *) &paths[0], m5);
	_mm_store_si128((__m128i *) &paths[8], m4);
}

/* Combined BMU/PMU (K=7, N=2)
 * Compute branch metrics followed by path metrics for the half rate 64-state
 * trellis. 32 butterfly operations are computed. Deinterleaving the path
 * metrics requires the full SSE register file, so the sums are deinterleaved
 * before the branch metrics are computed to avoid register spilling.
 */
__always_inline static void _sse_metrics_k7_n2(const int16_t *val,
	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	__m128i m0, m1, m2, m3, m4, m5, m6, m7, m8,
		m9, m10, m11, m12, m13, m14, m15;

	/* (PMU) Load accumulated path metrics */
	m0 = _mm_load_si128((__m128i *) &sums[0]);
	m1 = _mm_load_si128((__m128i *) &sums[8]);
	m2 = _mm_load_si128((__m128i *) &sums[16]);
	m3 = _mm_load_si128((__m128i *) &sums[24]);
	m4 = _mm_load_si128((__m128i *) &sums[32]);
	m5 = _mm_load_si128((__m128i *) &sums[40]);
	m6 = _mm_load_si128((__m128i *) &sums[48]);
	m7 = _mm_load_si128((__m128i *) &sums[56]);

	/* (PMU) Deinterleave to even-odd registers */
	SSE_DEINTERLEAVE_K7(m0, m1, m2, m3, m4, m5, m6, m7,
			    m8, m9, m10, m11, m12, m13, m14, m15)

	/* (BMU) Load input symbols */
	m7 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

	/* (BMU) Load trellis outputs */
	m0 = _mm_load_si128((__m128i *) &out[0]);
	m1 = _mm_load_si128((__m128i *) &out[8]);
	m2 = _mm_load_si128((__m128i *) &out[16]);
	m3 = _mm_load_si128((__m128i *) &out[24]);

	SSE_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m4, m5)

	m0 = _mm_load_si128((__m128i *) &out[32]);
	m1 = _mm_load_si128((__m128i *) &out[40]);
	m2 = _mm_load_si128((__m128i *) &out[48]);
	m3 = _mm_load_si128((__m128i *) &out[56]);

	SSE_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m6, m7)

	/* (PMU) Butterflies: 0-15 */
	SSE_BUTTERFLY(m8, m9, m4, m0, m1)
	SSE_BUTTERFLY(m10, m11, m5, m2, m3)

	_mm_store_si128((__m128i *) &paths[0], m0);
	_mm_store_si128((__m128i *) &paths[8], m2);
	_mm_store_si128((__m128i *) &paths[32], m9);
	_mm_store_si128((__m128i *) &paths[40], m11);

	/* (PMU) Butterflies: 16-31 */
	SSE_BUTTERFLY(m12, m13, m6, m0, m2)
	SSE_BUTTERFLY(m14, m15, m7, m9, m11)

	_mm_store_si128((__m128i *) &paths[16], m0);
	_mm_store_si128((__m128i *) &paths[24], m9);
	_mm_store_si128((__m128i *) &paths[48], m13);
	_mm_store_si128((__m128i *) &paths[56], m15);

	if (norm)
		SSE_NORMALIZE_K7(m4, m1, m5, m3, m6, m2,
				 m7, m11, m0, m8, m9, m10)

	_mm_store_si128((__m128i *) &sums[0], m4);
	_mm_store_si128((__m128i *) &sums[8], m5);
	_mm_store_si128((__m128i *) &sums[16], m6);
	_mm_store_si128((__m128i *) &sums[24], m7);
	_mm_store_si128((__m128i *) &sums[32], m1);
	_mm_store_si128((__m128i *) &sums[40], m3);
	_mm_store_si128((__m128i *) &sums[48], m2);
	_mm_store_si128((__m128i *) &sums[56], m11);
}

/* Combined BMU/PMU (K=7, N=3 and N=4)
 * Compute branch metrics followed by path metrics for the 64-state trellis
 * at rates up to 1/4. 32 butterfly operations are computed. The path
 * metrics are deinterleaved before the branch metrics are computed, as in
 * the half rate case.
 */
__always_inline static void _sse_metrics_k7_n4(const int16_t *val,
	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	__m128i m0, m1, m2, m3, m4, m5, m6, m7;
	__m128i m8, m9, m10, m11, m12, m13, m14, m15;

	/* (PMU) Load accumulated path metrics */
	m0 = _mm_load_si128((__m128i *) &sums[0]);
	m1 = _mm_load_si128((__m128i *) &sums[8]);
	m2 = _mm_load_si128((__m128i *) &sums[16]);
	m3 = _mm_load_si128((__m128i *) &sums[24]);
	m4 = _mm_load_si128((__m128i *) &sums[32]);
	m5 = _mm_load_si128((__m128i *) &sums[40]);
	m6 = _mm_load_si128((__m128i *) &sums[48]);
	m7 = _mm_load_si128((__m128i *) &sums[56]);

	/* (PMU) Deinterleave into even and odd packed registers */
	SSE_DEINTERLEAVE_K7(m0, m1, m2, m3, m4, m5, m6, m7,
			    m8, m9, m10, m11, m12, m13, m14, m15)

	/* (BMU) Load input sequence */
	m7 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

	/* (BMU) Load and compute branch metrics */
	m0 = _mm_load_si128((__m128i *) &out[0]);
	m1 = _mm_load_si128((__m128i *) &out[8]);
	m2 = _mm_load_si128((__m128i *) &out[16]);
	m3 = _mm_load_si128((__m128i *) &out[24]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m4)

	m0 = _mm_load_si128((__m128i *) &out[32]);
	m1 = _mm_load_si128((__m128i *) &out[40]);
	m2 = _mm_load_si128((__m128i *) &out[48]);
	m3 = _mm_load_si128((__m128i *) &out[56]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m5)

	m0 = _mm_load_si128((__m128i *) &out[64]);
	m1 = _mm_load_si128((__m128i *) &out[72]);
	m2 = _mm_load_si128((__m128i *) &out[80]);
	m3 = _mm_load_si128((__m128i *) &out[88]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m6)

	m0 = _mm_load_si128((__m128i *) &out[96]);
	m1 = _mm_load_si128((__m128i *) &out[104]);
	m2 = _mm_load_si128((__m128i *) &out[112]);
	m3 = _mm_load_si128((__m128i *) &out[120]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m7)

	/* (PMU) Butterflies: 0-15 */
	SSE_BUTTERFLY(m8, m9, m4, m0, m1)
	SSE_BUTTERFLY(m10, m11, m5, m2, m3)

	_mm_store_si128((__m128i *) &paths[0], m0);
	_mm_store_si128((__m128i *) &paths[8], m2);
	_mm_store_si128((__m128i *) &paths[32], m9);
	_mm_store_si128((__m128i *) &paths[40], m11);

	/* (PMU) Butterflies: 16-31 */
	SSE_BUTTERFLY(m12, m13, m6, m0, m2)
	SSE_BUTTERFLY(m14, m15, m7, m9, m11)

	_mm_store_si128((__m128i *) &paths[16], m0);
	_mm_store_si128((__m128i *) &paths[24], m9);
	_mm_store_si128((__m128i *) &paths[48], m13);
	_mm_store_si128((__m128i *) &paths[56], m15);

	if (norm)
		SSE_NORMALIZE_K7(m4, m1, m5, m3, m6, m2,
				 m7, m11, m0, m8, m9, m10)

	_mm_store_si128((__m128i *) &sums[0], m4);
	_mm_store_si128((__m128i *) &sums[8], m5);
	_mm_store_si128((__m128i *) &sums[16], m6);
	_mm_store_si128((__m128i *) &sums[24], m7);
	_mm_store_si128((__m128i *) &sums[32], m1);
	_mm_store_si128((__m128i *) &sums[40], m3);
	_mm_store_si128((__m128i *) &sums[48], m2);
	_mm_store_si128((__m128i *) &sums[56], m11);
}
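
/* Hypothetical caller sketch (added for documentation; not part of the
 * original file, all names are placeholders). The buffer sizes follow from
 * the loads and stores above for the K=7, rate-1/4 kernel: 128 packed
 * trellis output values, 64 accumulated sums and 64 path decisions per
 * trellis step, all 16-byte aligned for the aligned SSE loads and stores. */
static inline void example_k7_n4_step(const int16_t *step_input,
				      const int16_t *trellis_out,
				      int16_t *sums, int16_t *paths)
{
	/* step_input points at the expanded soft values for this step;
	 * trellis_out, sums and paths hold 128, 64 and 64 int16_t values. */
	_sse_metrics_k7_n4(step_input, trellis_out, sums, paths, 1);
}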