/*
 * Intel SSE Viterbi decoder
 *
 * Copyright (C) 2013, 2014 Thomas Tsou <tom@tsou.cc>
 *
 * All Rights Reserved
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */

#include <stdint.h>
#include <emmintrin.h>
#include <tmmintrin.h>
#include <xmmintrin.h>

#include "config.h"

#if defined(HAVE_SSE4_1) || defined(HAVE_SSE41)
	#include <smmintrin.h>
#endif

#ifdef HAVE_AVX2
	#include <immintrin.h>
#endif

#define SSE_ALIGN 16

extern int sse41_supported;
extern int sse3_supported;
extern int avx2_supported;

/* Octo-Viterbi butterfly
 * Compute an 8-wide butterfly generating 16 path decisions and 16
 * accumulated sums. Inputs are packed 16-bit integers in three 128-bit XMM
 * registers. Two intermediate registers are used and results are set in
 * the upper 4 registers.
 *
 * Input:
 * M0 - Path metrics 0 (packed 16-bit integers)
 * M1 - Path metrics 1 (packed 16-bit integers)
 * M2 - Branch metrics (packed 16-bit integers)
 *
 * Output:
 * M2 - Selected and accumulated path metrics 0
 * M4 - Selected and accumulated path metrics 1
 * M3 - Path selections 0
 * M1 - Path selections 1
 */
#define SSE_BUTTERFLY(M0, M1, M2, M3, M4) \
{ \
	M3 = _mm_adds_epi16(M0, M2); \
	M4 = _mm_subs_epi16(M1, M2); \
	M0 = _mm_subs_epi16(M0, M2); \
	M1 = _mm_adds_epi16(M1, M2); \
	M2 = _mm_max_epi16(M3, M4); \
	M3 = _mm_or_si128(_mm_cmpgt_epi16(M3, M4), _mm_cmpeq_epi16(M3, M4)); \
	M4 = _mm_max_epi16(M0, M1); \
	M1 = _mm_or_si128(_mm_cmpgt_epi16(M0, M1), _mm_cmpeq_epi16(M0, M1)); \
}
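
/* Illustrative scalar sketch of one add-compare-select step (hypothetical
 * helper, not used by the decoder): given one pair of path metrics and one
 * branch metric, SSE_BUTTERFLY performs the equivalent of the code below
 * across 8 state pairs at once. Saturation of the vector adds/subs is
 * ignored here; the selection values are all-ones (-1) or zero, matching
 * the cmpgt/cmpeq combination above.
 */
static inline void acs_butterfly_ref(int16_t pm0, int16_t pm1, int16_t bm,
				     int16_t *new_pm0, int16_t *new_pm1,
				     int16_t *sel0, int16_t *sel1)
{
	int16_t a = pm0 + bm;
	int16_t b = pm1 - bm;
	int16_t c = pm0 - bm;
	int16_t d = pm1 + bm;

	*new_pm0 = (a >= b) ? a : b;	/* M2 = max(M3, M4) */
	*sel0 = (a >= b) ? -1 : 0;	/* M3 = (M3 > M4) | (M3 == M4) */
	*new_pm1 = (c >= d) ? c : d;	/* M4 = max(M0, M1) */
	*sel1 = (c >= d) ? -1 : 0;	/* M1 = (M0 > M1) | (M0 == M1) */
}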

/* Two lane deinterleaving K = 5:
 * Take 16 interleaved 16-bit integers and deinterleave to 2 packed 128-bit
 * registers. The operation is summarized below. Four registers are used
 * with the lower 2 as input and upper 2 as output.
 *
 * In  - 10101010 10101010 10101010 10101010
 * Out - 00000000 11111111 00000000 11111111
 *
 * Input:
 * M0:1 - Packed 16-bit integers
 *
 * Output:
 * M2:3 - Deinterleaved packed 16-bit integers
 */
#define _I8_SHUFFLE_MASK 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0

#define SSE_DEINTERLEAVE_K5(M0, M1, M2, M3) \
{ \
	M2 = _mm_set_epi8(_I8_SHUFFLE_MASK); \
	M0 = _mm_shuffle_epi8(M0, M2); \
	M1 = _mm_shuffle_epi8(M1, M2); \
	M2 = _mm_unpacklo_epi64(M0, M1); \
	M3 = _mm_unpackhi_epi64(M0, M1); \
}
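
/* Illustrative scalar sketch of the two-lane deinterleave (hypothetical
 * helper, not used by the decoder): even-indexed 16-bit elements of the
 * 16-element input land in the first output lane and odd-indexed elements
 * in the second, which is what the byte shuffle and 64-bit unpacks above
 * compute for a register pair.
 */
static inline void deinterleave_k5_ref(const int16_t in[16],
				       int16_t even[8], int16_t odd[8])
{
	int i;

	for (i = 0; i < 8; i++) {
		even[i] = in[2 * i];
		odd[i] = in[2 * i + 1];
	}
}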

/* Two lane deinterleaving K = 7:
 * Take 64 interleaved 16-bit integers and deinterleave to 8 packed 128-bit
 * registers. The operation is summarized below. 16 registers are used with
 * the lower 8 as input and upper 8 as output.
 *
 * In  - 10101010 10101010 10101010 10101010 ...
 * Out - 00000000 11111111 00000000 11111111 ...
 *
 * Input:
 * M0:7 - Packed 16-bit integers
 *
 * Output:
 * M8:15 - Deinterleaved packed 16-bit integers
 */
#define SSE_DEINTERLEAVE_K7(M0, M1, M2, M3, M4, M5, M6, M7, \
		M8, M9, M10, M11, M12, M13, M14, M15) \
{ \
	M8 = _mm_set_epi8(_I8_SHUFFLE_MASK); \
	M0 = _mm_shuffle_epi8(M0, M8); \
	M1 = _mm_shuffle_epi8(M1, M8); \
	M2 = _mm_shuffle_epi8(M2, M8); \
	M3 = _mm_shuffle_epi8(M3, M8); \
	M4 = _mm_shuffle_epi8(M4, M8); \
	M5 = _mm_shuffle_epi8(M5, M8); \
	M6 = _mm_shuffle_epi8(M6, M8); \
	M7 = _mm_shuffle_epi8(M7, M8); \
	M8 = _mm_unpacklo_epi64(M0, M1); \
	M9 = _mm_unpackhi_epi64(M0, M1); \
	M10 = _mm_unpacklo_epi64(M2, M3); \
	M11 = _mm_unpackhi_epi64(M2, M3); \
	M12 = _mm_unpacklo_epi64(M4, M5); \
	M13 = _mm_unpackhi_epi64(M4, M5); \
	M14 = _mm_unpacklo_epi64(M6, M7); \
	M15 = _mm_unpackhi_epi64(M6, M7); \
}

/* Generate branch metrics N = 2:
 * Compute 16 branch metrics from trellis outputs and input values.
 *
 * Input:
 * M0:3 - 16 x 2 packed 16-bit trellis outputs
 * M4 - Expanded and packed 16-bit input value
 *
 * Output:
 * M6:7 - 16 computed 16-bit branch metrics
 */
#define SSE_BRANCH_METRIC_N2(M0, M1, M2, M3, M4, M6, M7) \
{ \
	M0 = _mm_sign_epi16(M4, M0); \
	M1 = _mm_sign_epi16(M4, M1); \
	M2 = _mm_sign_epi16(M4, M2); \
	M3 = _mm_sign_epi16(M4, M3); \
	M6 = _mm_hadds_epi16(M0, M1); \
	M7 = _mm_hadds_epi16(M2, M3); \
}

/* Generate branch metrics N = 4:
 * Compute 8 branch metrics from trellis outputs and input values. This
 * macro is also reused for N less than 4, in which case the unused soft
 * input values are padded with zeros.
 *
 * Input:
 * M0:3 - 8 x 4 packed 16-bit trellis outputs
 * M4 - Expanded and packed 16-bit input value
 *
 * Output:
 * M5 - 8 computed 16-bit branch metrics
 */
#define SSE_BRANCH_METRIC_N4(M0, M1, M2, M3, M4, M5) \
{ \
	M0 = _mm_sign_epi16(M4, M0); \
	M1 = _mm_sign_epi16(M4, M1); \
	M2 = _mm_sign_epi16(M4, M2); \
	M3 = _mm_sign_epi16(M4, M3); \
	M0 = _mm_hadds_epi16(M0, M1); \
	M1 = _mm_hadds_epi16(M2, M3); \
	M5 = _mm_hadds_epi16(M0, M1); \
}
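
/* Illustrative scalar sketch of a single branch metric (hypothetical
 * helper, not used by the decoder): the trellis outputs act as signs, so
 * each metric is the correlation of the received soft values with one
 * expected output sequence. The vector macros above compute 16 (N = 2) or
 * 8 (N = 4) such metrics at once; saturation of hadds is ignored here.
 */
static inline int16_t branch_metric_ref(const int16_t *val,
					const int16_t *out, int n)
{
	int16_t metric = 0;
	int i;

	for (i = 0; i < n; i++) {
		if (out[i] > 0)
			metric += val[i];
		else if (out[i] < 0)
			metric -= val[i];
	}

	return metric;
}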

/* Broadcast 16-bit integer
 * Replicate the low 16-bit element across all elements of the 128-bit SSE
 * register. Only AVX2 has a dedicated broadcast instruction; repeated
 * unpacks are used on SSE-only architectures. This is a destructive
 * operation and the source register is overwritten.
 *
 * Input:
 * M0 - Low 16-bit element is read
 *
 * Output:
 * M0 - Contains broadcasted values
 */
#ifdef HAVE_AVX2
#define SSE_BROADCAST(M0) \
{ \
	if (avx2_supported) { \
		M0 = _mm_broadcastw_epi16(M0); \
	} else { \
		M0 = _mm_unpacklo_epi16(M0, M0); \
		M0 = _mm_unpacklo_epi32(M0, M0); \
		M0 = _mm_unpacklo_epi64(M0, M0); \
	} \
}
#else
#define SSE_BROADCAST(M0) \
{ \
	M0 = _mm_unpacklo_epi16(M0, M0); \
	M0 = _mm_unpacklo_epi32(M0, M0); \
	M0 = _mm_unpacklo_epi64(M0, M0); \
}
#endif

/* Horizontal minimum
 * Compute the horizontal minimum of packed unsigned 16-bit integers and
 * place the result in the low 16-bit element of the source register. Only
 * SSE 4.1 has a dedicated minpos instruction. One intermediate register is
 * used if SSE 4.1 is not available. This is a destructive operation and
 * the source register is overwritten.
 *
 * Input:
 * M0 - Packed unsigned 16-bit integers
 *
 * Output:
 * M0 - Minimum value placed in low 16-bit element
 */
#if defined(HAVE_SSE4_1) || defined(HAVE_SSE41)
#define SSE_MINPOS(M0, M1) \
{ \
	if (sse41_supported) { \
		M0 = _mm_minpos_epu16(M0); \
	} else { \
		M1 = _mm_shuffle_epi32(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
		M0 = _mm_min_epi16(M0, M1); \
		M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
		M0 = _mm_min_epi16(M0, M1); \
		M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 0, 1)); \
		M0 = _mm_min_epi16(M0, M1); \
	} \
}
#else
#define SSE_MINPOS(M0, M1) \
{ \
	M1 = _mm_shuffle_epi32(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
	M0 = _mm_min_epi16(M0, M1); \
	M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
	M0 = _mm_min_epi16(M0, M1); \
	M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 0, 1)); \
	M0 = _mm_min_epi16(M0, M1); \
}
#endif

/* Normalize state metrics K = 5:
 * Compute 16-wide normalization by subtracting the smallest value from
 * all values. Inputs are 16 packed 16-bit integers across 2 XMM registers.
 * Two intermediate registers are used and normalized results are placed
 * in the originating locations.
 *
 * Input:
 * M0:1 - Path metrics 0:1 (packed 16-bit integers)
 *
 * Output:
 * M0:1 - Normalized path metrics 0:1
 */
#define SSE_NORMALIZE_K5(M0, M1, M2, M3) \
{ \
	M2 = _mm_min_epi16(M0, M1); \
	SSE_MINPOS(M2, M3) \
	SSE_BROADCAST(M2) \
	M0 = _mm_subs_epi16(M0, M2); \
	M1 = _mm_subs_epi16(M1, M2); \
}
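
/* Illustrative scalar sketch of the K = 5 normalization (hypothetical
 * helper, not used by the decoder): find the smallest of the 16 path
 * metrics and subtract it from all of them, which is what the pairwise
 * minimum, minpos/broadcast and saturating subtracts above implement on
 * two registers.
 */
static inline void normalize_k5_ref(int16_t sums[16])
{
	int16_t min = sums[0];
	int i;

	for (i = 1; i < 16; i++) {
		if (sums[i] < min)
			min = sums[i];
	}

	for (i = 0; i < 16; i++)
		sums[i] -= min;
}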

/* Normalize state metrics K = 7:
 * Compute 64-wide normalization by subtracting the smallest value from
 * all values. Inputs are 8 registers of accumulated sums and 4 temporary
 * registers. Normalized results are returned in the originating locations.
 *
 * Input:
 * M0:7 - Path metrics 0:7 (packed 16-bit integers)
 *
 * Output:
 * M0:7 - Normalized path metrics 0:7
 */
#define SSE_NORMALIZE_K7(M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, M10, M11) \
{ \
	M8 = _mm_min_epi16(M0, M1); \
	M9 = _mm_min_epi16(M2, M3); \
	M10 = _mm_min_epi16(M4, M5); \
	M11 = _mm_min_epi16(M6, M7); \
	M8 = _mm_min_epi16(M8, M9); \
	M10 = _mm_min_epi16(M10, M11); \
	M8 = _mm_min_epi16(M8, M10); \
	SSE_MINPOS(M8, M9) \
	SSE_BROADCAST(M8) \
	M0 = _mm_subs_epi16(M0, M8); \
	M1 = _mm_subs_epi16(M1, M8); \
	M2 = _mm_subs_epi16(M2, M8); \
	M3 = _mm_subs_epi16(M3, M8); \
	M4 = _mm_subs_epi16(M4, M8); \
	M5 = _mm_subs_epi16(M5, M8); \
	M6 = _mm_subs_epi16(M6, M8); \
	M7 = _mm_subs_epi16(M7, M8); \
}

/* Combined BMU/PMU (K=5, N=2)
 * Compute branch metrics followed by path metrics for the half rate
 * 16-state trellis. 8 butterflies are computed. Accumulated path sums are
 * not preserved; they are read from and written back to the same memory
 * location. Sums are normalized if requested.
 */
__always_inline static void _sse_metrics_k5_n2(const int16_t *val,
		const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	__m128i m0, m1, m2, m3, m4, m5, m6;

	/* (BMU) Load input sequence */
	m2 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

	/* (BMU) Load trellis outputs */
	m0 = _mm_load_si128((__m128i *) &out[0]);
	m1 = _mm_load_si128((__m128i *) &out[8]);

	/* (BMU) Compute branch metrics */
	m0 = _mm_sign_epi16(m2, m0);
	m1 = _mm_sign_epi16(m2, m1);
	m2 = _mm_hadds_epi16(m0, m1);

	/* (PMU) Load accumulated path metrics */
	m0 = _mm_load_si128((__m128i *) &sums[0]);
	m1 = _mm_load_si128((__m128i *) &sums[8]);

	SSE_DEINTERLEAVE_K5(m0, m1, m3, m4)

	/* (PMU) Butterflies: 0-7 */
	SSE_BUTTERFLY(m3, m4, m2, m5, m6)

	if (norm)
		SSE_NORMALIZE_K5(m2, m6, m0, m1)

	_mm_store_si128((__m128i *) &sums[0], m2);
	_mm_store_si128((__m128i *) &sums[8], m6);
	_mm_store_si128((__m128i *) &paths[0], m5);
	_mm_store_si128((__m128i *) &paths[8], m4);
}

/* Combined BMU/PMU (K=5, N=3 and N=4)
 * Compute branch metrics followed by path metrics for the 16-state trellis
 * at rates up to 1/4. 8 butterflies are computed. The input sequence is
 * read four 16-bit values at a time, and extra values should be set to
 * zero for rates other than 1/4. Normally only rates 1/3 and 1/4 are used,
 * as there is a dedicated implementation for rate 1/2.
 */
__always_inline static void _sse_metrics_k5_n4(const int16_t *val,
		const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	__m128i m0, m1, m2, m3, m4, m5, m6;

	/* (BMU) Load input sequence */
	m4 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

	/* (BMU) Load trellis outputs */
	m0 = _mm_load_si128((__m128i *) &out[0]);
	m1 = _mm_load_si128((__m128i *) &out[8]);
	m2 = _mm_load_si128((__m128i *) &out[16]);
	m3 = _mm_load_si128((__m128i *) &out[24]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m4, m2)

	/* (PMU) Load accumulated path metrics */
	m0 = _mm_load_si128((__m128i *) &sums[0]);
	m1 = _mm_load_si128((__m128i *) &sums[8]);

	SSE_DEINTERLEAVE_K5(m0, m1, m3, m4)

	/* (PMU) Butterflies: 0-7 */
	SSE_BUTTERFLY(m3, m4, m2, m5, m6)

	if (norm)
		SSE_NORMALIZE_K5(m2, m6, m0, m1)

	_mm_store_si128((__m128i *) &sums[0], m2);
	_mm_store_si128((__m128i *) &sums[8], m6);
	_mm_store_si128((__m128i *) &paths[0], m5);
	_mm_store_si128((__m128i *) &paths[8], m4);
}

/* Combined BMU/PMU (K=7, N=2)
 * Compute branch metrics followed by path metrics for the half rate
 * 64-state trellis. 32 butterfly operations are computed. Deinterleaving
 * the path metrics requires the full SSE register file, so the sums are
 * deinterleaved before the branch metrics are computed to avoid register
 * spilling.
 */
__always_inline static void _sse_metrics_k7_n2(const int16_t *val,
		const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	__m128i m0, m1, m2, m3, m4, m5, m6, m7, m8,
		m9, m10, m11, m12, m13, m14, m15;

	/* (PMU) Load accumulated path metrics */
	m0 = _mm_load_si128((__m128i *) &sums[0]);
	m1 = _mm_load_si128((__m128i *) &sums[8]);
	m2 = _mm_load_si128((__m128i *) &sums[16]);
	m3 = _mm_load_si128((__m128i *) &sums[24]);
	m4 = _mm_load_si128((__m128i *) &sums[32]);
	m5 = _mm_load_si128((__m128i *) &sums[40]);
	m6 = _mm_load_si128((__m128i *) &sums[48]);
	m7 = _mm_load_si128((__m128i *) &sums[56]);

	/* (PMU) Deinterleave to even-odd registers */
	SSE_DEINTERLEAVE_K7(m0, m1, m2, m3, m4, m5, m6, m7,
			    m8, m9, m10, m11, m12, m13, m14, m15)

	/* (BMU) Load input symbols */
	m7 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

	/* (BMU) Load trellis outputs */
	m0 = _mm_load_si128((__m128i *) &out[0]);
	m1 = _mm_load_si128((__m128i *) &out[8]);
	m2 = _mm_load_si128((__m128i *) &out[16]);
	m3 = _mm_load_si128((__m128i *) &out[24]);

	SSE_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m4, m5)

	m0 = _mm_load_si128((__m128i *) &out[32]);
	m1 = _mm_load_si128((__m128i *) &out[40]);
	m2 = _mm_load_si128((__m128i *) &out[48]);
	m3 = _mm_load_si128((__m128i *) &out[56]);

	SSE_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m6, m7)

	/* (PMU) Butterflies: 0-15 */
	SSE_BUTTERFLY(m8, m9, m4, m0, m1)
	SSE_BUTTERFLY(m10, m11, m5, m2, m3)

	_mm_store_si128((__m128i *) &paths[0], m0);
	_mm_store_si128((__m128i *) &paths[8], m2);
	_mm_store_si128((__m128i *) &paths[32], m9);
	_mm_store_si128((__m128i *) &paths[40], m11);

	/* (PMU) Butterflies: 16-31 */
	SSE_BUTTERFLY(m12, m13, m6, m0, m2)
	SSE_BUTTERFLY(m14, m15, m7, m9, m11)

	_mm_store_si128((__m128i *) &paths[16], m0);
	_mm_store_si128((__m128i *) &paths[24], m9);
	_mm_store_si128((__m128i *) &paths[48], m13);
	_mm_store_si128((__m128i *) &paths[56], m15);

	if (norm)
		SSE_NORMALIZE_K7(m4, m1, m5, m3, m6, m2,
				 m7, m11, m0, m8, m9, m10)

	_mm_store_si128((__m128i *) &sums[0], m4);
	_mm_store_si128((__m128i *) &sums[8], m5);
	_mm_store_si128((__m128i *) &sums[16], m6);
	_mm_store_si128((__m128i *) &sums[24], m7);
	_mm_store_si128((__m128i *) &sums[32], m1);
	_mm_store_si128((__m128i *) &sums[40], m3);
	_mm_store_si128((__m128i *) &sums[48], m2);
	_mm_store_si128((__m128i *) &sums[56], m11);
}

/* Combined BMU/PMU (K=7, N=3 and N=4)
 * Compute branch metrics followed by path metrics for the 64-state trellis
 * at rates up to 1/4. 32 butterfly operations are computed. Path metrics
 * are deinterleaved before the branch metrics are computed, as in the half
 * rate case.
 */
__always_inline static void _sse_metrics_k7_n4(const int16_t *val,
		const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	__m128i m0, m1, m2, m3, m4, m5, m6, m7;
	__m128i m8, m9, m10, m11, m12, m13, m14, m15;

	/* (PMU) Load accumulated path metrics */
	m0 = _mm_load_si128((__m128i *) &sums[0]);
	m1 = _mm_load_si128((__m128i *) &sums[8]);
	m2 = _mm_load_si128((__m128i *) &sums[16]);
	m3 = _mm_load_si128((__m128i *) &sums[24]);
	m4 = _mm_load_si128((__m128i *) &sums[32]);
	m5 = _mm_load_si128((__m128i *) &sums[40]);
	m6 = _mm_load_si128((__m128i *) &sums[48]);
	m7 = _mm_load_si128((__m128i *) &sums[56]);

	/* (PMU) Deinterleave into even and odd packed registers */
	SSE_DEINTERLEAVE_K7(m0, m1, m2, m3, m4, m5, m6, m7,
			    m8, m9, m10, m11, m12, m13, m14, m15)

	/* (BMU) Load input sequence */
	m7 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

	/* (BMU) Load and compute branch metrics */
	m0 = _mm_load_si128((__m128i *) &out[0]);
	m1 = _mm_load_si128((__m128i *) &out[8]);
	m2 = _mm_load_si128((__m128i *) &out[16]);
	m3 = _mm_load_si128((__m128i *) &out[24]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m4)

	m0 = _mm_load_si128((__m128i *) &out[32]);
	m1 = _mm_load_si128((__m128i *) &out[40]);
	m2 = _mm_load_si128((__m128i *) &out[48]);
	m3 = _mm_load_si128((__m128i *) &out[56]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m5)

	m0 = _mm_load_si128((__m128i *) &out[64]);
	m1 = _mm_load_si128((__m128i *) &out[72]);
	m2 = _mm_load_si128((__m128i *) &out[80]);
	m3 = _mm_load_si128((__m128i *) &out[88]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m6)

	m0 = _mm_load_si128((__m128i *) &out[96]);
	m1 = _mm_load_si128((__m128i *) &out[104]);
	m2 = _mm_load_si128((__m128i *) &out[112]);
	m3 = _mm_load_si128((__m128i *) &out[120]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m7)

	/* (PMU) Butterflies: 0-15 */
	SSE_BUTTERFLY(m8, m9, m4, m0, m1)
	SSE_BUTTERFLY(m10, m11, m5, m2, m3)

	_mm_store_si128((__m128i *) &paths[0], m0);
	_mm_store_si128((__m128i *) &paths[8], m2);
	_mm_store_si128((__m128i *) &paths[32], m9);
	_mm_store_si128((__m128i *) &paths[40], m11);

	/* (PMU) Butterflies: 16-31 */
	SSE_BUTTERFLY(m12, m13, m6, m0, m2)
	SSE_BUTTERFLY(m14, m15, m7, m9, m11)

	_mm_store_si128((__m128i *) &paths[16], m0);
	_mm_store_si128((__m128i *) &paths[24], m9);
	_mm_store_si128((__m128i *) &paths[48], m13);
	_mm_store_si128((__m128i *) &paths[56], m15);

	if (norm)
		SSE_NORMALIZE_K7(m4, m1, m5, m3, m6, m2,
				 m7, m11, m0, m8, m9, m10)

	_mm_store_si128((__m128i *) &sums[0], m4);
	_mm_store_si128((__m128i *) &sums[8], m5);
	_mm_store_si128((__m128i *) &sums[16], m6);
	_mm_store_si128((__m128i *) &sums[24], m7);
	_mm_store_si128((__m128i *) &sums[32], m1);
	_mm_store_si128((__m128i *) &sums[40], m3);
	_mm_store_si128((__m128i *) &sums[48], m2);
	_mm_store_si128((__m128i *) &sums[56], m11);
}

/* Aligned Memory Allocator
 * SSE requires 16-byte memory alignment. We store the relevant trellis
 * values (accumulated sums, outputs, and path decisions) as 16-bit signed
 * integers, so the allocated memory is cast accordingly.
 */
__attribute__ ((visibility("hidden")))
int16_t *osmo_conv_vdec_malloc_sse3(size_t n)
{
	return (int16_t *) _mm_malloc(sizeof(int16_t) * n, SSE_ALIGN);
}

__attribute__ ((visibility("hidden")))
void osmo_conv_vdec_free_sse3(int16_t *ptr)
{
	_mm_free(ptr);
}

__attribute__ ((visibility("hidden")))
void osmo_conv_gen_metrics_k5_n2_sse(const int8_t *val, const int16_t *out,
		int16_t *sums, int16_t *paths, int norm)
{
	const int16_t _val[4] = { val[0], val[1], val[0], val[1] };

	_sse_metrics_k5_n2(_val, out, sums, paths, norm);
}

__attribute__ ((visibility("hidden")))
void osmo_conv_gen_metrics_k5_n3_sse(const int8_t *val, const int16_t *out,
		int16_t *sums, int16_t *paths, int norm)
{
	const int16_t _val[4] = { val[0], val[1], val[2], 0 };

	_sse_metrics_k5_n4(_val, out, sums, paths, norm);
}

__attribute__ ((visibility("hidden")))
void osmo_conv_gen_metrics_k5_n4_sse(const int8_t *val, const int16_t *out,
		int16_t *sums, int16_t *paths, int norm)
{
	const int16_t _val[4] = { val[0], val[1], val[2], val[3] };

	_sse_metrics_k5_n4(_val, out, sums, paths, norm);
}

__attribute__ ((visibility("hidden")))
void osmo_conv_gen_metrics_k7_n2_sse(const int8_t *val, const int16_t *out,
		int16_t *sums, int16_t *paths, int norm)
{
	const int16_t _val[4] = { val[0], val[1], val[0], val[1] };

	_sse_metrics_k7_n2(_val, out, sums, paths, norm);
}

__attribute__ ((visibility("hidden")))
void osmo_conv_gen_metrics_k7_n3_sse(const int8_t *val, const int16_t *out,
		int16_t *sums, int16_t *paths, int norm)
{
	const int16_t _val[4] = { val[0], val[1], val[2], 0 };

	_sse_metrics_k7_n4(_val, out, sums, paths, norm);
}

__attribute__ ((visibility("hidden")))
void osmo_conv_gen_metrics_k7_n4_sse(const int8_t *val, const int16_t *out,
		int16_t *sums, int16_t *paths, int norm)
{
	const int16_t _val[4] = { val[0], val[1], val[2], val[3] };

	_sse_metrics_k7_n4(_val, out, sums, paths, norm);
}
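
/* Hypothetical usage sketch (not part of the library API): allocate the
 * aligned buffers required by the SSE kernels and run a single K = 5,
 * N = 2 metric update. The trellis output table and starting metrics are
 * placeholders (all zeros); a real caller fills them from the generator
 * polynomials and the previous decoding step. The kernels use aligned
 * loads and stores, so the buffers must come from the aligned allocator
 * above.
 */
static inline void example_metrics_k5_n2(const int8_t val[2])
{
	int16_t *out = osmo_conv_vdec_malloc_sse3(16);
	int16_t *sums = osmo_conv_vdec_malloc_sse3(16);
	int16_t *paths = osmo_conv_vdec_malloc_sse3(16);
	int i;

	if (out && sums && paths) {
		for (i = 0; i < 16; i++) {
			out[i] = 0;	/* placeholder trellis outputs */
			sums[i] = 0;	/* zeroed starting path metrics */
		}

		osmo_conv_gen_metrics_k5_n2_sse(val, out, sums, paths, 1);
	}

	if (paths)
		osmo_conv_vdec_free_sse3(paths);
	if (sums)
		osmo_conv_vdec_free_sse3(sums);
	if (out)
		osmo_conv_vdec_free_sse3(out);
}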