/*
 * Intel SSE Viterbi decoder
 *
 * Copyright (C) 2013, 2014 Thomas Tsou <tom@tsou.cc>
 *
 * All Rights Reserved
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */

extern int sse41_supported;

/* Octo-Viterbi butterfly
 * Compute an 8-wide butterfly generating 16 path decisions and 16
 * accumulated sums. Inputs are all packed 16-bit integers in three 128-bit
 * XMM registers. Two intermediate registers are used and results are set
 * in the upper 4 registers.
 *
 * Input:
 * M0 - Path metrics 0 (packed 16-bit integers)
 * M1 - Path metrics 1 (packed 16-bit integers)
 * M2 - Branch metrics (packed 16-bit integers)
 *
 * Output:
 * M2 - Selected and accumulated path metrics 0
 * M4 - Selected and accumulated path metrics 1
 * M3 - Path selections 0
 * M1 - Path selections 1
 */
#define SSE_BUTTERFLY(M0, M1, M2, M3, M4) \
{ \
        M3 = _mm_adds_epi16(M0, M2); \
        M4 = _mm_subs_epi16(M1, M2); \
        M0 = _mm_subs_epi16(M0, M2); \
        M1 = _mm_adds_epi16(M1, M2); \
        M2 = _mm_max_epi16(M3, M4); \
        M3 = _mm_or_si128(_mm_cmpgt_epi16(M3, M4), _mm_cmpeq_epi16(M3, M4)); \
        M4 = _mm_max_epi16(M0, M1); \
        M1 = _mm_or_si128(_mm_cmpgt_epi16(M0, M1), _mm_cmpeq_epi16(M0, M1)); \
}

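/* Illustrative scalar reference (hypothetical, not used by the decoder):
 * what one of the 8 lanes of SSE_BUTTERFLY computes. The SIMD version above
 * additionally saturates the 16-bit additions and subtractions.
 */
static inline void _example_acs_butterfly_lane(int16_t m0, int16_t m1,
        int16_t bm, int16_t *new0, int16_t *new1, int16_t *dec0, int16_t *dec1)
{
        int16_t a = m0 + bm, b = m1 - bm;       /* candidates for new state 0 */
        int16_t c = m0 - bm, d = m1 + bm;       /* candidates for new state 1 */

        *new0 = a >= b ? a : b;                 /* _mm_max_epi16(M3, M4) */
        *dec0 = a >= b ? -1 : 0;                /* cmpgt | cmpeq selection mask */
        *new1 = c >= d ? c : d;                 /* _mm_max_epi16(M0, M1) */
        *dec1 = c >= d ? -1 : 0;
}
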
/* Two lane deinterleaving K = 5:
 * Take 16 interleaved 16-bit integers and deinterleave them into 2 packed
 * 128-bit registers. The operation is summarized below. Four registers are
 * used, with the lower 2 as input and the upper 2 as output.
 *
 * In - 10101010 10101010 10101010 10101010
 * Out - 00000000 11111111 00000000 11111111
 *
 * Input:
 * M0:1 - Packed 16-bit integers
 *
 * Output:
 * M2:3 - Deinterleaved packed 16-bit integers
 */
#define _I8_SHUFFLE_MASK 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0

#define SSE_DEINTERLEAVE_K5(M0, M1, M2, M3) \
{ \
        M2 = _mm_set_epi8(_I8_SHUFFLE_MASK); \
        M0 = _mm_shuffle_epi8(M0, M2); \
        M1 = _mm_shuffle_epi8(M1, M2); \
        M2 = _mm_unpacklo_epi64(M0, M1); \
        M3 = _mm_unpackhi_epi64(M0, M1); \
}

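/* Illustrative scalar reference (hypothetical, not part of the decoder):
 * the K = 5 deinterleave is equivalent to splitting 16 interleaved metrics
 * into their 8 even-indexed and 8 odd-indexed values, which is what the
 * shuffle/unpack sequence above produces in the two output registers.
 */
static inline void _example_deinterleave_k5(const int16_t in[16],
        int16_t even[8], int16_t odd[8])
{
        int i;

        for (i = 0; i < 8; i++) {
                even[i] = in[2 * i];
                odd[i] = in[2 * i + 1];
        }
}
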
/* Two lane deinterleaving K = 7:
 * Take 64 interleaved 16-bit integers and deinterleave them into 8 packed
 * 128-bit registers. The operation is summarized below. 16 registers are
 * used, with the lower 8 as input and the upper 8 as output.
 *
 * In - 10101010 10101010 10101010 10101010 ...
 * Out - 00000000 11111111 00000000 11111111 ...
 *
 * Input:
 * M0:7 - Packed 16-bit integers
 *
 * Output:
 * M8:15 - Deinterleaved packed 16-bit integers
 */
#define SSE_DEINTERLEAVE_K7(M0, M1, M2, M3, M4, M5, M6, M7, \
                M8, M9, M10, M11, M12, M13, M14, M15) \
{ \
        M8 = _mm_set_epi8(_I8_SHUFFLE_MASK); \
        M0 = _mm_shuffle_epi8(M0, M8); \
        M1 = _mm_shuffle_epi8(M1, M8); \
        M2 = _mm_shuffle_epi8(M2, M8); \
        M3 = _mm_shuffle_epi8(M3, M8); \
        M4 = _mm_shuffle_epi8(M4, M8); \
        M5 = _mm_shuffle_epi8(M5, M8); \
        M6 = _mm_shuffle_epi8(M6, M8); \
        M7 = _mm_shuffle_epi8(M7, M8); \
        M8 = _mm_unpacklo_epi64(M0, M1); \
        M9 = _mm_unpackhi_epi64(M0, M1); \
        M10 = _mm_unpacklo_epi64(M2, M3); \
        M11 = _mm_unpackhi_epi64(M2, M3); \
        M12 = _mm_unpacklo_epi64(M4, M5); \
        M13 = _mm_unpackhi_epi64(M4, M5); \
        M14 = _mm_unpacklo_epi64(M6, M7); \
        M15 = _mm_unpackhi_epi64(M6, M7); \
}

/* Generate branch metrics N = 2:
 * Compute 16 branch metrics from trellis outputs and input values.
 *
 * Input:
 * M0:3 - 16 x 2 packed 16-bit trellis outputs
 * M4 - Expanded and packed 16-bit input value
 *
 * Output:
 * M6:7 - 16 computed 16-bit branch metrics
 */
#define SSE_BRANCH_METRIC_N2(M0, M1, M2, M3, M4, M6, M7) \
{ \
        M0 = _mm_sign_epi16(M4, M0); \
        M1 = _mm_sign_epi16(M4, M1); \
        M2 = _mm_sign_epi16(M4, M2); \
        M3 = _mm_sign_epi16(M4, M3); \
        M6 = _mm_hadds_epi16(M0, M1); \
        M7 = _mm_hadds_epi16(M2, M3); \
}

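/* Illustrative scalar reference (hypothetical, not used by the decoder):
 * one N = 2 branch metric is the correlation of the two received soft
 * symbols with the corresponding pair of expanded trellis outputs
 * (nominally +1 or -1), which the sign/hadds sequence above computes with
 * saturation across all 16 metrics at once.
 */
static inline int16_t _example_branch_metric_n2(const int16_t in[2],
        const int16_t out[2])
{
        return (int16_t) (in[0] * out[0] + in[1] * out[1]);
}
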
/* Generate branch metrics N = 4:
 * Compute 8 branch metrics from trellis outputs and input values. This
 * macro is reused for N less than 4 where the extra soft input bits are
 * padded.
 *
 * Input:
 * M0:3 - 8 x 4 packed 16-bit trellis outputs
 * M4 - Expanded and packed 16-bit input value
 *
 * Output:
 * M5 - 8 computed 16-bit branch metrics
 */
#define SSE_BRANCH_METRIC_N4(M0, M1, M2, M3, M4, M5) \
{ \
        M0 = _mm_sign_epi16(M4, M0); \
        M1 = _mm_sign_epi16(M4, M1); \
        M2 = _mm_sign_epi16(M4, M2); \
        M3 = _mm_sign_epi16(M4, M3); \
        M0 = _mm_hadds_epi16(M0, M1); \
        M1 = _mm_hadds_epi16(M2, M3); \
        M5 = _mm_hadds_epi16(M0, M1); \
}

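/* Illustrative scalar reference (hypothetical): an N = 4 branch metric is
 * the correlation of four received soft symbols with the four expanded
 * trellis outputs, which the three hadds stages above accumulate for all
 * 8 metrics in parallel.
 */
static inline int16_t _example_branch_metric_n4(const int16_t in[4],
        const int16_t out[4])
{
        return (int16_t) (in[0] * out[0] + in[1] * out[1] +
                          in[2] * out[2] + in[3] * out[3]);
}
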
/* Horizontal minimum
 * Compute horizontal minimum of packed unsigned 16-bit integers and place
 * result in the low 16-bit element of the source register. Only SSE 4.1
 * has a dedicated minpos instruction. One intermediate register is used
 * if SSE 4.1 is not available. This is a destructive operation and the
 * source register is overwritten.
 *
 * Input:
 * M0 - Packed unsigned 16-bit integers
 *
 * Output:
 * M0 - Minimum value placed in low 16-bit element
 */
#if defined(HAVE_SSE4_1) || defined(HAVE_SSE41)
#define SSE_MINPOS(M0, M1) \
{ \
        if (sse41_supported) { \
                M0 = _mm_minpos_epu16(M0); \
        } else { \
                M1 = _mm_shuffle_epi32(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
                M0 = _mm_min_epi16(M0, M1); \
                M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
                M0 = _mm_min_epi16(M0, M1); \
                M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 0, 1)); \
                M0 = _mm_min_epi16(M0, M1); \
        } \
}
#else
#define SSE_MINPOS(M0, M1) \
{ \
        M1 = _mm_shuffle_epi32(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
        M0 = _mm_min_epi16(M0, M1); \
        M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
        M0 = _mm_min_epi16(M0, M1); \
        M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 0, 1)); \
        M0 = _mm_min_epi16(M0, M1); \
}
#endif

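/* Illustrative scalar reference (hypothetical, not part of the decoder):
 * the shuffle/min fallback above reduces 8 packed values to their minimum
 * in the low element, equivalent to this loop over one register's worth
 * of metrics.
 */
static inline int16_t _example_hmin8(const int16_t v[8])
{
        int16_t min = v[0];
        int i;

        for (i = 1; i < 8; i++) {
                if (v[i] < min)
                        min = v[i];
        }

        return min;
}
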
/* Normalize state metrics K = 5:
 * Compute 16-wide normalization by subtracting the smallest value from
 * all values. Inputs are 16 packed 16-bit integers across 2 XMM registers.
 * Two intermediate registers are used and normalized results are placed
 * in the originating locations.
 *
 * Input:
 * M0:1 - Path metrics 0:1 (packed 16-bit integers)
 *
 * Output:
 * M0:1 - Normalized path metrics 0:1
 */
#define SSE_NORMALIZE_K5(M0, M1, M2, M3) \
{ \
        M2 = _mm_min_epi16(M0, M1); \
        SSE_MINPOS(M2, M3) \
        SSE_BROADCAST(M2) \
        M0 = _mm_subs_epi16(M0, M2); \
        M1 = _mm_subs_epi16(M1, M2); \
}

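/* Illustrative scalar reference (hypothetical, not part of the decoder):
 * normalization subtracts the minimum metric from every metric so that the
 * accumulated sums stay within 16-bit range; the SIMD macros above do this
 * with a broadcast minimum and saturating subtraction.
 */
static inline void _example_normalize(int16_t *metrics, int num)
{
        int16_t min = metrics[0];
        int i;

        for (i = 1; i < num; i++) {
                if (metrics[i] < min)
                        min = metrics[i];
        }

        for (i = 0; i < num; i++)
                metrics[i] -= min;
}
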
/* Normalize state metrics K = 7:
 * Compute 64-wide normalization by subtracting the smallest value from
 * all values. Inputs are 8 registers of accumulated sums and 4 temporary
 * registers. Normalized results are returned in the originating locations.
 *
 * Input:
 * M0:7 - Path metrics 0:7 (packed 16-bit integers)
 *
 * Output:
 * M0:7 - Normalized path metrics 0:7
 */
#define SSE_NORMALIZE_K7(M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, M10, M11) \
{ \
        M8 = _mm_min_epi16(M0, M1); \
        M9 = _mm_min_epi16(M2, M3); \
        M10 = _mm_min_epi16(M4, M5); \
        M11 = _mm_min_epi16(M6, M7); \
        M8 = _mm_min_epi16(M8, M9); \
        M10 = _mm_min_epi16(M10, M11); \
        M8 = _mm_min_epi16(M8, M10); \
        SSE_MINPOS(M8, M9) \
        SSE_BROADCAST(M8) \
        M0 = _mm_subs_epi16(M0, M8); \
        M1 = _mm_subs_epi16(M1, M8); \
        M2 = _mm_subs_epi16(M2, M8); \
        M3 = _mm_subs_epi16(M3, M8); \
        M4 = _mm_subs_epi16(M4, M8); \
        M5 = _mm_subs_epi16(M5, M8); \
        M6 = _mm_subs_epi16(M6, M8); \
        M7 = _mm_subs_epi16(M7, M8); \
}

/* Combined BMU/PMU (K=5, N=2)
 * Compute branch metrics followed by path metrics for the half rate,
 * 16-state trellis. 8 butterflies are computed. Accumulated path sums are
 * not preserved; they are read from and written back to the same memory
 * location. Normalize the sums if required.
 */
__always_inline static void _sse_metrics_k5_n2(const int16_t *val,
        const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
        __m128i m0, m1, m2, m3, m4, m5, m6;

        /* (BMU) Load input sequence */
        m2 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

        /* (BMU) Load trellis outputs */
        m0 = _mm_load_si128((__m128i *) &out[0]);
        m1 = _mm_load_si128((__m128i *) &out[8]);

        /* (BMU) Compute branch metrics */
        m0 = _mm_sign_epi16(m2, m0);
        m1 = _mm_sign_epi16(m2, m1);
        m2 = _mm_hadds_epi16(m0, m1);

        /* (PMU) Load accumulated path metrics */
        m0 = _mm_load_si128((__m128i *) &sums[0]);
        m1 = _mm_load_si128((__m128i *) &sums[8]);

        SSE_DEINTERLEAVE_K5(m0, m1, m3, m4)

        /* (PMU) Butterflies: 0-7 */
        SSE_BUTTERFLY(m3, m4, m2, m5, m6)

        if (norm)
                SSE_NORMALIZE_K5(m2, m6, m0, m1)

        _mm_store_si128((__m128i *) &sums[0], m2);
        _mm_store_si128((__m128i *) &sums[8], m6);
        _mm_store_si128((__m128i *) &paths[0], m5);
        _mm_store_si128((__m128i *) &paths[8], m4);
}

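/* Hedged usage sketch (hypothetical, not from this library): one trellis
 * step of the K = 5, N = 2 kernel. The caller is assumed to advance the
 * soft input by N values and the path-decision output by the 16 trellis
 * states per decoded symbol; the normalization interval chosen here is
 * arbitrary.
 */
static inline void _example_k5_n2_step(const int16_t *val, const int16_t *out,
        int16_t *sums, int16_t *paths, int i)
{
        _sse_metrics_k5_n2(&val[2 * i], out, sums, &paths[16 * i],
                           (i % 32) == 0);
}
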
/* Combined BMU/PMU (K=5, N=3 and N=4)
 * Compute branch metrics followed by path metrics for the 16-state trellis
 * at rates up to 1/4. 8 butterflies are computed. The input sequence is
 * read four 16-bit values at a time, and extra values should be set to
 * zero for rates other than 1/4. Normally only rates 1/3 and 1/4 are used,
 * as there is a dedicated implementation for rate 1/2.
 */
__always_inline static void _sse_metrics_k5_n4(const int16_t *val,
        const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
        __m128i m0, m1, m2, m3, m4, m5, m6;

        /* (BMU) Load input sequence */
        m4 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

        /* (BMU) Load trellis outputs */
        m0 = _mm_load_si128((__m128i *) &out[0]);
        m1 = _mm_load_si128((__m128i *) &out[8]);
        m2 = _mm_load_si128((__m128i *) &out[16]);
        m3 = _mm_load_si128((__m128i *) &out[24]);

        SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m4, m2)

        /* (PMU) Load accumulated path metrics */
        m0 = _mm_load_si128((__m128i *) &sums[0]);
        m1 = _mm_load_si128((__m128i *) &sums[8]);

        SSE_DEINTERLEAVE_K5(m0, m1, m3, m4)

        /* (PMU) Butterflies: 0-7 */
        SSE_BUTTERFLY(m3, m4, m2, m5, m6)

        if (norm)
                SSE_NORMALIZE_K5(m2, m6, m0, m1)

        _mm_store_si128((__m128i *) &sums[0], m2);
        _mm_store_si128((__m128i *) &sums[8], m6);
        _mm_store_si128((__m128i *) &paths[0], m5);
        _mm_store_si128((__m128i *) &paths[8], m4);
}

/* Combined BMU/PMU (K=7, N=2)
 * Compute branch metrics followed by path metrics for the half rate,
 * 64-state trellis. 32 butterfly operations are computed. Deinterleaving
 * the path metrics requires use of the full SSE register file, so the sums
 * are separated before the branch metrics are computed to avoid register
 * spilling.
 */
__always_inline static void _sse_metrics_k7_n2(const int16_t *val,
        const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
        __m128i m0, m1, m2, m3, m4, m5, m6, m7, m8,
                m9, m10, m11, m12, m13, m14, m15;

        /* (PMU) Load accumulated path metrics */
        m0 = _mm_load_si128((__m128i *) &sums[0]);
        m1 = _mm_load_si128((__m128i *) &sums[8]);
        m2 = _mm_load_si128((__m128i *) &sums[16]);
        m3 = _mm_load_si128((__m128i *) &sums[24]);
        m4 = _mm_load_si128((__m128i *) &sums[32]);
        m5 = _mm_load_si128((__m128i *) &sums[40]);
        m6 = _mm_load_si128((__m128i *) &sums[48]);
        m7 = _mm_load_si128((__m128i *) &sums[56]);

        /* (PMU) Deinterleave to even-odd registers */
        SSE_DEINTERLEAVE_K7(m0, m1, m2, m3, m4, m5, m6, m7,
                        m8, m9, m10, m11, m12, m13, m14, m15)

        /* (BMU) Load input symbols */
        m7 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

        /* (BMU) Load trellis outputs */
        m0 = _mm_load_si128((__m128i *) &out[0]);
        m1 = _mm_load_si128((__m128i *) &out[8]);
        m2 = _mm_load_si128((__m128i *) &out[16]);
        m3 = _mm_load_si128((__m128i *) &out[24]);

        SSE_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m4, m5)

        m0 = _mm_load_si128((__m128i *) &out[32]);
        m1 = _mm_load_si128((__m128i *) &out[40]);
        m2 = _mm_load_si128((__m128i *) &out[48]);
        m3 = _mm_load_si128((__m128i *) &out[56]);

        SSE_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m6, m7)

        /* (PMU) Butterflies: 0-15 */
        SSE_BUTTERFLY(m8, m9, m4, m0, m1)
        SSE_BUTTERFLY(m10, m11, m5, m2, m3)

        _mm_store_si128((__m128i *) &paths[0], m0);
        _mm_store_si128((__m128i *) &paths[8], m2);
        _mm_store_si128((__m128i *) &paths[32], m9);
        _mm_store_si128((__m128i *) &paths[40], m11);

        /* (PMU) Butterflies: 16-31 */
        SSE_BUTTERFLY(m12, m13, m6, m0, m2)
        SSE_BUTTERFLY(m14, m15, m7, m9, m11)

        _mm_store_si128((__m128i *) &paths[16], m0);
        _mm_store_si128((__m128i *) &paths[24], m9);
        _mm_store_si128((__m128i *) &paths[48], m13);
        _mm_store_si128((__m128i *) &paths[56], m15);

        if (norm)
                SSE_NORMALIZE_K7(m4, m1, m5, m3, m6, m2,
                                m7, m11, m0, m8, m9, m10)

        _mm_store_si128((__m128i *) &sums[0], m4);
        _mm_store_si128((__m128i *) &sums[8], m5);
        _mm_store_si128((__m128i *) &sums[16], m6);
        _mm_store_si128((__m128i *) &sums[24], m7);
        _mm_store_si128((__m128i *) &sums[32], m1);
        _mm_store_si128((__m128i *) &sums[40], m3);
        _mm_store_si128((__m128i *) &sums[48], m2);
        _mm_store_si128((__m128i *) &sums[56], m11);
}

/* Combined BMU/PMU (K=7, N=3 and N=4)
 * Compute branch metrics followed by path metrics for the 64-state trellis
 * at rates up to 1/4. 32 butterfly operations are computed. Path metrics
 * are deinterleaved before the branch metrics are computed, as in the half
 * rate case.
 */
__always_inline static void _sse_metrics_k7_n4(const int16_t *val,
        const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
        __m128i m0, m1, m2, m3, m4, m5, m6, m7;
        __m128i m8, m9, m10, m11, m12, m13, m14, m15;

        /* (PMU) Load accumulated path metrics */
        m0 = _mm_load_si128((__m128i *) &sums[0]);
        m1 = _mm_load_si128((__m128i *) &sums[8]);
        m2 = _mm_load_si128((__m128i *) &sums[16]);
        m3 = _mm_load_si128((__m128i *) &sums[24]);
        m4 = _mm_load_si128((__m128i *) &sums[32]);
        m5 = _mm_load_si128((__m128i *) &sums[40]);
        m6 = _mm_load_si128((__m128i *) &sums[48]);
        m7 = _mm_load_si128((__m128i *) &sums[56]);

        /* (PMU) Deinterleave into even and odd packed registers */
        SSE_DEINTERLEAVE_K7(m0, m1, m2, m3, m4, m5, m6, m7,
                        m8, m9, m10, m11, m12, m13, m14, m15)

        /* (BMU) Load input symbols */
        m7 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

        /* (BMU) Load and compute branch metrics */
        m0 = _mm_load_si128((__m128i *) &out[0]);
        m1 = _mm_load_si128((__m128i *) &out[8]);
        m2 = _mm_load_si128((__m128i *) &out[16]);
        m3 = _mm_load_si128((__m128i *) &out[24]);

        SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m4)

        m0 = _mm_load_si128((__m128i *) &out[32]);
        m1 = _mm_load_si128((__m128i *) &out[40]);
        m2 = _mm_load_si128((__m128i *) &out[48]);
        m3 = _mm_load_si128((__m128i *) &out[56]);

        SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m5)

        m0 = _mm_load_si128((__m128i *) &out[64]);
        m1 = _mm_load_si128((__m128i *) &out[72]);
        m2 = _mm_load_si128((__m128i *) &out[80]);
        m3 = _mm_load_si128((__m128i *) &out[88]);

        SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m6)

        m0 = _mm_load_si128((__m128i *) &out[96]);
        m1 = _mm_load_si128((__m128i *) &out[104]);
        m2 = _mm_load_si128((__m128i *) &out[112]);
        m3 = _mm_load_si128((__m128i *) &out[120]);

        SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m7)

        /* (PMU) Butterflies: 0-15 */
        SSE_BUTTERFLY(m8, m9, m4, m0, m1)
        SSE_BUTTERFLY(m10, m11, m5, m2, m3)

        _mm_store_si128((__m128i *) &paths[0], m0);
        _mm_store_si128((__m128i *) &paths[8], m2);
        _mm_store_si128((__m128i *) &paths[32], m9);
        _mm_store_si128((__m128i *) &paths[40], m11);

        /* (PMU) Butterflies: 16-31 */
        SSE_BUTTERFLY(m12, m13, m6, m0, m2)
        SSE_BUTTERFLY(m14, m15, m7, m9, m11)

        _mm_store_si128((__m128i *) &paths[16], m0);
        _mm_store_si128((__m128i *) &paths[24], m9);
        _mm_store_si128((__m128i *) &paths[48], m13);
        _mm_store_si128((__m128i *) &paths[56], m15);

        if (norm)
                SSE_NORMALIZE_K7(m4, m1, m5, m3, m6, m2,
                                m7, m11, m0, m8, m9, m10)

        _mm_store_si128((__m128i *) &sums[0], m4);
        _mm_store_si128((__m128i *) &sums[8], m5);
        _mm_store_si128((__m128i *) &sums[16], m6);
        _mm_store_si128((__m128i *) &sums[24], m7);
        _mm_store_si128((__m128i *) &sums[32], m1);
        _mm_store_si128((__m128i *) &sums[40], m3);
        _mm_store_si128((__m128i *) &sums[48], m2);
        _mm_store_si128((__m128i *) &sums[56], m11);
}