/*
 * Intel SSE Viterbi decoder
 *
 * Copyright (C) 2013, 2014 Thomas Tsou <tom@tsou.cc>
 *
 * All Rights Reserved
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */

#include <stdint.h>
#include <emmintrin.h>
#include <tmmintrin.h>
#include <xmmintrin.h>

#include "config.h"

#if defined(HAVE_SSE4_1) || defined(HAVE_SSE41)
	#include <smmintrin.h>
#endif

#ifdef HAVE_AVX2
	#include <immintrin.h>
#endif

#define SSE_ALIGN 16

extern int sse41_supported;
extern int sse3_supported;
extern int avx2_supported;

/* Octo-Viterbi butterfly
 * Compute an 8-wide butterfly generating 16 path decisions and 16
 * accumulated sums. Inputs are packed 16-bit integers in three 128-bit XMM
 * registers. Two intermediate registers are used and results are set in
 * the upper 4 registers.
 *
 * Input:
 * M0 - Path metrics 0 (packed 16-bit integers)
 * M1 - Path metrics 1 (packed 16-bit integers)
 * M2 - Branch metrics (packed 16-bit integers)
 *
 * Output:
 * M2 - Selected and accumulated path metrics 0
 * M4 - Selected and accumulated path metrics 1
 * M3 - Path selections 0
 * M1 - Path selections 1
 */
#define SSE_BUTTERFLY(M0, M1, M2, M3, M4) \
{ \
	M3 = _mm_adds_epi16(M0, M2); \
	M4 = _mm_subs_epi16(M1, M2); \
	M0 = _mm_subs_epi16(M0, M2); \
	M1 = _mm_adds_epi16(M1, M2); \
	M2 = _mm_max_epi16(M3, M4); \
	M3 = _mm_or_si128(_mm_cmpgt_epi16(M3, M4), _mm_cmpeq_epi16(M3, M4)); \
	M4 = _mm_max_epi16(M0, M1); \
	M1 = _mm_or_si128(_mm_cmpgt_epi16(M0, M1), _mm_cmpeq_epi16(M0, M1)); \
}
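
/* Illustrative scalar sketch of one add-compare-select step (hypothetical
 * helper, not used by the decoder): given one pair of path metrics and one
 * branch metric, SSE_BUTTERFLY performs the equivalent of the code below
 * across 8 state pairs at once. Saturation of the vector adds/subs is
 * ignored here; the selection values are all-ones (-1) or zero, matching
 * the cmpgt/cmpeq combination above.
 */
static inline void acs_butterfly_ref(int16_t pm0, int16_t pm1, int16_t bm,
				     int16_t *new_pm0, int16_t *new_pm1,
				     int16_t *sel0, int16_t *sel1)
{
	int16_t a = pm0 + bm;
	int16_t b = pm1 - bm;
	int16_t c = pm0 - bm;
	int16_t d = pm1 + bm;

	*new_pm0 = (a >= b) ? a : b;	/* M2 = max(M3, M4) */
	*sel0 = (a >= b) ? -1 : 0;	/* M3 = (M3 > M4) | (M3 == M4) */
	*new_pm1 = (c >= d) ? c : d;	/* M4 = max(M0, M1) */
	*sel1 = (c >= d) ? -1 : 0;	/* M1 = (M0 > M1) | (M0 == M1) */
}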

/* Two lane deinterleaving K = 5:
 * Take 16 interleaved 16-bit integers and deinterleave to 2 packed 128-bit
 * registers. The operation is summarized below. Four registers are used
 * with the lower 2 as input and upper 2 as output.
 *
 * In  - 10101010 10101010 10101010 10101010
 * Out - 00000000 11111111 00000000 11111111
 *
 * Input:
 * M0:1 - Packed 16-bit integers
 *
 * Output:
 * M2:3 - Deinterleaved packed 16-bit integers
 */
#define _I8_SHUFFLE_MASK 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0

#define SSE_DEINTERLEAVE_K5(M0, M1, M2, M3) \
{ \
	M2 = _mm_set_epi8(_I8_SHUFFLE_MASK); \
	M0 = _mm_shuffle_epi8(M0, M2); \
	M1 = _mm_shuffle_epi8(M1, M2); \
	M2 = _mm_unpacklo_epi64(M0, M1); \
	M3 = _mm_unpackhi_epi64(M0, M1); \
}
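
/* Illustrative scalar sketch of the two-lane deinterleave (hypothetical
 * helper, not used by the decoder): even-indexed 16-bit elements of the
 * 16-element input land in the first output lane and odd-indexed elements
 * in the second, which is what the byte shuffle and 64-bit unpacks above
 * compute for a register pair.
 */
static inline void deinterleave_k5_ref(const int16_t in[16],
				       int16_t even[8], int16_t odd[8])
{
	int i;

	for (i = 0; i < 8; i++) {
		even[i] = in[2 * i];
		odd[i] = in[2 * i + 1];
	}
}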

/* Two lane deinterleaving K = 7:
 * Take 64 interleaved 16-bit integers and deinterleave to 8 packed 128-bit
 * registers. The operation is summarized below. 16 registers are used with
 * the lower 8 as input and upper 8 as output.
 *
 * In  - 10101010 10101010 10101010 10101010 ...
 * Out - 00000000 11111111 00000000 11111111 ...
 *
 * Input:
 * M0:7 - Packed 16-bit integers
 *
 * Output:
 * M8:15 - Deinterleaved packed 16-bit integers
 */
#define SSE_DEINTERLEAVE_K7(M0, M1, M2, M3, M4, M5, M6, M7, \
		M8, M9, M10, M11, M12, M13, M14, M15) \
{ \
	M8 = _mm_set_epi8(_I8_SHUFFLE_MASK); \
	M0 = _mm_shuffle_epi8(M0, M8); \
	M1 = _mm_shuffle_epi8(M1, M8); \
	M2 = _mm_shuffle_epi8(M2, M8); \
	M3 = _mm_shuffle_epi8(M3, M8); \
	M4 = _mm_shuffle_epi8(M4, M8); \
	M5 = _mm_shuffle_epi8(M5, M8); \
	M6 = _mm_shuffle_epi8(M6, M8); \
	M7 = _mm_shuffle_epi8(M7, M8); \
	M8 = _mm_unpacklo_epi64(M0, M1); \
	M9 = _mm_unpackhi_epi64(M0, M1); \
	M10 = _mm_unpacklo_epi64(M2, M3); \
	M11 = _mm_unpackhi_epi64(M2, M3); \
	M12 = _mm_unpacklo_epi64(M4, M5); \
	M13 = _mm_unpackhi_epi64(M4, M5); \
	M14 = _mm_unpacklo_epi64(M6, M7); \
	M15 = _mm_unpackhi_epi64(M6, M7); \
}

/* Generate branch metrics N = 2:
 * Compute 16 branch metrics from trellis outputs and input values.
 *
 * Input:
 * M0:3 - 16 x 2 packed 16-bit trellis outputs
 * M4 - Expanded and packed 16-bit input value
 *
 * Output:
 * M6:7 - 16 computed 16-bit branch metrics
 */
#define SSE_BRANCH_METRIC_N2(M0, M1, M2, M3, M4, M6, M7) \
{ \
	M0 = _mm_sign_epi16(M4, M0); \
	M1 = _mm_sign_epi16(M4, M1); \
	M2 = _mm_sign_epi16(M4, M2); \
	M3 = _mm_sign_epi16(M4, M3); \
	M6 = _mm_hadds_epi16(M0, M1); \
	M7 = _mm_hadds_epi16(M2, M3); \
}

/* Generate branch metrics N = 4:
 * Compute 8 branch metrics from trellis outputs and input values. This
 * macro is also reused for N less than 4, in which case the unused soft
 * input values are padded with zeros.
 *
 * Input:
 * M0:3 - 8 x 4 packed 16-bit trellis outputs
 * M4 - Expanded and packed 16-bit input value
 *
 * Output:
 * M5 - 8 computed 16-bit branch metrics
 */
#define SSE_BRANCH_METRIC_N4(M0, M1, M2, M3, M4, M5) \
{ \
	M0 = _mm_sign_epi16(M4, M0); \
	M1 = _mm_sign_epi16(M4, M1); \
	M2 = _mm_sign_epi16(M4, M2); \
	M3 = _mm_sign_epi16(M4, M3); \
	M0 = _mm_hadds_epi16(M0, M1); \
	M1 = _mm_hadds_epi16(M2, M3); \
	M5 = _mm_hadds_epi16(M0, M1); \
}
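
/* Illustrative scalar sketch of a single branch metric (hypothetical
 * helper, not used by the decoder): the trellis outputs act as signs, so
 * each metric is the correlation of the received soft values with one
 * expected output sequence. The vector macros above compute 16 (N = 2) or
 * 8 (N = 4) such metrics at once; saturation of hadds is ignored here.
 */
static inline int16_t branch_metric_ref(const int16_t *val,
					const int16_t *out, int n)
{
	int16_t metric = 0;
	int i;

	for (i = 0; i < n; i++) {
		if (out[i] > 0)
			metric += val[i];
		else if (out[i] < 0)
			metric -= val[i];
	}

	return metric;
}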

/* Broadcast 16-bit integer
 * Replicate the low 16-bit element across all elements of the 128-bit SSE
 * register. Only AVX2 has a dedicated broadcast instruction; repeated
 * unpacks are used on SSE-only architectures. This is a destructive
 * operation and the source register is overwritten.
 *
 * Input:
 * M0 - Low 16-bit element is read
 *
 * Output:
 * M0 - Contains broadcasted values
 */
#ifdef HAVE_AVX2
#define SSE_BROADCAST(M0) \
{ \
	if (avx2_supported) { \
		M0 = _mm_broadcastw_epi16(M0); \
	} else { \
		M0 = _mm_unpacklo_epi16(M0, M0); \
		M0 = _mm_unpacklo_epi32(M0, M0); \
		M0 = _mm_unpacklo_epi64(M0, M0); \
	} \
}
#else
#define SSE_BROADCAST(M0) \
{ \
	M0 = _mm_unpacklo_epi16(M0, M0); \
	M0 = _mm_unpacklo_epi32(M0, M0); \
	M0 = _mm_unpacklo_epi64(M0, M0); \
}
#endif

/* Horizontal minimum
 * Compute the horizontal minimum of packed unsigned 16-bit integers and
 * place the result in the low 16-bit element of the source register. Only
 * SSE 4.1 has a dedicated minpos instruction. One intermediate register is
 * used if SSE 4.1 is not available. This is a destructive operation and
 * the source register is overwritten.
 *
 * Input:
 * M0 - Packed unsigned 16-bit integers
 *
 * Output:
 * M0 - Minimum value placed in low 16-bit element
 */
#if defined(HAVE_SSE4_1) || defined(HAVE_SSE41)
#define SSE_MINPOS(M0, M1) \
{ \
	if (sse41_supported) { \
		M0 = _mm_minpos_epu16(M0); \
	} else { \
		M1 = _mm_shuffle_epi32(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
		M0 = _mm_min_epi16(M0, M1); \
		M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
		M0 = _mm_min_epi16(M0, M1); \
		M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 0, 1)); \
		M0 = _mm_min_epi16(M0, M1); \
	} \
}
#else
#define SSE_MINPOS(M0, M1) \
{ \
	M1 = _mm_shuffle_epi32(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
	M0 = _mm_min_epi16(M0, M1); \
	M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
	M0 = _mm_min_epi16(M0, M1); \
	M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 0, 1)); \
	M0 = _mm_min_epi16(M0, M1); \
}
#endif

/* Normalize state metrics K = 5:
 * Compute 16-wide normalization by subtracting the smallest value from
 * all values. Inputs are 16 packed 16-bit integers across 2 XMM registers.
 * Two intermediate registers are used and normalized results are placed
 * in the originating locations.
 *
 * Input:
 * M0:1 - Path metrics 0:1 (packed 16-bit integers)
 *
 * Output:
 * M0:1 - Normalized path metrics 0:1
 */
#define SSE_NORMALIZE_K5(M0, M1, M2, M3) \
{ \
	M2 = _mm_min_epi16(M0, M1); \
	SSE_MINPOS(M2, M3) \
	SSE_BROADCAST(M2) \
	M0 = _mm_subs_epi16(M0, M2); \
	M1 = _mm_subs_epi16(M1, M2); \
}
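
/* Illustrative scalar sketch of the K = 5 normalization (hypothetical
 * helper, not used by the decoder): find the smallest of the 16 path
 * metrics and subtract it from all of them, which is what the pairwise
 * minimum, minpos/broadcast and saturating subtracts above implement on
 * two registers.
 */
static inline void normalize_k5_ref(int16_t sums[16])
{
	int16_t min = sums[0];
	int i;

	for (i = 1; i < 16; i++) {
		if (sums[i] < min)
			min = sums[i];
	}

	for (i = 0; i < 16; i++)
		sums[i] -= min;
}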

/* Normalize state metrics K = 7:
 * Compute 64-wide normalization by subtracting the smallest value from
 * all values. Inputs are 8 registers of accumulated sums and 4 temporary
 * registers. Normalized results are returned in the originating locations.
 *
 * Input:
 * M0:7 - Path metrics 0:7 (packed 16-bit integers)
 *
 * Output:
 * M0:7 - Normalized path metrics 0:7
 */
#define SSE_NORMALIZE_K7(M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, M10, M11) \
{ \
	M8 = _mm_min_epi16(M0, M1); \
	M9 = _mm_min_epi16(M2, M3); \
	M10 = _mm_min_epi16(M4, M5); \
	M11 = _mm_min_epi16(M6, M7); \
	M8 = _mm_min_epi16(M8, M9); \
	M10 = _mm_min_epi16(M10, M11); \
	M8 = _mm_min_epi16(M8, M10); \
	SSE_MINPOS(M8, M9) \
	SSE_BROADCAST(M8) \
	M0 = _mm_subs_epi16(M0, M8); \
	M1 = _mm_subs_epi16(M1, M8); \
	M2 = _mm_subs_epi16(M2, M8); \
	M3 = _mm_subs_epi16(M3, M8); \
	M4 = _mm_subs_epi16(M4, M8); \
	M5 = _mm_subs_epi16(M5, M8); \
	M6 = _mm_subs_epi16(M6, M8); \
	M7 = _mm_subs_epi16(M7, M8); \
}

/* Combined BMU/PMU (K=5, N=2)
 * Compute branch metrics followed by path metrics for the half rate
 * 16-state trellis. 8 butterflies are computed. Accumulated path sums are
 * not preserved; they are read from and written back to the same memory
 * location. Sums are normalized if requested.
 */
__always_inline static void _sse_metrics_k5_n2(const int16_t *val,
		const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	__m128i m0, m1, m2, m3, m4, m5, m6;

	/* (BMU) Load input sequence */
	m2 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

	/* (BMU) Load trellis outputs */
	m0 = _mm_load_si128((__m128i *) &out[0]);
	m1 = _mm_load_si128((__m128i *) &out[8]);

	/* (BMU) Compute branch metrics */
	m0 = _mm_sign_epi16(m2, m0);
	m1 = _mm_sign_epi16(m2, m1);
	m2 = _mm_hadds_epi16(m0, m1);

	/* (PMU) Load accumulated path metrics */
	m0 = _mm_load_si128((__m128i *) &sums[0]);
	m1 = _mm_load_si128((__m128i *) &sums[8]);

	SSE_DEINTERLEAVE_K5(m0, m1, m3, m4)

	/* (PMU) Butterflies: 0-7 */
	SSE_BUTTERFLY(m3, m4, m2, m5, m6)

	if (norm)
		SSE_NORMALIZE_K5(m2, m6, m0, m1)

	_mm_store_si128((__m128i *) &sums[0], m2);
	_mm_store_si128((__m128i *) &sums[8], m6);
	_mm_store_si128((__m128i *) &paths[0], m5);
	_mm_store_si128((__m128i *) &paths[8], m4);
}

/* Combined BMU/PMU (K=5, N=3 and N=4)
 * Compute branch metrics followed by path metrics for the 16-state trellis
 * at rates up to 1/4. 8 butterflies are computed. The input sequence is
 * read four 16-bit values at a time, and extra values should be set to
 * zero for rates other than 1/4. Normally only rates 1/3 and 1/4 are used,
 * as there is a dedicated implementation for rate 1/2.
 */
__always_inline static void _sse_metrics_k5_n4(const int16_t *val,
		const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	__m128i m0, m1, m2, m3, m4, m5, m6;

	/* (BMU) Load input sequence */
	m4 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

	/* (BMU) Load trellis outputs */
	m0 = _mm_load_si128((__m128i *) &out[0]);
	m1 = _mm_load_si128((__m128i *) &out[8]);
	m2 = _mm_load_si128((__m128i *) &out[16]);
	m3 = _mm_load_si128((__m128i *) &out[24]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m4, m2)

	/* (PMU) Load accumulated path metrics */
	m0 = _mm_load_si128((__m128i *) &sums[0]);
	m1 = _mm_load_si128((__m128i *) &sums[8]);

	SSE_DEINTERLEAVE_K5(m0, m1, m3, m4)

	/* (PMU) Butterflies: 0-7 */
	SSE_BUTTERFLY(m3, m4, m2, m5, m6)

	if (norm)
		SSE_NORMALIZE_K5(m2, m6, m0, m1)

	_mm_store_si128((__m128i *) &sums[0], m2);
	_mm_store_si128((__m128i *) &sums[8], m6);
	_mm_store_si128((__m128i *) &paths[0], m5);
	_mm_store_si128((__m128i *) &paths[8], m4);
}

/* Combined BMU/PMU (K=7, N=2)
 * Compute branch metrics followed by path metrics for the half rate
 * 64-state trellis. 32 butterfly operations are computed. Deinterleaving
 * the path metrics requires the full SSE register file, so the sums are
 * deinterleaved before the branch metrics are computed to avoid register
 * spilling.
 */
__always_inline static void _sse_metrics_k7_n2(const int16_t *val,
		const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	__m128i m0, m1, m2, m3, m4, m5, m6, m7, m8,
		m9, m10, m11, m12, m13, m14, m15;

	/* (PMU) Load accumulated path metrics */
	m0 = _mm_load_si128((__m128i *) &sums[0]);
	m1 = _mm_load_si128((__m128i *) &sums[8]);
	m2 = _mm_load_si128((__m128i *) &sums[16]);
	m3 = _mm_load_si128((__m128i *) &sums[24]);
	m4 = _mm_load_si128((__m128i *) &sums[32]);
	m5 = _mm_load_si128((__m128i *) &sums[40]);
	m6 = _mm_load_si128((__m128i *) &sums[48]);
	m7 = _mm_load_si128((__m128i *) &sums[56]);

	/* (PMU) Deinterleave to even-odd registers */
	SSE_DEINTERLEAVE_K7(m0, m1, m2, m3, m4, m5, m6, m7,
			    m8, m9, m10, m11, m12, m13, m14, m15)

	/* (BMU) Load input symbols */
	m7 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

	/* (BMU) Load trellis outputs */
	m0 = _mm_load_si128((__m128i *) &out[0]);
	m1 = _mm_load_si128((__m128i *) &out[8]);
	m2 = _mm_load_si128((__m128i *) &out[16]);
	m3 = _mm_load_si128((__m128i *) &out[24]);

	SSE_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m4, m5)

	m0 = _mm_load_si128((__m128i *) &out[32]);
	m1 = _mm_load_si128((__m128i *) &out[40]);
	m2 = _mm_load_si128((__m128i *) &out[48]);
	m3 = _mm_load_si128((__m128i *) &out[56]);

	SSE_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m6, m7)

	/* (PMU) Butterflies: 0-15 */
	SSE_BUTTERFLY(m8, m9, m4, m0, m1)
	SSE_BUTTERFLY(m10, m11, m5, m2, m3)

	_mm_store_si128((__m128i *) &paths[0], m0);
	_mm_store_si128((__m128i *) &paths[8], m2);
	_mm_store_si128((__m128i *) &paths[32], m9);
	_mm_store_si128((__m128i *) &paths[40], m11);

	/* (PMU) Butterflies: 16-31 */
	SSE_BUTTERFLY(m12, m13, m6, m0, m2)
	SSE_BUTTERFLY(m14, m15, m7, m9, m11)

	_mm_store_si128((__m128i *) &paths[16], m0);
	_mm_store_si128((__m128i *) &paths[24], m9);
	_mm_store_si128((__m128i *) &paths[48], m13);
	_mm_store_si128((__m128i *) &paths[56], m15);

	if (norm)
		SSE_NORMALIZE_K7(m4, m1, m5, m3, m6, m2,
				 m7, m11, m0, m8, m9, m10)

	_mm_store_si128((__m128i *) &sums[0], m4);
	_mm_store_si128((__m128i *) &sums[8], m5);
	_mm_store_si128((__m128i *) &sums[16], m6);
	_mm_store_si128((__m128i *) &sums[24], m7);
	_mm_store_si128((__m128i *) &sums[32], m1);
	_mm_store_si128((__m128i *) &sums[40], m3);
	_mm_store_si128((__m128i *) &sums[48], m2);
	_mm_store_si128((__m128i *) &sums[56], m11);
}

/* Combined BMU/PMU (K=7, N=3 and N=4)
 * Compute branch metrics followed by path metrics for the 64-state trellis
 * at rates up to 1/4. 32 butterfly operations are computed. Path metrics
 * are deinterleaved before the branch metrics are computed, as in the half
 * rate case.
 */
__always_inline static void _sse_metrics_k7_n4(const int16_t *val,
		const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	__m128i m0, m1, m2, m3, m4, m5, m6, m7;
	__m128i m8, m9, m10, m11, m12, m13, m14, m15;

	/* (PMU) Load accumulated path metrics */
	m0 = _mm_load_si128((__m128i *) &sums[0]);
	m1 = _mm_load_si128((__m128i *) &sums[8]);
	m2 = _mm_load_si128((__m128i *) &sums[16]);
	m3 = _mm_load_si128((__m128i *) &sums[24]);
	m4 = _mm_load_si128((__m128i *) &sums[32]);
	m5 = _mm_load_si128((__m128i *) &sums[40]);
	m6 = _mm_load_si128((__m128i *) &sums[48]);
	m7 = _mm_load_si128((__m128i *) &sums[56]);

	/* (PMU) Deinterleave into even and odd packed registers */
	SSE_DEINTERLEAVE_K7(m0, m1, m2, m3, m4, m5, m6, m7,
			    m8, m9, m10, m11, m12, m13, m14, m15)

	/* (BMU) Load input sequence */
	m7 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

	/* (BMU) Load and compute branch metrics */
	m0 = _mm_load_si128((__m128i *) &out[0]);
	m1 = _mm_load_si128((__m128i *) &out[8]);
	m2 = _mm_load_si128((__m128i *) &out[16]);
	m3 = _mm_load_si128((__m128i *) &out[24]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m4)

	m0 = _mm_load_si128((__m128i *) &out[32]);
	m1 = _mm_load_si128((__m128i *) &out[40]);
	m2 = _mm_load_si128((__m128i *) &out[48]);
	m3 = _mm_load_si128((__m128i *) &out[56]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m5)

	m0 = _mm_load_si128((__m128i *) &out[64]);
	m1 = _mm_load_si128((__m128i *) &out[72]);
	m2 = _mm_load_si128((__m128i *) &out[80]);
	m3 = _mm_load_si128((__m128i *) &out[88]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m6)

	m0 = _mm_load_si128((__m128i *) &out[96]);
	m1 = _mm_load_si128((__m128i *) &out[104]);
	m2 = _mm_load_si128((__m128i *) &out[112]);
	m3 = _mm_load_si128((__m128i *) &out[120]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m7)

	/* (PMU) Butterflies: 0-15 */
	SSE_BUTTERFLY(m8, m9, m4, m0, m1)
	SSE_BUTTERFLY(m10, m11, m5, m2, m3)

	_mm_store_si128((__m128i *) &paths[0], m0);
	_mm_store_si128((__m128i *) &paths[8], m2);
	_mm_store_si128((__m128i *) &paths[32], m9);
	_mm_store_si128((__m128i *) &paths[40], m11);

	/* (PMU) Butterflies: 16-31 */
	SSE_BUTTERFLY(m12, m13, m6, m0, m2)
	SSE_BUTTERFLY(m14, m15, m7, m9, m11)

	_mm_store_si128((__m128i *) &paths[16], m0);
	_mm_store_si128((__m128i *) &paths[24], m9);
	_mm_store_si128((__m128i *) &paths[48], m13);
	_mm_store_si128((__m128i *) &paths[56], m15);

	if (norm)
		SSE_NORMALIZE_K7(m4, m1, m5, m3, m6, m2,
				 m7, m11, m0, m8, m9, m10)

	_mm_store_si128((__m128i *) &sums[0], m4);
	_mm_store_si128((__m128i *) &sums[8], m5);
	_mm_store_si128((__m128i *) &sums[16], m6);
	_mm_store_si128((__m128i *) &sums[24], m7);
	_mm_store_si128((__m128i *) &sums[32], m1);
	_mm_store_si128((__m128i *) &sums[40], m3);
	_mm_store_si128((__m128i *) &sums[48], m2);
	_mm_store_si128((__m128i *) &sums[56], m11);
}

/* Aligned Memory Allocator
 * SSE requires 16-byte memory alignment. We store the relevant trellis
 * values (accumulated sums, outputs, and path decisions) as 16-bit signed
 * integers, so the allocated memory is cast accordingly.
 */
__attribute__ ((visibility("hidden")))
int16_t *osmo_conv_vdec_malloc_sse3(size_t n)
{
	return (int16_t *) _mm_malloc(sizeof(int16_t) * n, SSE_ALIGN);
}

__attribute__ ((visibility("hidden")))
void osmo_conv_vdec_free_sse3(int16_t *ptr)
{
	_mm_free(ptr);
}

__attribute__ ((visibility("hidden")))
void osmo_conv_gen_metrics_k5_n2_sse(const int8_t *val, const int16_t *out,
		int16_t *sums, int16_t *paths, int norm)
{
	const int16_t _val[4] = { val[0], val[1], val[0], val[1] };

	_sse_metrics_k5_n2(_val, out, sums, paths, norm);
}

__attribute__ ((visibility("hidden")))
void osmo_conv_gen_metrics_k5_n3_sse(const int8_t *val, const int16_t *out,
		int16_t *sums, int16_t *paths, int norm)
{
	const int16_t _val[4] = { val[0], val[1], val[2], 0 };

	_sse_metrics_k5_n4(_val, out, sums, paths, norm);
}

__attribute__ ((visibility("hidden")))
void osmo_conv_gen_metrics_k5_n4_sse(const int8_t *val, const int16_t *out,
		int16_t *sums, int16_t *paths, int norm)
{
	const int16_t _val[4] = { val[0], val[1], val[2], val[3] };

	_sse_metrics_k5_n4(_val, out, sums, paths, norm);
}

__attribute__ ((visibility("hidden")))
void osmo_conv_gen_metrics_k7_n2_sse(const int8_t *val, const int16_t *out,
		int16_t *sums, int16_t *paths, int norm)
{
	const int16_t _val[4] = { val[0], val[1], val[0], val[1] };

	_sse_metrics_k7_n2(_val, out, sums, paths, norm);
}

__attribute__ ((visibility("hidden")))
void osmo_conv_gen_metrics_k7_n3_sse(const int8_t *val, const int16_t *out,
		int16_t *sums, int16_t *paths, int norm)
{
	const int16_t _val[4] = { val[0], val[1], val[2], 0 };

	_sse_metrics_k7_n4(_val, out, sums, paths, norm);
}

__attribute__ ((visibility("hidden")))
void osmo_conv_gen_metrics_k7_n4_sse(const int8_t *val, const int16_t *out,
		int16_t *sums, int16_t *paths, int norm)
{
	const int16_t _val[4] = { val[0], val[1], val[2], val[3] };

	_sse_metrics_k7_n4(_val, out, sums, paths, norm);
}
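
/* Hypothetical usage sketch (not part of the library API): allocate the
 * aligned buffers required by the SSE kernels and run a single K = 5,
 * N = 2 metric update. The trellis output table and starting metrics are
 * placeholders (all zeros); a real caller fills them from the generator
 * polynomials and the previous decoding step. The kernels use aligned
 * loads and stores, so the buffers must come from the aligned allocator
 * above.
 */
static inline void example_metrics_k5_n2(const int8_t val[2])
{
	int16_t *out = osmo_conv_vdec_malloc_sse3(16);
	int16_t *sums = osmo_conv_vdec_malloc_sse3(16);
	int16_t *paths = osmo_conv_vdec_malloc_sse3(16);
	int i;

	if (out && sums && paths) {
		for (i = 0; i < 16; i++) {
			out[i] = 0;	/* placeholder trellis outputs */
			sums[i] = 0;	/* zeroed starting path metrics */
		}

		osmo_conv_gen_metrics_k5_n2_sse(val, out, sums, paths, 1);
	}

	if (paths)
		osmo_conv_vdec_free_sse3(paths);
	if (sums)
		osmo_conv_vdec_free_sse3(sums);
	if (out)
		osmo_conv_vdec_free_sse3(out);
}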