/*! \file conv_acc_sse_impl.h
 * Accelerated Viterbi decoder implementation:
 * Actual definitions which are being included
 * from both conv_acc_sse.c and conv_acc_sse_avx.c. */
/*
 * Copyright (C) 2013, 2014 Thomas Tsou <tom@tsou.cc>
 *
 * All Rights Reserved
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */

/* Some distributions (notably Alpine Linux) for some strange reason
 * don't have this #define */
#ifndef __always_inline
#define __always_inline inline __attribute__((always_inline))
#endif

extern int sse41_supported;

/* Octo-Viterbi butterfly
 * Compute an 8-wide butterfly generating 16 path decisions and 16 accumulated
 * sums. Inputs are packed 16-bit integers in three 128-bit XMM registers.
 * Two intermediate registers are used and results are set in the upper 4
 * registers.
 *
 * Input:
 * M0 - Path metrics 0 (packed 16-bit integers)
 * M1 - Path metrics 1 (packed 16-bit integers)
 * M2 - Branch metrics (packed 16-bit integers)
 *
 * Output:
 * M2 - Selected and accumulated path metrics 0
 * M4 - Selected and accumulated path metrics 1
 * M3 - Path selections 0
 * M1 - Path selections 1
 */
#define SSE_BUTTERFLY(M0, M1, M2, M3, M4) \
{ \
	M3 = _mm_adds_epi16(M0, M2); \
	M4 = _mm_subs_epi16(M1, M2); \
	M0 = _mm_subs_epi16(M0, M2); \
	M1 = _mm_adds_epi16(M1, M2); \
	M2 = _mm_max_epi16(M3, M4); \
	M3 = _mm_or_si128(_mm_cmpgt_epi16(M3, M4), _mm_cmpeq_epi16(M3, M4)); \
	M4 = _mm_max_epi16(M0, M1); \
	M1 = _mm_or_si128(_mm_cmpgt_epi16(M0, M1), _mm_cmpeq_epi16(M0, M1)); \
}
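
/* For illustration only: a scalar sketch of one of the two add-compare-select
 * decisions inside a single butterfly; SSE_BUTTERFLY performs 16 of these at
 * once across two result registers. The helper and its name are not part of
 * the decoder, and the saturating SSE arithmetic is emulated with explicit
 * clamping. */
static __always_inline int16_t _acs_scalar_sketch(int16_t m0, int16_t m1,
	int16_t bm, int16_t *decision)
{
	int32_t s0 = (int32_t) m0 + bm;	/* candidate metric via path 0 */
	int32_t s1 = (int32_t) m1 - bm;	/* candidate metric via path 1 */

	/* Clamp to the 16-bit range, as _mm_adds/_mm_subs_epi16 would */
	s0 = s0 > 32767 ? 32767 : (s0 < -32768 ? -32768 : s0);
	s1 = s1 > 32767 ? 32767 : (s1 < -32768 ? -32768 : s1);

	/* Keep the larger metric; record an all-ones mask like the SSE code */
	*decision = (s0 >= s1) ? -1 : 0;
	return (int16_t) (s0 >= s1 ? s0 : s1);
}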

/* Two lane deinterleaving K = 5:
 * Take 16 interleaved 16-bit integers and deinterleave to 2 packed 128-bit
 * registers. The operation is summarized below. Four registers are used with
 * the lower 2 as input and upper 2 as output.
 *
 * In  - 10101010 10101010 10101010 10101010
 * Out - 00000000 11111111 00000000 11111111
 *
 * Input:
 * M0:1 - Packed 16-bit integers
 *
 * Output:
 * M2:3 - Deinterleaved packed 16-bit integers
 */
#define _I8_SHUFFLE_MASK 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0

#define SSE_DEINTERLEAVE_K5(M0, M1, M2, M3) \
{ \
	M2 = _mm_set_epi8(_I8_SHUFFLE_MASK); \
	M0 = _mm_shuffle_epi8(M0, M2); \
	M1 = _mm_shuffle_epi8(M1, M2); \
	M2 = _mm_unpacklo_epi64(M0, M1); \
	M3 = _mm_unpackhi_epi64(M0, M1); \
}
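
/* For illustration only: a scalar equivalent of the two-lane deinterleave,
 * splitting 16 interleaved path metrics into their even-indexed and
 * odd-indexed elements. The helper and its name are illustrative, not part
 * of the decoder. */
static __always_inline void _deinterleave16_scalar_sketch(const int16_t *in,
	int16_t *even, int16_t *odd)
{
	int i;

	for (i = 0; i < 8; i++) {
		even[i] = in[2 * i];	/* ends up in the low output register */
		odd[i] = in[2 * i + 1];	/* ends up in the high output register */
	}
}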

/* Two lane deinterleaving K = 7:
 * Take 64 interleaved 16-bit integers and deinterleave to 8 packed 128-bit
 * registers. The operation is summarized below. 16 registers are used with
 * the lower 8 as input and upper 8 as output.
 *
 * In  - 10101010 10101010 10101010 10101010 ...
 * Out - 00000000 11111111 00000000 11111111 ...
 *
 * Input:
 * M0:7 - Packed 16-bit integers
 *
 * Output:
 * M8:15 - Deinterleaved packed 16-bit integers
 */
#define SSE_DEINTERLEAVE_K7(M0, M1, M2, M3, M4, M5, M6, M7, \
		M8, M9, M10, M11, M12, M13, M14, M15) \
{ \
	M8 = _mm_set_epi8(_I8_SHUFFLE_MASK); \
	M0 = _mm_shuffle_epi8(M0, M8); \
	M1 = _mm_shuffle_epi8(M1, M8); \
	M2 = _mm_shuffle_epi8(M2, M8); \
	M3 = _mm_shuffle_epi8(M3, M8); \
	M4 = _mm_shuffle_epi8(M4, M8); \
	M5 = _mm_shuffle_epi8(M5, M8); \
	M6 = _mm_shuffle_epi8(M6, M8); \
	M7 = _mm_shuffle_epi8(M7, M8); \
	M8 = _mm_unpacklo_epi64(M0, M1); \
	M9 = _mm_unpackhi_epi64(M0, M1); \
	M10 = _mm_unpacklo_epi64(M2, M3); \
	M11 = _mm_unpackhi_epi64(M2, M3); \
	M12 = _mm_unpacklo_epi64(M4, M5); \
	M13 = _mm_unpackhi_epi64(M4, M5); \
	M14 = _mm_unpacklo_epi64(M6, M7); \
	M15 = _mm_unpackhi_epi64(M6, M7); \
}

/* Generate branch metrics N = 2:
 * Compute 16 branch metrics from trellis outputs and input values.
 *
 * Input:
 * M0:3 - 16 x 2 packed 16-bit trellis outputs
 * M4 - Expanded and packed 16-bit input value
 *
 * Output:
 * M6:7 - 16 computed 16-bit branch metrics
 */
#define SSE_BRANCH_METRIC_N2(M0, M1, M2, M3, M4, M6, M7) \
{ \
	M0 = _mm_sign_epi16(M4, M0); \
	M1 = _mm_sign_epi16(M4, M1); \
	M2 = _mm_sign_epi16(M4, M2); \
	M3 = _mm_sign_epi16(M4, M3); \
	M6 = _mm_hadds_epi16(M0, M1); \
	M7 = _mm_hadds_epi16(M2, M3); \
}
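
/* For illustration only: a scalar view of a single branch metric. The SSE
 * code correlates the received soft values against the expected trellis
 * output signs (_mm_sign_epi16) and reduces pairs with a saturating
 * horizontal add (_mm_hadds_epi16). Helper name and signature are
 * illustrative only. */
static __always_inline int16_t _branch_metric_scalar_sketch(const int16_t *val,
	const int16_t *out, int num)
{
	int32_t acc = 0;
	int i;

	for (i = 0; i < num; i++) {
		if (out[i] > 0)
			acc += val[i];
		else if (out[i] < 0)
			acc -= val[i];
		/* out[i] == 0 contributes nothing, as with _mm_sign_epi16 */
	}

	/* Saturate to the 16-bit range as _mm_hadds_epi16 would */
	if (acc > 32767)
		acc = 32767;
	else if (acc < -32768)
		acc = -32768;

	return (int16_t) acc;
}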

/* Generate branch metrics N = 4:
 * Compute 8 branch metrics from trellis outputs and input values. This
 * macro is reused for N less than 4 where the extra soft input bits are
 * padded.
 *
 * Input:
 * M0:3 - 8 x 4 packed 16-bit trellis outputs
 * M4 - Expanded and packed 16-bit input value
 *
 * Output:
 * M5 - 8 computed 16-bit branch metrics
 */
#define SSE_BRANCH_METRIC_N4(M0, M1, M2, M3, M4, M5) \
{ \
	M0 = _mm_sign_epi16(M4, M0); \
	M1 = _mm_sign_epi16(M4, M1); \
	M2 = _mm_sign_epi16(M4, M2); \
	M3 = _mm_sign_epi16(M4, M3); \
	M0 = _mm_hadds_epi16(M0, M1); \
	M1 = _mm_hadds_epi16(M2, M3); \
	M5 = _mm_hadds_epi16(M0, M1); \
}

/* Horizontal minimum
 * Compute horizontal minimum of packed unsigned 16-bit integers and place
 * the result in the low 16-bit element of the source register. Only SSE 4.1
 * has a dedicated minpos instruction. One intermediate register is used
 * if SSE 4.1 is not available. This is a destructive operation and the
 * source register is overwritten.
 *
 * Input:
 * M0 - Packed unsigned 16-bit integers
 *
 * Output:
 * M0 - Minimum value placed in low 16-bit element
 */
#if defined(HAVE_SSE4_1) || defined(HAVE_SSE41)
#define SSE_MINPOS(M0, M1) \
{ \
	if (sse41_supported) { \
		M0 = _mm_minpos_epu16(M0); \
	} else { \
		M1 = _mm_shuffle_epi32(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
		M0 = _mm_min_epi16(M0, M1); \
		M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
		M0 = _mm_min_epi16(M0, M1); \
		M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 0, 1)); \
		M0 = _mm_min_epi16(M0, M1); \
	} \
}
#else
#define SSE_MINPOS(M0, M1) \
{ \
	M1 = _mm_shuffle_epi32(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
	M0 = _mm_min_epi16(M0, M1); \
	M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
	M0 = _mm_min_epi16(M0, M1); \
	M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 0, 1)); \
	M0 = _mm_min_epi16(M0, M1); \
}
#endif
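
/* For illustration only: the reduction SSE_MINPOS performs over the eight
 * 16-bit lanes of a register, written as a scalar stride-halving loop
 * (8 -> 4 -> 2 -> 1 candidates), mirroring the three shuffle/min pairs of
 * the non-SSE4.1 path. Helper name is illustrative only. */
static __always_inline int16_t _hmin8_scalar_sketch(const int16_t *v)
{
	int16_t a[8];
	int step, i;

	for (i = 0; i < 8; i++)
		a[i] = v[i];

	for (step = 4; step >= 1; step /= 2) {
		for (i = 0; i < step; i++) {
			if (a[i + step] < a[i])
				a[i] = a[i + step];
		}
	}

	return a[0];
}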

/* Normalize state metrics K = 5:
 * Compute 16-wide normalization by subtracting the smallest value from
 * all values, which keeps the accumulated metrics within the 16-bit range.
 * Inputs are 16 packed 16-bit integers across 2 XMM registers.
 * Two intermediate registers are used and normalized results are placed
 * in the originating locations.
 *
 * Input:
 * M0:1 - Path metrics 0:1 (packed 16-bit integers)
 *
 * Output:
 * M0:1 - Normalized path metrics 0:1
 */
#define SSE_NORMALIZE_K5(M0, M1, M2, M3) \
{ \
	M2 = _mm_min_epi16(M0, M1); \
	SSE_MINPOS(M2, M3) \
	SSE_BROADCAST(M2) \
	M0 = _mm_subs_epi16(M0, M2); \
	M1 = _mm_subs_epi16(M1, M2); \
}
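
/* For illustration only: the scalar equivalent of state metric
 * normalization, i.e. finding the running minimum and subtracting it from
 * every accumulated sum so the saturating 16-bit accumulators do not end up
 * pinned at their limits. Helper name is illustrative only. */
static __always_inline void _normalize_scalar_sketch(int16_t *sums, int num)
{
	int16_t min = sums[0];
	int i;

	for (i = 1; i < num; i++) {
		if (sums[i] < min)
			min = sums[i];
	}

	for (i = 0; i < num; i++)
		sums[i] -= min;
}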

/* Normalize state metrics K = 7:
 * Compute 64-wide normalization by subtracting the smallest value from
 * all values. Inputs are 8 registers of accumulated sums and 4 temporary
 * registers. Normalized results are returned in the originating locations.
 *
 * Input:
 * M0:7 - Path metrics 0:7 (packed 16-bit integers)
 *
 * Output:
 * M0:7 - Normalized path metrics 0:7
 */
#define SSE_NORMALIZE_K7(M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, M10, M11) \
{ \
	M8 = _mm_min_epi16(M0, M1); \
	M9 = _mm_min_epi16(M2, M3); \
	M10 = _mm_min_epi16(M4, M5); \
	M11 = _mm_min_epi16(M6, M7); \
	M8 = _mm_min_epi16(M8, M9); \
	M10 = _mm_min_epi16(M10, M11); \
	M8 = _mm_min_epi16(M8, M10); \
	SSE_MINPOS(M8, M9) \
	SSE_BROADCAST(M8) \
	M0 = _mm_subs_epi16(M0, M8); \
	M1 = _mm_subs_epi16(M1, M8); \
	M2 = _mm_subs_epi16(M2, M8); \
	M3 = _mm_subs_epi16(M3, M8); \
	M4 = _mm_subs_epi16(M4, M8); \
	M5 = _mm_subs_epi16(M5, M8); \
	M6 = _mm_subs_epi16(M6, M8); \
	M7 = _mm_subs_epi16(M7, M8); \
}

/* Combined BMU/PMU (K=5, N=2)
 * Compute branch metrics followed by path metrics for the half rate 16-state
 * trellis. 8 butterflies are computed. Accumulated path sums are not
 * preserved: they are read from and written back to the same memory
 * location. Sums are normalized if requested.
 */
__always_inline static void _sse_metrics_k5_n2(const int16_t *val,
	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	__m128i m0, m1, m2, m3, m4, m5, m6;

	/* (BMU) Load input sequence */
	m2 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

	/* (BMU) Load trellis outputs */
	m0 = _mm_load_si128((__m128i *) &out[0]);
	m1 = _mm_load_si128((__m128i *) &out[8]);

	/* (BMU) Compute branch metrics */
	m0 = _mm_sign_epi16(m2, m0);
	m1 = _mm_sign_epi16(m2, m1);
	m2 = _mm_hadds_epi16(m0, m1);

	/* (PMU) Load accumulated path metrics */
	m0 = _mm_load_si128((__m128i *) &sums[0]);
	m1 = _mm_load_si128((__m128i *) &sums[8]);

	SSE_DEINTERLEAVE_K5(m0, m1, m3, m4)

	/* (PMU) Butterflies: 0-7 */
	SSE_BUTTERFLY(m3, m4, m2, m5, m6)

	if (norm)
		SSE_NORMALIZE_K5(m2, m6, m0, m1)

	_mm_store_si128((__m128i *) &sums[0], m2);
	_mm_store_si128((__m128i *) &sums[8], m6);
	_mm_store_si128((__m128i *) &paths[0], m5);
	_mm_store_si128((__m128i *) &paths[8], m4);
}
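
/* For illustration only: a hypothetical single-step wrapper around the
 * K=5, N=2 kernel above. The kernel expects the two soft input symbols
 * expanded into 16-bit values (the loaddup above reads 4 of them), the 16
 * accumulated path metrics in 'sums' and room for 16 decisions in 'paths';
 * 'out', 'sums' and 'paths' must be 16-byte aligned because the kernel uses
 * aligned loads and stores. The wrapper name and the int8_t input type are
 * assumptions of this sketch, not necessarily how the real callers in
 * conv_acc_sse.c are shaped. */
static __always_inline void _example_step_k5_n2(const int8_t *sym,
	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	/* Expand and duplicate the N=2 soft symbols to 16-bit lanes */
	const int16_t val[4] = { sym[0], sym[1], sym[0], sym[1] };

	_sse_metrics_k5_n2(val, out, sums, paths, norm);
}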

/* Combined BMU/PMU (K=5, N=3 and N=4)
 * Compute branch metrics followed by path metrics for the 16-state trellis
 * and rates up to 1/4. 8 butterflies are computed. The input sequence is
 * read four 16-bit values at a time, and the extra values should be set to
 * zero for rates other than 1/4. Normally only rates 1/3 and 1/4 are used,
 * as there is a dedicated implementation for rate 1/2.
 */
__always_inline static void _sse_metrics_k5_n4(const int16_t *val,
	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	__m128i m0, m1, m2, m3, m4, m5, m6;

	/* (BMU) Load input sequence */
	m4 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

	/* (BMU) Load trellis outputs */
	m0 = _mm_load_si128((__m128i *) &out[0]);
	m1 = _mm_load_si128((__m128i *) &out[8]);
	m2 = _mm_load_si128((__m128i *) &out[16]);
	m3 = _mm_load_si128((__m128i *) &out[24]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m4, m2)

	/* (PMU) Load accumulated path metrics */
	m0 = _mm_load_si128((__m128i *) &sums[0]);
	m1 = _mm_load_si128((__m128i *) &sums[8]);

	SSE_DEINTERLEAVE_K5(m0, m1, m3, m4)

	/* (PMU) Butterflies: 0-7 */
	SSE_BUTTERFLY(m3, m4, m2, m5, m6)

	if (norm)
		SSE_NORMALIZE_K5(m2, m6, m0, m1)

	_mm_store_si128((__m128i *) &sums[0], m2);
	_mm_store_si128((__m128i *) &sums[8], m6);
	_mm_store_si128((__m128i *) &paths[0], m5);
	_mm_store_si128((__m128i *) &paths[8], m4);
}

/* Combined BMU/PMU (K=7, N=2)
 * Compute branch metrics followed by path metrics for the half rate 64-state
 * trellis. 32 butterfly operations are computed. Deinterleaving the path
 * metrics requires use of the full SSE register file, so the sums are
 * separated before computing branch metrics to avoid register spilling.
 */
__always_inline static void _sse_metrics_k7_n2(const int16_t *val,
	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	__m128i m0, m1, m2, m3, m4, m5, m6, m7, m8,
		m9, m10, m11, m12, m13, m14, m15;

	/* (PMU) Load accumulated path metrics */
	m0 = _mm_load_si128((__m128i *) &sums[0]);
	m1 = _mm_load_si128((__m128i *) &sums[8]);
	m2 = _mm_load_si128((__m128i *) &sums[16]);
	m3 = _mm_load_si128((__m128i *) &sums[24]);
	m4 = _mm_load_si128((__m128i *) &sums[32]);
	m5 = _mm_load_si128((__m128i *) &sums[40]);
	m6 = _mm_load_si128((__m128i *) &sums[48]);
	m7 = _mm_load_si128((__m128i *) &sums[56]);

	/* (PMU) Deinterleave to even-odd registers */
	SSE_DEINTERLEAVE_K7(m0, m1, m2, m3, m4, m5, m6, m7,
			    m8, m9, m10, m11, m12, m13, m14, m15)

	/* (BMU) Load input symbols */
	m7 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

	/* (BMU) Load trellis outputs */
	m0 = _mm_load_si128((__m128i *) &out[0]);
	m1 = _mm_load_si128((__m128i *) &out[8]);
	m2 = _mm_load_si128((__m128i *) &out[16]);
	m3 = _mm_load_si128((__m128i *) &out[24]);

	SSE_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m4, m5)

	m0 = _mm_load_si128((__m128i *) &out[32]);
	m1 = _mm_load_si128((__m128i *) &out[40]);
	m2 = _mm_load_si128((__m128i *) &out[48]);
	m3 = _mm_load_si128((__m128i *) &out[56]);

	SSE_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m6, m7)

	/* (PMU) Butterflies: 0-15 */
	SSE_BUTTERFLY(m8, m9, m4, m0, m1)
	SSE_BUTTERFLY(m10, m11, m5, m2, m3)

	_mm_store_si128((__m128i *) &paths[0], m0);
	_mm_store_si128((__m128i *) &paths[8], m2);
	_mm_store_si128((__m128i *) &paths[32], m9);
	_mm_store_si128((__m128i *) &paths[40], m11);

	/* (PMU) Butterflies: 16-31 */
	SSE_BUTTERFLY(m12, m13, m6, m0, m2)
	SSE_BUTTERFLY(m14, m15, m7, m9, m11)

	_mm_store_si128((__m128i *) &paths[16], m0);
	_mm_store_si128((__m128i *) &paths[24], m9);
	_mm_store_si128((__m128i *) &paths[48], m13);
	_mm_store_si128((__m128i *) &paths[56], m15);

	if (norm)
		SSE_NORMALIZE_K7(m4, m1, m5, m3, m6, m2,
				 m7, m11, m0, m8, m9, m10)

	_mm_store_si128((__m128i *) &sums[0], m4);
	_mm_store_si128((__m128i *) &sums[8], m5);
	_mm_store_si128((__m128i *) &sums[16], m6);
	_mm_store_si128((__m128i *) &sums[24], m7);
	_mm_store_si128((__m128i *) &sums[32], m1);
	_mm_store_si128((__m128i *) &sums[40], m3);
	_mm_store_si128((__m128i *) &sums[48], m2);
	_mm_store_si128((__m128i *) &sums[56], m11);
}

/* Combined BMU/PMU (K=7, N=3 and N=4)
 * Compute branch metrics followed by path metrics for the 64-state trellis
 * and rates up to 1/4. 32 butterfly operations are computed. The path
 * metrics are deinterleaved before computing branch metrics, as in the half
 * rate case.
 */
__always_inline static void _sse_metrics_k7_n4(const int16_t *val,
	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	__m128i m0, m1, m2, m3, m4, m5, m6, m7;
	__m128i m8, m9, m10, m11, m12, m13, m14, m15;

	/* (PMU) Load accumulated path metrics */
	m0 = _mm_load_si128((__m128i *) &sums[0]);
	m1 = _mm_load_si128((__m128i *) &sums[8]);
	m2 = _mm_load_si128((__m128i *) &sums[16]);
	m3 = _mm_load_si128((__m128i *) &sums[24]);
	m4 = _mm_load_si128((__m128i *) &sums[32]);
	m5 = _mm_load_si128((__m128i *) &sums[40]);
	m6 = _mm_load_si128((__m128i *) &sums[48]);
	m7 = _mm_load_si128((__m128i *) &sums[56]);

	/* (PMU) Deinterleave into even and odd packed registers */
	SSE_DEINTERLEAVE_K7(m0, m1, m2, m3, m4, m5, m6, m7,
			    m8, m9, m10, m11, m12, m13, m14, m15)

	/* (BMU) Load input symbols (already expanded to 16 bits) */
	m7 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

	/* (BMU) Load and compute branch metrics */
	m0 = _mm_load_si128((__m128i *) &out[0]);
	m1 = _mm_load_si128((__m128i *) &out[8]);
	m2 = _mm_load_si128((__m128i *) &out[16]);
	m3 = _mm_load_si128((__m128i *) &out[24]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m4)

	m0 = _mm_load_si128((__m128i *) &out[32]);
	m1 = _mm_load_si128((__m128i *) &out[40]);
	m2 = _mm_load_si128((__m128i *) &out[48]);
	m3 = _mm_load_si128((__m128i *) &out[56]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m5)

	m0 = _mm_load_si128((__m128i *) &out[64]);
	m1 = _mm_load_si128((__m128i *) &out[72]);
	m2 = _mm_load_si128((__m128i *) &out[80]);
	m3 = _mm_load_si128((__m128i *) &out[88]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m6)

	m0 = _mm_load_si128((__m128i *) &out[96]);
	m1 = _mm_load_si128((__m128i *) &out[104]);
	m2 = _mm_load_si128((__m128i *) &out[112]);
	m3 = _mm_load_si128((__m128i *) &out[120]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m7)

	/* (PMU) Butterflies: 0-15 */
	SSE_BUTTERFLY(m8, m9, m4, m0, m1)
	SSE_BUTTERFLY(m10, m11, m5, m2, m3)

	_mm_store_si128((__m128i *) &paths[0], m0);
	_mm_store_si128((__m128i *) &paths[8], m2);
	_mm_store_si128((__m128i *) &paths[32], m9);
	_mm_store_si128((__m128i *) &paths[40], m11);

	/* (PMU) Butterflies: 16-31 */
	SSE_BUTTERFLY(m12, m13, m6, m0, m2)
	SSE_BUTTERFLY(m14, m15, m7, m9, m11)

	_mm_store_si128((__m128i *) &paths[16], m0);
	_mm_store_si128((__m128i *) &paths[24], m9);
	_mm_store_si128((__m128i *) &paths[48], m13);
	_mm_store_si128((__m128i *) &paths[56], m15);

	if (norm)
		SSE_NORMALIZE_K7(m4, m1, m5, m3, m6, m2,
				 m7, m11, m0, m8, m9, m10)

	_mm_store_si128((__m128i *) &sums[0], m4);
	_mm_store_si128((__m128i *) &sums[8], m5);
	_mm_store_si128((__m128i *) &sums[16], m6);
	_mm_store_si128((__m128i *) &sums[24], m7);
	_mm_store_si128((__m128i *) &sums[32], m1);
	_mm_store_si128((__m128i *) &sums[40], m3);
	_mm_store_si128((__m128i *) &sums[48], m2);
	_mm_store_si128((__m128i *) &sums[56], m11);
}
503}