/*! \file conv_acc_sse_impl.h
 * Accelerated Viterbi decoder implementation:
 * Actual definitions which are being included
 * from both conv_acc_sse.c and conv_acc_sse_avx.c. */
/*
 * Copyright (C) 2013, 2014 Thomas Tsou <tom@tsou.cc>
 *
 * All Rights Reserved
 *
 * SPDX-License-Identifier: GPL-2.0+
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */

/* Some distributions (notably Alpine Linux) for some strange reason
 * don't have this #define */
#ifndef __always_inline
#define __always_inline inline __attribute__((always_inline))
#endif

extern int sse41_supported;

/* Octo-Viterbi butterfly
 * Compute an 8-wide butterfly generating 16 path decisions and 16
 * accumulated sums. Inputs are all packed 16-bit integers in three 128-bit
 * XMM registers. Two intermediate registers are used and the results are
 * placed in the upper four registers.
 *
 * Input:
 * M0 - Path metrics 0 (packed 16-bit integers)
 * M1 - Path metrics 1 (packed 16-bit integers)
 * M2 - Branch metrics (packed 16-bit integers)
 *
 * Output:
 * M2 - Selected and accumulated path metrics 0
 * M4 - Selected and accumulated path metrics 1
 * M3 - Path selections 0
 * M1 - Path selections 1
 */
#define SSE_BUTTERFLY(M0, M1, M2, M3, M4) \
{ \
	M3 = _mm_adds_epi16(M0, M2); \
	M4 = _mm_subs_epi16(M1, M2); \
	M0 = _mm_subs_epi16(M0, M2); \
	M1 = _mm_adds_epi16(M1, M2); \
	M2 = _mm_max_epi16(M3, M4); \
	M3 = _mm_or_si128(_mm_cmpgt_epi16(M3, M4), _mm_cmpeq_epi16(M3, M4)); \
	M4 = _mm_max_epi16(M0, M1); \
	M1 = _mm_or_si128(_mm_cmpgt_epi16(M0, M1), _mm_cmpeq_epi16(M0, M1)); \
}

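/* For reference, a minimal scalar sketch of the add-compare-select step
 * performed per butterfly (illustrative only, not compiled; ignores the
 * saturation that _mm_adds_epi16/_mm_subs_epi16 provide). SSE_BUTTERFLY
 * evaluates eight of these in parallel, with the selection masks produced
 * exactly as cmpgt|cmpeq yields them (all ones or all zeros).
 */
#if 0
static void acs_butterfly_scalar(int16_t pm0, int16_t pm1, int16_t bm,
		int16_t *new_pm0, int16_t *new_pm1,
		int16_t *sel0, int16_t *sel1)
{
	int16_t a = pm0 + bm;	/* upper branch into survivor 0 */
	int16_t b = pm1 - bm;	/* lower branch into survivor 0 */
	int16_t c = pm0 - bm;	/* upper branch into survivor 1 */
	int16_t d = pm1 + bm;	/* lower branch into survivor 1 */

	*new_pm0 = (a >= b) ? a : b;	/* survivor metric 0 */
	*sel0 = (a >= b) ? -1 : 0;	/* path selection 0 */
	*new_pm1 = (c >= d) ? c : d;	/* survivor metric 1 */
	*sel1 = (c >= d) ? -1 : 0;	/* path selection 1 */
}
#endif
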
/* Two lane deinterleaving K = 5:
 * Take 16 interleaved 16-bit integers and deinterleave to 2 packed 128-bit
 * registers. The operation is summarized below. Four registers are used,
 * with the lower 2 as input and the upper 2 as output.
 *
 * In  - 10101010 10101010 10101010 10101010
 * Out - 00000000 11111111 00000000 11111111
 *
 * Input:
 * M0:1 - Packed 16-bit integers
 *
 * Output:
 * M2:3 - Deinterleaved packed 16-bit integers
 */
#define _I8_SHUFFLE_MASK 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0

#define SSE_DEINTERLEAVE_K5(M0, M1, M2, M3) \
{ \
	M2 = _mm_set_epi8(_I8_SHUFFLE_MASK); \
	M0 = _mm_shuffle_epi8(M0, M2); \
	M1 = _mm_shuffle_epi8(M1, M2); \
	M2 = _mm_unpacklo_epi64(M0, M1); \
	M3 = _mm_unpackhi_epi64(M0, M1); \
}

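/* A minimal scalar sketch of the same even/odd split over one block of 16
 * interleaved metrics (illustrative only, not compiled). The byte shuffle
 * plus 64-bit unpack above is a vectorized form of this loop.
 */
#if 0
static void deinterleave16_scalar(const int16_t *in, int16_t *even, int16_t *odd)
{
	int i;

	for (i = 0; i < 8; i++) {
		even[i] = in[2 * i];		/* lane 0: even indices */
		odd[i] = in[2 * i + 1];		/* lane 1: odd indices */
	}
}
#endif
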
/* Two lane deinterleaving K = 7:
 * Take 64 interleaved 16-bit integers and deinterleave to 8 packed 128-bit
 * registers. The operation is summarized below. 16 registers are used,
 * with the lower 8 as input and the upper 8 as output.
 *
 * In  - 10101010 10101010 10101010 10101010 ...
 * Out - 00000000 11111111 00000000 11111111 ...
 *
 * Input:
 * M0:7 - Packed 16-bit integers
 *
 * Output:
 * M8:15 - Deinterleaved packed 16-bit integers
 */
#define SSE_DEINTERLEAVE_K7(M0, M1, M2, M3, M4, M5, M6, M7, \
		M8, M9, M10, M11, M12, M13, M14, M15) \
{ \
	M8 = _mm_set_epi8(_I8_SHUFFLE_MASK); \
	M0 = _mm_shuffle_epi8(M0, M8); \
	M1 = _mm_shuffle_epi8(M1, M8); \
	M2 = _mm_shuffle_epi8(M2, M8); \
	M3 = _mm_shuffle_epi8(M3, M8); \
	M4 = _mm_shuffle_epi8(M4, M8); \
	M5 = _mm_shuffle_epi8(M5, M8); \
	M6 = _mm_shuffle_epi8(M6, M8); \
	M7 = _mm_shuffle_epi8(M7, M8); \
	M8 = _mm_unpacklo_epi64(M0, M1); \
	M9 = _mm_unpackhi_epi64(M0, M1); \
	M10 = _mm_unpacklo_epi64(M2, M3); \
	M11 = _mm_unpackhi_epi64(M2, M3); \
	M12 = _mm_unpacklo_epi64(M4, M5); \
	M13 = _mm_unpackhi_epi64(M4, M5); \
	M14 = _mm_unpacklo_epi64(M6, M7); \
	M15 = _mm_unpackhi_epi64(M6, M7); \
}

/* Generate branch metrics N = 2:
 * Compute 16 branch metrics from trellis outputs and input values.
 *
 * Input:
 * M0:3 - 16 x 2 packed 16-bit trellis outputs
 * M4   - Expanded and packed 16-bit input value
 *
 * Output:
 * M6:7 - 16 computed 16-bit branch metrics
 */
#define SSE_BRANCH_METRIC_N2(M0, M1, M2, M3, M4, M6, M7) \
{ \
	M0 = _mm_sign_epi16(M4, M0); \
	M1 = _mm_sign_epi16(M4, M1); \
	M2 = _mm_sign_epi16(M4, M2); \
	M3 = _mm_sign_epi16(M4, M3); \
	M6 = _mm_hadds_epi16(M0, M1); \
	M7 = _mm_hadds_epi16(M2, M3); \
}

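/* A minimal scalar sketch of a single rate 1/2 branch metric (illustrative
 * only, not compiled; ignores saturation and the zeroing case of
 * _mm_sign_epi16). The trellis output table encodes the expected symbols
 * by sign, so each metric is the sign-adjusted sum, i.e. the correlation
 * of the two received soft values with the expected output pair; the
 * macro above computes 16 such metrics at once.
 */
#if 0
static int16_t branch_metric_n2_scalar(const int16_t *val, const int16_t *out)
{
	int16_t m0 = (out[0] < 0) ? -val[0] : val[0];
	int16_t m1 = (out[1] < 0) ? -val[1] : val[1];

	return m0 + m1;
}
#endif
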
/* Generate branch metrics N = 4:
 * Compute 8 branch metrics from trellis outputs and input values. This
 * macro is reused for N less than 4 where the extra soft input bits are
 * padded.
 *
 * Input:
 * M0:3 - 8 x 4 packed 16-bit trellis outputs
 * M4   - Expanded and packed 16-bit input value
 *
 * Output:
 * M5   - 8 computed 16-bit branch metrics
 */
#define SSE_BRANCH_METRIC_N4(M0, M1, M2, M3, M4, M5) \
{ \
	M0 = _mm_sign_epi16(M4, M0); \
	M1 = _mm_sign_epi16(M4, M1); \
	M2 = _mm_sign_epi16(M4, M2); \
	M3 = _mm_sign_epi16(M4, M3); \
	M0 = _mm_hadds_epi16(M0, M1); \
	M1 = _mm_hadds_epi16(M2, M3); \
	M5 = _mm_hadds_epi16(M0, M1); \
}

/* Horizontal minimum
 * Compute the horizontal minimum of packed unsigned 16-bit integers and
 * place the result in the low 16-bit element of the source register. Only
 * SSE 4.1 has a dedicated minpos instruction. One intermediate register is
 * used if SSE 4.1 is not available. This is a destructive operation and
 * the source register is overwritten.
 *
 * Input:
 * M0 - Packed unsigned 16-bit integers
 *
 * Output:
 * M0 - Minimum value placed in low 16-bit element
 */
#if defined(HAVE_SSE4_1) || defined(HAVE_SSE41)
#define SSE_MINPOS(M0, M1) \
{ \
	if (sse41_supported) { \
		M0 = _mm_minpos_epu16(M0); \
	} else { \
		M1 = _mm_shuffle_epi32(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
		M0 = _mm_min_epi16(M0, M1); \
		M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
		M0 = _mm_min_epi16(M0, M1); \
		M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 0, 1)); \
		M0 = _mm_min_epi16(M0, M1); \
	} \
}
#else
#define SSE_MINPOS(M0, M1) \
{ \
	M1 = _mm_shuffle_epi32(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
	M0 = _mm_min_epi16(M0, M1); \
	M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
	M0 = _mm_min_epi16(M0, M1); \
	M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 0, 1)); \
	M0 = _mm_min_epi16(M0, M1); \
}
#endif

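/* A minimal scalar sketch of the horizontal reduction (illustrative only,
 * not compiled). The shuffle/min fallback above performs the same
 * reduction in log2(8) = 3 fold steps: halves, then quarters, then
 * neighbouring elements, leaving the minimum in the low 16-bit element.
 */
#if 0
static int16_t hmin8_scalar(const int16_t *v)
{
	int16_t min = v[0];
	int i;

	for (i = 1; i < 8; i++) {
		if (v[i] < min)
			min = v[i];
	}

	return min;
}
#endif
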
/* Normalize state metrics K = 5:
 * Compute 16-wide normalization by subtracting the smallest value from
 * all values. Inputs are 16 packed 16-bit integers across 2 XMM registers.
 * Two intermediate registers are used and the normalized results are
 * placed in the originating locations.
 *
 * Input:
 * M0:1 - Path metrics 0:1 (packed 16-bit integers)
 *
 * Output:
 * M0:1 - Normalized path metrics 0:1
 */
#define SSE_NORMALIZE_K5(M0, M1, M2, M3) \
{ \
	M2 = _mm_min_epi16(M0, M1); \
	SSE_MINPOS(M2, M3) \
	SSE_BROADCAST(M2) \
	M0 = _mm_subs_epi16(M0, M2); \
	M1 = _mm_subs_epi16(M1, M2); \
}

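/* SSE_BROADCAST is expected to be defined by the including translation
 * units (conv_acc_sse.c and conv_acc_sse_avx.c); it replicates the low
 * 16-bit element across the whole register. A minimal scalar sketch of
 * the normalization (illustrative only, not compiled): subtracting the
 * common minimum keeps the accumulated metrics away from saturation while
 * preserving their differences, which is all the path comparisons use.
 */
#if 0
static void normalize_scalar(int16_t *sums, int n)
{
	int16_t min = sums[0];
	int i;

	for (i = 1; i < n; i++) {
		if (sums[i] < min)
			min = sums[i];
	}

	for (i = 0; i < n; i++)
		sums[i] -= min;
}
#endif
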
/* Normalize state metrics K = 7:
 * Compute 64-wide normalization by subtracting the smallest value from
 * all values. Inputs are 8 registers of accumulated sums and 4 temporary
 * registers. Normalized results are returned in the originating locations.
 *
 * Input:
 * M0:7 - Path metrics 0:7 (packed 16-bit integers)
 *
 * Output:
 * M0:7 - Normalized path metrics 0:7
 */
#define SSE_NORMALIZE_K7(M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, M10, M11) \
{ \
	M8 = _mm_min_epi16(M0, M1); \
	M9 = _mm_min_epi16(M2, M3); \
	M10 = _mm_min_epi16(M4, M5); \
	M11 = _mm_min_epi16(M6, M7); \
	M8 = _mm_min_epi16(M8, M9); \
	M10 = _mm_min_epi16(M10, M11); \
	M8 = _mm_min_epi16(M8, M10); \
	SSE_MINPOS(M8, M9) \
	SSE_BROADCAST(M8) \
	M0 = _mm_subs_epi16(M0, M8); \
	M1 = _mm_subs_epi16(M1, M8); \
	M2 = _mm_subs_epi16(M2, M8); \
	M3 = _mm_subs_epi16(M3, M8); \
	M4 = _mm_subs_epi16(M4, M8); \
	M5 = _mm_subs_epi16(M5, M8); \
	M6 = _mm_subs_epi16(M6, M8); \
	M7 = _mm_subs_epi16(M7, M8); \
}

/* Combined BMU/PMU (K=5, N=2)
 * Compute branch metrics followed by path metrics for the half rate
 * 16-state trellis. 8 butterflies are computed. Accumulated path sums are
 * not preserved: they are read from and written back to the same memory
 * location. Sums are normalized if requested.
 */
__always_inline static void _sse_metrics_k5_n2(const int16_t *val,
		const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	__m128i m0, m1, m2, m3, m4, m5, m6;

	/* (BMU) Load input sequence */
	m2 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

	/* (BMU) Load trellis outputs */
	m0 = _mm_load_si128((__m128i *) &out[0]);
	m1 = _mm_load_si128((__m128i *) &out[8]);

	/* (BMU) Compute branch metrics */
	m0 = _mm_sign_epi16(m2, m0);
	m1 = _mm_sign_epi16(m2, m1);
	m2 = _mm_hadds_epi16(m0, m1);

	/* (PMU) Load accumulated path metrics */
	m0 = _mm_load_si128((__m128i *) &sums[0]);
	m1 = _mm_load_si128((__m128i *) &sums[8]);

	SSE_DEINTERLEAVE_K5(m0, m1, m3, m4)

	/* (PMU) Butterflies: 0-7 */
	SSE_BUTTERFLY(m3, m4, m2, m5, m6)

	if (norm)
		SSE_NORMALIZE_K5(m2, m6, m0, m1)

	_mm_store_si128((__m128i *) &sums[0], m2);
	_mm_store_si128((__m128i *) &sums[8], m6);
	_mm_store_si128((__m128i *) &paths[0], m5);
	_mm_store_si128((__m128i *) &paths[8], m4);
}

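/* A hypothetical driver loop (illustrative only, not compiled) showing how
 * a caller might step this kernel across a received soft-bit sequence: one
 * call per symbol period, advancing the input by N = 2 values and the path
 * decision output by the 16 trellis states. The decoder that includes this
 * header provides its own trellis bookkeeping; all names below are
 * placeholders, not part of this file's API.
 */
#if 0
static void run_k5_n2(const int16_t *seq, const int16_t *trellis_out,
		int16_t *sums, int16_t *paths, int len)
{
	int i;

	for (i = 0; i < len; i++)
		_sse_metrics_k5_n2(&seq[2 * i], trellis_out, sums,
				   &paths[16 * i], 1);
}
#endif
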
/* Combined BMU/PMU (K=5, N=3 and N=4)
 * Compute branch metrics followed by path metrics for the 16-state trellis
 * at rates up to 1/4. 8 butterflies are computed. The input sequence is
 * read four 16-bit values at a time, and extra values should be set to
 * zero for rates other than 1/4. Normally only rates 1/3 and 1/4 are used,
 * as there is a dedicated implementation for rate 1/2.
 */
__always_inline static void _sse_metrics_k5_n4(const int16_t *val,
		const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	__m128i m0, m1, m2, m3, m4, m5, m6;

	/* (BMU) Load input sequence */
	m4 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

	/* (BMU) Load trellis outputs */
	m0 = _mm_load_si128((__m128i *) &out[0]);
	m1 = _mm_load_si128((__m128i *) &out[8]);
	m2 = _mm_load_si128((__m128i *) &out[16]);
	m3 = _mm_load_si128((__m128i *) &out[24]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m4, m2)

	/* (PMU) Load accumulated path metrics */
	m0 = _mm_load_si128((__m128i *) &sums[0]);
	m1 = _mm_load_si128((__m128i *) &sums[8]);

	SSE_DEINTERLEAVE_K5(m0, m1, m3, m4)

	/* (PMU) Butterflies: 0-7 */
	SSE_BUTTERFLY(m3, m4, m2, m5, m6)

	if (norm)
		SSE_NORMALIZE_K5(m2, m6, m0, m1)

	_mm_store_si128((__m128i *) &sums[0], m2);
	_mm_store_si128((__m128i *) &sums[8], m6);
	_mm_store_si128((__m128i *) &paths[0], m5);
	_mm_store_si128((__m128i *) &paths[8], m4);
}

/* Combined BMU/PMU (K=7, N=2)
 * Compute branch metrics followed by path metrics for the half rate
 * 64-state trellis. 32 butterfly operations are computed. Deinterleaving
 * the path metrics requires the full SSE register file, so deinterleave
 * the sums before computing branch metrics to avoid register spilling.
 */
__always_inline static void _sse_metrics_k7_n2(const int16_t *val,
		const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	__m128i m0, m1, m2, m3, m4, m5, m6, m7, m8,
		m9, m10, m11, m12, m13, m14, m15;

	/* (PMU) Load accumulated path metrics */
	m0 = _mm_load_si128((__m128i *) &sums[0]);
	m1 = _mm_load_si128((__m128i *) &sums[8]);
	m2 = _mm_load_si128((__m128i *) &sums[16]);
	m3 = _mm_load_si128((__m128i *) &sums[24]);
	m4 = _mm_load_si128((__m128i *) &sums[32]);
	m5 = _mm_load_si128((__m128i *) &sums[40]);
	m6 = _mm_load_si128((__m128i *) &sums[48]);
	m7 = _mm_load_si128((__m128i *) &sums[56]);

	/* (PMU) Deinterleave to even-odd registers */
	SSE_DEINTERLEAVE_K7(m0, m1, m2, m3, m4, m5, m6, m7,
			    m8, m9, m10, m11, m12, m13, m14, m15)

	/* (BMU) Load input symbols */
	m7 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

	/* (BMU) Load trellis outputs */
	m0 = _mm_load_si128((__m128i *) &out[0]);
	m1 = _mm_load_si128((__m128i *) &out[8]);
	m2 = _mm_load_si128((__m128i *) &out[16]);
	m3 = _mm_load_si128((__m128i *) &out[24]);

	SSE_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m4, m5)

	m0 = _mm_load_si128((__m128i *) &out[32]);
	m1 = _mm_load_si128((__m128i *) &out[40]);
	m2 = _mm_load_si128((__m128i *) &out[48]);
	m3 = _mm_load_si128((__m128i *) &out[56]);

	SSE_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m6, m7)

	/* (PMU) Butterflies: 0-15 */
	SSE_BUTTERFLY(m8, m9, m4, m0, m1)
	SSE_BUTTERFLY(m10, m11, m5, m2, m3)

	_mm_store_si128((__m128i *) &paths[0], m0);
	_mm_store_si128((__m128i *) &paths[8], m2);
	_mm_store_si128((__m128i *) &paths[32], m9);
	_mm_store_si128((__m128i *) &paths[40], m11);

	/* (PMU) Butterflies: 16-31 */
	SSE_BUTTERFLY(m12, m13, m6, m0, m2)
	SSE_BUTTERFLY(m14, m15, m7, m9, m11)

	_mm_store_si128((__m128i *) &paths[16], m0);
	_mm_store_si128((__m128i *) &paths[24], m9);
	_mm_store_si128((__m128i *) &paths[48], m13);
	_mm_store_si128((__m128i *) &paths[56], m15);

	if (norm)
		SSE_NORMALIZE_K7(m4, m1, m5, m3, m6, m2,
				 m7, m11, m0, m8, m9, m10)

	_mm_store_si128((__m128i *) &sums[0], m4);
	_mm_store_si128((__m128i *) &sums[8], m5);
	_mm_store_si128((__m128i *) &sums[16], m6);
	_mm_store_si128((__m128i *) &sums[24], m7);
	_mm_store_si128((__m128i *) &sums[32], m1);
	_mm_store_si128((__m128i *) &sums[40], m3);
	_mm_store_si128((__m128i *) &sums[48], m2);
	_mm_store_si128((__m128i *) &sums[56], m11);
}

/* Combined BMU/PMU (K=7, N=3 and N=4)
 * Compute branch metrics followed by path metrics for the 64-state trellis
 * at rates up to 1/4. 32 butterfly operations are computed. Deinterleave
 * the path metrics before computing branch metrics, as in the half rate
 * case.
 */
__always_inline static void _sse_metrics_k7_n4(const int16_t *val,
		const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	__m128i m0, m1, m2, m3, m4, m5, m6, m7;
	__m128i m8, m9, m10, m11, m12, m13, m14, m15;

	/* (PMU) Load accumulated path metrics */
	m0 = _mm_load_si128((__m128i *) &sums[0]);
	m1 = _mm_load_si128((__m128i *) &sums[8]);
	m2 = _mm_load_si128((__m128i *) &sums[16]);
	m3 = _mm_load_si128((__m128i *) &sums[24]);
	m4 = _mm_load_si128((__m128i *) &sums[32]);
	m5 = _mm_load_si128((__m128i *) &sums[40]);
	m6 = _mm_load_si128((__m128i *) &sums[48]);
	m7 = _mm_load_si128((__m128i *) &sums[56]);

	/* (PMU) Deinterleave into even and odd packed registers */
	SSE_DEINTERLEAVE_K7(m0, m1, m2, m3, m4, m5, m6, m7,
			    m8, m9, m10, m11, m12, m13, m14, m15)

	/* (BMU) Load input symbols */
	m7 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

	/* (BMU) Load and compute branch metrics */
	m0 = _mm_load_si128((__m128i *) &out[0]);
	m1 = _mm_load_si128((__m128i *) &out[8]);
	m2 = _mm_load_si128((__m128i *) &out[16]);
	m3 = _mm_load_si128((__m128i *) &out[24]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m4)

	m0 = _mm_load_si128((__m128i *) &out[32]);
	m1 = _mm_load_si128((__m128i *) &out[40]);
	m2 = _mm_load_si128((__m128i *) &out[48]);
	m3 = _mm_load_si128((__m128i *) &out[56]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m5)

	m0 = _mm_load_si128((__m128i *) &out[64]);
	m1 = _mm_load_si128((__m128i *) &out[72]);
	m2 = _mm_load_si128((__m128i *) &out[80]);
	m3 = _mm_load_si128((__m128i *) &out[88]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m6)

	m0 = _mm_load_si128((__m128i *) &out[96]);
	m1 = _mm_load_si128((__m128i *) &out[104]);
	m2 = _mm_load_si128((__m128i *) &out[112]);
	m3 = _mm_load_si128((__m128i *) &out[120]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m7)

	/* (PMU) Butterflies: 0-15 */
	SSE_BUTTERFLY(m8, m9, m4, m0, m1)
	SSE_BUTTERFLY(m10, m11, m5, m2, m3)

	_mm_store_si128((__m128i *) &paths[0], m0);
	_mm_store_si128((__m128i *) &paths[8], m2);
	_mm_store_si128((__m128i *) &paths[32], m9);
	_mm_store_si128((__m128i *) &paths[40], m11);

	/* (PMU) Butterflies: 16-31 */
	SSE_BUTTERFLY(m12, m13, m6, m0, m2)
	SSE_BUTTERFLY(m14, m15, m7, m9, m11)

	_mm_store_si128((__m128i *) &paths[16], m0);
	_mm_store_si128((__m128i *) &paths[24], m9);
	_mm_store_si128((__m128i *) &paths[48], m13);
	_mm_store_si128((__m128i *) &paths[56], m15);

	if (norm)
		SSE_NORMALIZE_K7(m4, m1, m5, m3, m6, m2,
				 m7, m11, m0, m8, m9, m10)

	_mm_store_si128((__m128i *) &sums[0], m4);
	_mm_store_si128((__m128i *) &sums[8], m5);
	_mm_store_si128((__m128i *) &sums[16], m6);
	_mm_store_si128((__m128i *) &sums[24], m7);
	_mm_store_si128((__m128i *) &sums[32], m1);
	_mm_store_si128((__m128i *) &sums[40], m3);
	_mm_store_si128((__m128i *) &sums[48], m2);
	_mm_store_si128((__m128i *) &sums[56], m11);
}