/* Origin: src/conv_acc_sse_impl.h - libosmocore
 *
 * blob: 807dbe5ea17c5e7c1d908c3f571407fc0426edc0 */

/*! \file conv_acc_sse_impl.h
 * Accelerated Viterbi decoder implementation:
 * Actual definitions which are being included
 * from both conv_acc_sse.c and conv_acc_sse_avx.c. */
/*
 * Copyright (C) 2013, 2014 Thomas Tsou <tom@tsou.cc>
 *
 * All Rights Reserved
 *
 * SPDX-License-Identifier: GPL-2.0+
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */
				22
/* Some distributions (notably Alpine Linux) do not provide the
 * __always_inline convenience macro, so supply a fallback here. */
#if !defined(__always_inline)
#define __always_inline inline __attribute__((always_inline))
#endif

/* Flag read by SSE_MINPOS (when built with HAVE_SSE4_1 / HAVE_SSE41) to
 * choose between the SSE4.1 instruction and the generic fallback.
 * Only declared here. NOTE(review): presumably defined and initialised by
 * CPU feature detection elsewhere in the library - confirm. */
extern int sse41_supported;
				30
				31	/* Octo-Viterbi butterfly
				32	* Compute 8-wide butterfly generating 16 path decisions and 16 accumulated
				33	* sums. Inputs all packed 16-bit integers in three 128-bit XMM registers.
				34	* Two intermediate registers are used and results are set in the upper 4
				35	* registers.
				36	*
				37	* Input:
				38	* M0 - Path metrics 0 (packed 16-bit integers)
				39	* M1 - Path metrics 1 (packed 16-bit integers)
				40	* M2 - Branch metrics (packed 16-bit integers)
				41	*
				42	* Output:
				43	* M2 - Selected and accumulated path metrics 0
				44	* M4 - Selected and accumulated path metrics 1
				45	* M3 - Path selections 0
				46	* M1 - Path selections 1
				47	*/
				48	#define SSE_BUTTERFLY(M0, M1, M2, M3, M4) \
				49	{ \
				50	M3 = _mm_adds_epi16(M0, M2); \
				51	M4 = _mm_subs_epi16(M1, M2); \
				52	M0 = _mm_subs_epi16(M0, M2); \
				53	M1 = _mm_adds_epi16(M1, M2); \
				54	M2 = _mm_max_epi16(M3, M4); \
				55	M3 = _mm_or_si128(_mm_cmpgt_epi16(M3, M4), _mm_cmpeq_epi16(M3, M4)); \
				56	M4 = _mm_max_epi16(M0, M1); \
				57	M1 = _mm_or_si128(_mm_cmpgt_epi16(M0, M1), _mm_cmpeq_epi16(M0, M1)); \
				58	}
				59
				60	/* Two lane deinterleaving K = 5:
				61	* Take 16 interleaved 16-bit integers and deinterleave to 2 packed 128-bit
				62	* registers. The operation summarized below. Four registers are used with
				63	* the lower 2 as input and upper 2 as output.
				64	*
				65	* In - 10101010 10101010 10101010 10101010
				66	* Out - 00000000 11111111 00000000 11111111
				67	*
				68	* Input:
				69	* M0:1 - Packed 16-bit integers
				70	*
				71	* Output:
				72	* M2:3 - Deinterleaved packed 16-bit integers
				73	*/
				74	#define _I8_SHUFFLE_MASK 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0
				75
				76	#define SSE_DEINTERLEAVE_K5(M0, M1, M2, M3) \
				77	{ \
				78	M2 = _mm_set_epi8(_I8_SHUFFLE_MASK); \
				79	M0 = _mm_shuffle_epi8(M0, M2); \
				80	M1 = _mm_shuffle_epi8(M1, M2); \
				81	M2 = _mm_unpacklo_epi64(M0, M1); \
				82	M3 = _mm_unpackhi_epi64(M0, M1); \
				83	}
				84
				85	/* Two lane deinterleaving K = 7:
				86	* Take 64 interleaved 16-bit integers and deinterleave to 8 packed 128-bit
				87	* registers. The operation summarized below. 16 registers are used with the
				88	* lower 8 as input and upper 8 as output.
				89	*
				90	* In - 10101010 10101010 10101010 10101010 ...
				91	* Out - 00000000 11111111 00000000 11111111 ...
				92	*
				93	* Input:
				94	* M0:7 - Packed 16-bit integers
				95	*
				96	* Output:
				97	* M8:15 - Deinterleaved packed 16-bit integers
				98	*/
				99	#define SSE_DEINTERLEAVE_K7(M0, M1, M2, M3, M4, M5, M6, M7, \
				100	M8, M9, M10, M11, M12, M13, M14, M15) \
				101	{ \
				102	M8 = _mm_set_epi8(_I8_SHUFFLE_MASK); \
				103	M0 = _mm_shuffle_epi8(M0, M8); \
				104	M1 = _mm_shuffle_epi8(M1, M8); \
				105	M2 = _mm_shuffle_epi8(M2, M8); \
				106	M3 = _mm_shuffle_epi8(M3, M8); \
				107	M4 = _mm_shuffle_epi8(M4, M8); \
				108	M5 = _mm_shuffle_epi8(M5, M8); \
				109	M6 = _mm_shuffle_epi8(M6, M8); \
				110	M7 = _mm_shuffle_epi8(M7, M8); \
				111	M8 = _mm_unpacklo_epi64(M0, M1); \
				112	M9 = _mm_unpackhi_epi64(M0, M1); \
				113	M10 = _mm_unpacklo_epi64(M2, M3); \
				114	M11 = _mm_unpackhi_epi64(M2, M3); \
				115	M12 = _mm_unpacklo_epi64(M4, M5); \
				116	M13 = _mm_unpackhi_epi64(M4, M5); \
				117	M14 = _mm_unpacklo_epi64(M6, M7); \
				118	M15 = _mm_unpackhi_epi64(M6, M7); \
				119	}
				120
				121	/* Generate branch metrics N = 2:
				122	* Compute 16 branch metrics from trellis outputs and input values.
				123	*
				124	* Input:
				125	* M0:3 - 16 x 2 packed 16-bit trellis outputs
				126	* M4 - Expanded and packed 16-bit input value
				127	*
				128	* Output:
				129	* M6:7 - 16 computed 16-bit branch metrics
				130	*/
				131	#define SSE_BRANCH_METRIC_N2(M0, M1, M2, M3, M4, M6, M7) \
				132	{ \
				133	M0 = _mm_sign_epi16(M4, M0); \
				134	M1 = _mm_sign_epi16(M4, M1); \
				135	M2 = _mm_sign_epi16(M4, M2); \
				136	M3 = _mm_sign_epi16(M4, M3); \
				137	M6 = _mm_hadds_epi16(M0, M1); \
				138	M7 = _mm_hadds_epi16(M2, M3); \
				139	}
				140
				141	/* Generate branch metrics N = 4:
				142	* Compute 8 branch metrics from trellis outputs and input values. This
				143	* macro is reused for N less than 4 where the extra soft input bits are
				144	* padded.
				145	*
				146	* Input:
				147	* M0:3 - 8 x 4 packed 16-bit trellis outputs
				148	* M4 - Expanded and packed 16-bit input value
				149	*
				150	* Output:
				151	* M5 - 8 computed 16-bit branch metrics
				152	*/
				153	#define SSE_BRANCH_METRIC_N4(M0, M1, M2, M3, M4, M5) \
				154	{ \
				155	M0 = _mm_sign_epi16(M4, M0); \
				156	M1 = _mm_sign_epi16(M4, M1); \
				157	M2 = _mm_sign_epi16(M4, M2); \
				158	M3 = _mm_sign_epi16(M4, M3); \
				159	M0 = _mm_hadds_epi16(M0, M1); \
				160	M1 = _mm_hadds_epi16(M2, M3); \
				161	M5 = _mm_hadds_epi16(M0, M1); \
				162	}
				163
				164	/* Horizontal minimum
				165	* Compute horizontal minimum of packed unsigned 16-bit integers and place
				166	* result in the low 16-bit element of the source register. Only SSE 4.1
				167	* has a dedicated minpos instruction. One intermediate register is used
				168	* if SSE 4.1 is not available. This is a destructive operation and the
				169	* source register is overwritten.
				170	*
				171	* Input:
				172	* M0 - Packed unsigned 16-bit integers
				173	*
				174	* Output:
				175	* M0 - Minimum value placed in low 16-bit element
				176	*/
				177	#if defined(HAVE_SSE4_1) \|\| defined(HAVE_SSE41)
				178	#define SSE_MINPOS(M0, M1) \
				179	{ \
				180	if (sse41_supported) { \
				181	M0 = _mm_minpos_epu16(M0); \
				182	} else { \
				183	M1 = _mm_shuffle_epi32(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
				184	M0 = _mm_min_epi16(M0, M1); \
				185	M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
				186	M0 = _mm_min_epi16(M0, M1); \
				187	M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 0, 1)); \
				188	M0 = _mm_min_epi16(M0, M1); \
				189	} \
				190	}
				191	#else
				192	#define SSE_MINPOS(M0, M1) \
				193	{ \
				194	M1 = _mm_shuffle_epi32(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
				195	M0 = _mm_min_epi16(M0, M1); \
				196	M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
				197	M0 = _mm_min_epi16(M0, M1); \
				198	M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 0, 1)); \
				199	M0 = _mm_min_epi16(M0, M1); \
				200	}
				201	#endif
				202
				203	/* Normalize state metrics K = 5:
				204	* Compute 16-wide normalization by subtracting the smallest value from
				205	* all values. Inputs are 16 packed 16-bit integers across 2 XMM registers.
				206	* Two intermediate registers are used and normalized results are placed
				207	* in the originating locations.
				208	*
				209	* Input:
				210	* M0:1 - Path metrics 0:1 (packed 16-bit integers)
				211	*
				212	* Output:
				213	* M0:1 - Normalized path metrics 0:1
				214	*/
				215	#define SSE_NORMALIZE_K5(M0, M1, M2, M3) \
				216	{ \
				217	M2 = _mm_min_epi16(M0, M1); \
				218	SSE_MINPOS(M2, M3) \
				219	SSE_BROADCAST(M2) \
				220	M0 = _mm_subs_epi16(M0, M2); \
				221	M1 = _mm_subs_epi16(M1, M2); \
				222	}
				223
				224	/* Normalize state metrics K = 7:
				225	* Compute 64-wide normalization by subtracting the smallest value from
				226	* all values. Inputs are 8 registers of accumulated sums and 4 temporary
				227	* registers. Normalized results are returned in the originating locations.
				228	*
				229	* Input:
				230	* M0:7 - Path metrics 0:7 (packed 16-bit integers)
				231	*
				232	* Output:
				233	* M0:7 - Normalized path metrics 0:7
				234	*/
				235	#define SSE_NORMALIZE_K7(M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, M10, M11) \
				236	{ \
				237	M8 = _mm_min_epi16(M0, M1); \
				238	M9 = _mm_min_epi16(M2, M3); \
				239	M10 = _mm_min_epi16(M4, M5); \
				240	M11 = _mm_min_epi16(M6, M7); \
				241	M8 = _mm_min_epi16(M8, M9); \
				242	M10 = _mm_min_epi16(M10, M11); \
				243	M8 = _mm_min_epi16(M8, M10); \
				244	SSE_MINPOS(M8, M9) \
				245	SSE_BROADCAST(M8) \
				246	M0 = _mm_subs_epi16(M0, M8); \
				247	M1 = _mm_subs_epi16(M1, M8); \
				248	M2 = _mm_subs_epi16(M2, M8); \
				249	M3 = _mm_subs_epi16(M3, M8); \
				250	M4 = _mm_subs_epi16(M4, M8); \
				251	M5 = _mm_subs_epi16(M5, M8); \
				252	M6 = _mm_subs_epi16(M6, M8); \
				253	M7 = _mm_subs_epi16(M7, M8); \
				254	}
				255
				256	/* Combined BMU/PMU (K=5, N=2)
				257	* Compute branch metrics followed by path metrics for half rate 16-state
				258	* trellis. 8 butterflies are computed. Accumulated path sums are not
				259	* preserved and read and written into the same memory location. Normalize
				260	* sums if requires.
				261	*/
				262	__always_inline static void _sse_metrics_k5_n2(const int16_t *val,
				263	const int16_t out, int16_t sums, int16_t *paths, int norm)
				264	{
				265	__m128i m0, m1, m2, m3, m4, m5, m6;
				266
				267	/* (BMU) Load input sequence */
				268	m2 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));
				269
				270	/* (BMU) Load trellis outputs */
				271	m0 = _mm_load_si128((__m128i *) &out[0]);
				272	m1 = _mm_load_si128((__m128i *) &out[8]);
				273
				274	/* (BMU) Compute branch metrics */
				275	m0 = _mm_sign_epi16(m2, m0);
				276	m1 = _mm_sign_epi16(m2, m1);
				277	m2 = _mm_hadds_epi16(m0, m1);
				278
				279	/* (PMU) Load accumulated path metrics */
				280	m0 = _mm_load_si128((__m128i *) &sums[0]);
				281	m1 = _mm_load_si128((__m128i *) &sums[8]);
				282
				283	SSE_DEINTERLEAVE_K5(m0, m1, m3, m4)
				284
				285	/* (PMU) Butterflies: 0-7 */
				286	SSE_BUTTERFLY(m3, m4, m2, m5, m6)
				287
				288	if (norm)
				289	SSE_NORMALIZE_K5(m2, m6, m0, m1)
				290
				291	_mm_store_si128((__m128i *) &sums[0], m2);
				292	_mm_store_si128((__m128i *) &sums[8], m6);
				293	_mm_store_si128((__m128i *) &paths[0], m5);
				294	_mm_store_si128((__m128i *) &paths[8], m4);
				295	}
				296
				297	/* Combined BMU/PMU (K=5, N=3 and N=4)
				298	* Compute branch metrics followed by path metrics for 16-state and rates
				299	* to 1/4. 8 butterflies are computed. The input sequence is read four 16-bit
				300	* values at a time, and extra values should be set to zero for rates other
				301	* than 1/4. Normally only rates 1/3 and 1/4 are used as there is a
				302	* dedicated implementation of rate 1/2.
				303	*/
				304	__always_inline static void _sse_metrics_k5_n4(const int16_t *val,
				305	const int16_t out, int16_t sums, int16_t *paths, int norm)
				306	{
				307	__m128i m0, m1, m2, m3, m4, m5, m6;
				308
				309	/* (BMU) Load input sequence */
				310	m4 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));
				311
				312	/* (BMU) Load trellis outputs */
				313	m0 = _mm_load_si128((__m128i *) &out[0]);
				314	m1 = _mm_load_si128((__m128i *) &out[8]);
				315	m2 = _mm_load_si128((__m128i *) &out[16]);
				316	m3 = _mm_load_si128((__m128i *) &out[24]);
				317
				318	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m4, m2)
				319
				320	/* (PMU) Load accumulated path metrics */
				321	m0 = _mm_load_si128((__m128i *) &sums[0]);
				322	m1 = _mm_load_si128((__m128i *) &sums[8]);
				323
				324	SSE_DEINTERLEAVE_K5(m0, m1, m3, m4)
				325
				326	/* (PMU) Butterflies: 0-7 */
				327	SSE_BUTTERFLY(m3, m4, m2, m5, m6)
				328
				329	if (norm)
				330	SSE_NORMALIZE_K5(m2, m6, m0, m1)
				331
				332	_mm_store_si128((__m128i *) &sums[0], m2);
				333	_mm_store_si128((__m128i *) &sums[8], m6);
				334	_mm_store_si128((__m128i *) &paths[0], m5);
				335	_mm_store_si128((__m128i *) &paths[8], m4);
				336	}
				337
				338	/* Combined BMU/PMU (K=7, N=2)
				339	* Compute branch metrics followed by path metrics for half rate 64-state
				340	* trellis. 32 butterfly operations are computed. Deinterleaving path
				341	* metrics requires usage of the full SSE register file, so separate sums
				342	* before computing branch metrics to avoid register spilling.
				343	*/
				344	__always_inline static void _sse_metrics_k7_n2(const int16_t *val,
				345	const int16_t out, int16_t sums, int16_t *paths, int norm)
				346	{
				347	__m128i m0, m1, m2, m3, m4, m5, m6, m7, m8,
				348	m9, m10, m11, m12, m13, m14, m15;
				349
				350	/* (PMU) Load accumulated path metrics */
				351	m0 = _mm_load_si128((__m128i *) &sums[0]);
				352	m1 = _mm_load_si128((__m128i *) &sums[8]);
				353	m2 = _mm_load_si128((__m128i *) &sums[16]);
				354	m3 = _mm_load_si128((__m128i *) &sums[24]);
				355	m4 = _mm_load_si128((__m128i *) &sums[32]);
				356	m5 = _mm_load_si128((__m128i *) &sums[40]);
				357	m6 = _mm_load_si128((__m128i *) &sums[48]);
				358	m7 = _mm_load_si128((__m128i *) &sums[56]);
				359
				360	/* (PMU) Deinterleave to even-odd registers */
				361	SSE_DEINTERLEAVE_K7(m0, m1, m2, m3 ,m4 ,m5, m6, m7,
				362	m8, m9, m10, m11, m12, m13, m14, m15)
				363
				364	/* (BMU) Load input symbols */
				365	m7 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));
				366
				367	/* (BMU) Load trellis outputs */
				368	m0 = _mm_load_si128((__m128i *) &out[0]);
				369	m1 = _mm_load_si128((__m128i *) &out[8]);
				370	m2 = _mm_load_si128((__m128i *) &out[16]);
				371	m3 = _mm_load_si128((__m128i *) &out[24]);
				372
				373	SSE_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m4, m5)
				374
				375	m0 = _mm_load_si128((__m128i *) &out[32]);
				376	m1 = _mm_load_si128((__m128i *) &out[40]);
				377	m2 = _mm_load_si128((__m128i *) &out[48]);
				378	m3 = _mm_load_si128((__m128i *) &out[56]);
				379
				380	SSE_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m6, m7)
				381
				382	/* (PMU) Butterflies: 0-15 */
				383	SSE_BUTTERFLY(m8, m9, m4, m0, m1)
				384	SSE_BUTTERFLY(m10, m11, m5, m2, m3)
				385
				386	_mm_store_si128((__m128i *) &paths[0], m0);
				387	_mm_store_si128((__m128i *) &paths[8], m2);
				388	_mm_store_si128((__m128i *) &paths[32], m9);
				389	_mm_store_si128((__m128i *) &paths[40], m11);
				390
				391	/* (PMU) Butterflies: 17-31 */
				392	SSE_BUTTERFLY(m12, m13, m6, m0, m2)
				393	SSE_BUTTERFLY(m14, m15, m7, m9, m11)
				394
				395	_mm_store_si128((__m128i *) &paths[16], m0);
				396	_mm_store_si128((__m128i *) &paths[24], m9);
				397	_mm_store_si128((__m128i *) &paths[48], m13);
				398	_mm_store_si128((__m128i *) &paths[56], m15);
				399
				400	if (norm)
				401	SSE_NORMALIZE_K7(m4, m1, m5, m3, m6, m2,
				402	m7, m11, m0, m8, m9, m10)
				403
				404	_mm_store_si128((__m128i *) &sums[0], m4);
				405	_mm_store_si128((__m128i *) &sums[8], m5);
				406	_mm_store_si128((__m128i *) &sums[16], m6);
				407	_mm_store_si128((__m128i *) &sums[24], m7);
				408	_mm_store_si128((__m128i *) &sums[32], m1);
				409	_mm_store_si128((__m128i *) &sums[40], m3);
				410	_mm_store_si128((__m128i *) &sums[48], m2);
				411	_mm_store_si128((__m128i *) &sums[56], m11);
				412	}
				413
				414	/* Combined BMU/PMU (K=7, N=3 and N=4)
				415	* Compute branch metrics followed by path metrics for half rate 64-state
				416	* trellis. 32 butterfly operations are computed. Deinterleave path
				417	* metrics before computing branch metrics as in the half rate case.
				418	*/
				419	__always_inline static void _sse_metrics_k7_n4(const int16_t *val,
				420	const int16_t out, int16_t sums, int16_t *paths, int norm)
				421	{
				422	__m128i m0, m1, m2, m3, m4, m5, m6, m7;
				423	__m128i m8, m9, m10, m11, m12, m13, m14, m15;
				424
				425	/* (PMU) Load accumulated path metrics */
				426	m0 = _mm_load_si128((__m128i *) &sums[0]);
				427	m1 = _mm_load_si128((__m128i *) &sums[8]);
				428	m2 = _mm_load_si128((__m128i *) &sums[16]);
				429	m3 = _mm_load_si128((__m128i *) &sums[24]);
				430	m4 = _mm_load_si128((__m128i *) &sums[32]);
				431	m5 = _mm_load_si128((__m128i *) &sums[40]);
				432	m6 = _mm_load_si128((__m128i *) &sums[48]);
				433	m7 = _mm_load_si128((__m128i *) &sums[56]);
				434
				435	/* (PMU) Deinterleave into even and odd packed registers */
				436	SSE_DEINTERLEAVE_K7(m0, m1, m2, m3 ,m4 ,m5, m6, m7,
				437	m8, m9, m10, m11, m12, m13, m14, m15)
				438
				439	/* (BMU) Load and expand 8-bit input out to 16-bits */
				440	m7 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));
				441
				442	/* (BMU) Load and compute branch metrics */
				443	m0 = _mm_load_si128((__m128i *) &out[0]);
				444	m1 = _mm_load_si128((__m128i *) &out[8]);
				445	m2 = _mm_load_si128((__m128i *) &out[16]);
				446	m3 = _mm_load_si128((__m128i *) &out[24]);
				447
				448	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m4)
				449
				450	m0 = _mm_load_si128((__m128i *) &out[32]);
				451	m1 = _mm_load_si128((__m128i *) &out[40]);
				452	m2 = _mm_load_si128((__m128i *) &out[48]);
				453	m3 = _mm_load_si128((__m128i *) &out[56]);
				454
				455	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m5)
				456
				457	m0 = _mm_load_si128((__m128i *) &out[64]);
				458	m1 = _mm_load_si128((__m128i *) &out[72]);
				459	m2 = _mm_load_si128((__m128i *) &out[80]);
				460	m3 = _mm_load_si128((__m128i *) &out[88]);
				461
				462	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m6)
				463
				464	m0 = _mm_load_si128((__m128i *) &out[96]);
				465	m1 = _mm_load_si128((__m128i *) &out[104]);
				466	m2 = _mm_load_si128((__m128i *) &out[112]);
				467	m3 = _mm_load_si128((__m128i *) &out[120]);
				468
				469	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m7)
				470
				471	/* (PMU) Butterflies: 0-15 */
				472	SSE_BUTTERFLY(m8, m9, m4, m0, m1)
				473	SSE_BUTTERFLY(m10, m11, m5, m2, m3)
				474
				475	_mm_store_si128((__m128i *) &paths[0], m0);
				476	_mm_store_si128((__m128i *) &paths[8], m2);
				477	_mm_store_si128((__m128i *) &paths[32], m9);
				478	_mm_store_si128((__m128i *) &paths[40], m11);
				479
				480	/* (PMU) Butterflies: 17-31 */
				481	SSE_BUTTERFLY(m12, m13, m6, m0, m2)
				482	SSE_BUTTERFLY(m14, m15, m7, m9, m11)
				483
				484	_mm_store_si128((__m128i *) &paths[16], m0);
				485	_mm_store_si128((__m128i *) &paths[24], m9);
				486	_mm_store_si128((__m128i *) &paths[48], m13);
				487	_mm_store_si128((__m128i *) &paths[56], m15);
				488
				489	if (norm)
				490	SSE_NORMALIZE_K7(m4, m1, m5, m3, m6, m2,
				491	m7, m11, m0, m8, m9, m10)
				492
				493	_mm_store_si128((__m128i *) &sums[0], m4);
				494	_mm_store_si128((__m128i *) &sums[8], m5);
				495	_mm_store_si128((__m128i *) &sums[16], m6);
				496	_mm_store_si128((__m128i *) &sums[24], m7);
				497	_mm_store_si128((__m128i *) &sums[32], m1);
				498	_mm_store_si128((__m128i *) &sums[40], m3);
				499	_mm_store_si128((__m128i *) &sums[48], m2);
				500	_mm_store_si128((__m128i *) &sums[56], m11);
				501	}