/*! \file conv_acc_sse_impl.h
 * Accelerated Viterbi decoder implementation:
 * Actual definitions which are being included
 * from both conv_acc_sse.c and conv_acc_sse_avx.c. */
/*
 * Copyright (C) 2013, 2014 Thomas Tsou <tom@tsou.cc>
 *
 * All Rights Reserved
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */

extern int sse41_supported;

/* Octo-Viterbi butterfly
 * Compute 8-wide butterfly generating 16 path decisions and 16 accumulated
 * sums. Inputs are all packed 16-bit integers in three 128-bit XMM registers.
 * Two intermediate registers are used and results are set in the upper 4
 * registers.
 *
 * Input:
 * M0 - Path metrics 0 (packed 16-bit integers)
 * M1 - Path metrics 1 (packed 16-bit integers)
 * M2 - Branch metrics (packed 16-bit integers)
 *
 * Output:
 * M2 - Selected and accumulated path metrics 0
 * M4 - Selected and accumulated path metrics 1
 * M3 - Path selections 0
 * M1 - Path selections 1
 */
#define SSE_BUTTERFLY(M0, M1, M2, M3, M4) \
{ \
	M3 = _mm_adds_epi16(M0, M2); \
	M4 = _mm_subs_epi16(M1, M2); \
	M0 = _mm_subs_epi16(M0, M2); \
	M1 = _mm_adds_epi16(M1, M2); \
	M2 = _mm_max_epi16(M3, M4); \
	M3 = _mm_or_si128(_mm_cmpgt_epi16(M3, M4), _mm_cmpeq_epi16(M3, M4)); \
	M4 = _mm_max_epi16(M0, M1); \
	M1 = _mm_or_si128(_mm_cmpgt_epi16(M0, M1), _mm_cmpeq_epi16(M0, M1)); \
}
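
/* Illustrative scalar reference for a single lane of SSE_BUTTERFLY (an
 * add-compare-select step). This helper is an added sketch and is not used
 * by the decoder; the SIMD macro uses saturating adds/subs, plain arithmetic
 * is shown here for readability. */
static inline void scalar_butterfly_sketch(int16_t pm0, int16_t pm1, int16_t bm,
					   int16_t *sum0, int16_t *sel0,
					   int16_t *sum1, int16_t *sel1)
{
	int16_t a = pm0 + bm;	/* M3 = M0 + M2 */
	int16_t b = pm1 - bm;	/* M4 = M1 - M2 */
	int16_t c = pm0 - bm;	/* M0 = M0 - M2 */
	int16_t d = pm1 + bm;	/* M1 = M1 + M2 */

	*sum0 = a > b ? a : b;		/* M2 = max(M3, M4) */
	*sel0 = a >= b ? -1 : 0;	/* M3 = (cmpgt | cmpeq) mask */
	*sum1 = c > d ? c : d;		/* M4 = max(M0, M1) */
	*sel1 = c >= d ? -1 : 0;	/* M1 = (cmpgt | cmpeq) mask */
}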

/* Two lane deinterleaving K = 5:
 * Take 16 interleaved 16-bit integers and deinterleave to 2 packed 128-bit
 * registers. The operation is summarized below. Four registers are used with
 * the lower 2 as input and upper 2 as output.
 *
 * In  - 10101010 10101010 10101010 10101010
 * Out - 00000000 11111111 00000000 11111111
 *
 * Input:
 * M0:1 - Packed 16-bit integers
 *
 * Output:
 * M2:3 - Deinterleaved packed 16-bit integers
 */
#define _I8_SHUFFLE_MASK 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0

#define SSE_DEINTERLEAVE_K5(M0, M1, M2, M3) \
{ \
	M2 = _mm_set_epi8(_I8_SHUFFLE_MASK); \
	M0 = _mm_shuffle_epi8(M0, M2); \
	M1 = _mm_shuffle_epi8(M1, M2); \
	M2 = _mm_unpacklo_epi64(M0, M1); \
	M3 = _mm_unpackhi_epi64(M0, M1); \
}
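
/* Illustrative scalar reference (added sketch, not used by the decoder):
 * the shuffle and unpack sequence above simply splits 16 interleaved
 * metrics into their even-indexed and odd-indexed lanes. */
static inline void scalar_deinterleave_sketch(const int16_t in[16],
					      int16_t even[8], int16_t odd[8])
{
	int i;

	for (i = 0; i < 8; i++) {
		even[i] = in[2 * i];
		odd[i] = in[2 * i + 1];
	}
}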

/* Two lane deinterleaving K = 7:
 * Take 64 interleaved 16-bit integers and deinterleave to 8 packed 128-bit
 * registers. The operation is summarized below. 16 registers are used with
 * the lower 8 as input and upper 8 as output.
 *
 * In  - 10101010 10101010 10101010 10101010 ...
 * Out - 00000000 11111111 00000000 11111111 ...
 *
 * Input:
 * M0:7 - Packed 16-bit integers
 *
 * Output:
 * M8:15 - Deinterleaved packed 16-bit integers
 */
#define SSE_DEINTERLEAVE_K7(M0, M1, M2, M3, M4, M5, M6, M7, \
	M8, M9, M10, M11, M12, M13, M14, M15) \
{ \
	M8 = _mm_set_epi8(_I8_SHUFFLE_MASK); \
	M0 = _mm_shuffle_epi8(M0, M8); \
	M1 = _mm_shuffle_epi8(M1, M8); \
	M2 = _mm_shuffle_epi8(M2, M8); \
	M3 = _mm_shuffle_epi8(M3, M8); \
	M4 = _mm_shuffle_epi8(M4, M8); \
	M5 = _mm_shuffle_epi8(M5, M8); \
	M6 = _mm_shuffle_epi8(M6, M8); \
	M7 = _mm_shuffle_epi8(M7, M8); \
	M8 = _mm_unpacklo_epi64(M0, M1); \
	M9 = _mm_unpackhi_epi64(M0, M1); \
	M10 = _mm_unpacklo_epi64(M2, M3); \
	M11 = _mm_unpackhi_epi64(M2, M3); \
	M12 = _mm_unpacklo_epi64(M4, M5); \
	M13 = _mm_unpackhi_epi64(M4, M5); \
	M14 = _mm_unpacklo_epi64(M6, M7); \
	M15 = _mm_unpackhi_epi64(M6, M7); \
}

/* Generate branch metrics N = 2:
 * Compute 16 branch metrics from trellis outputs and input values.
 *
 * Input:
 * M0:3 - 16 x 2 packed 16-bit trellis outputs
 * M4 - Expanded and packed 16-bit input value
 *
 * Output:
 * M6:7 - 16 computed 16-bit branch metrics
 */
#define SSE_BRANCH_METRIC_N2(M0, M1, M2, M3, M4, M6, M7) \
{ \
	M0 = _mm_sign_epi16(M4, M0); \
	M1 = _mm_sign_epi16(M4, M1); \
	M2 = _mm_sign_epi16(M4, M2); \
	M3 = _mm_sign_epi16(M4, M3); \
	M6 = _mm_hadds_epi16(M0, M1); \
	M7 = _mm_hadds_epi16(M2, M3); \
}
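
/* Illustrative scalar reference (added sketch, not used by the decoder):
 * a branch metric is the correlation of the expected trellis output with
 * the received soft values. With the expected outputs stored as -1/0/+1,
 * the multiply below mirrors _mm_sign_epi16() followed by the horizontal
 * add (the SIMD code uses saturating adds). */
static inline int16_t scalar_branch_metric_sketch(const int16_t *expected,
						  const int16_t *received,
						  int n)
{
	int16_t metric = 0;
	int i;

	for (i = 0; i < n; i++)
		metric += expected[i] * received[i];

	return metric;
}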

/* Generate branch metrics N = 4:
 * Compute 8 branch metrics from trellis outputs and input values. This
 * macro is reused for N less than 4 where the extra soft input bits are
 * padded.
 *
 * Input:
 * M0:3 - 8 x 4 packed 16-bit trellis outputs
 * M4 - Expanded and packed 16-bit input value
 *
 * Output:
 * M5 - 8 computed 16-bit branch metrics
 */
#define SSE_BRANCH_METRIC_N4(M0, M1, M2, M3, M4, M5) \
{ \
	M0 = _mm_sign_epi16(M4, M0); \
	M1 = _mm_sign_epi16(M4, M1); \
	M2 = _mm_sign_epi16(M4, M2); \
	M3 = _mm_sign_epi16(M4, M3); \
	M0 = _mm_hadds_epi16(M0, M1); \
	M1 = _mm_hadds_epi16(M2, M3); \
	M5 = _mm_hadds_epi16(M0, M1); \
}

/* Horizontal minimum
 * Compute horizontal minimum of packed unsigned 16-bit integers and place
 * result in the low 16-bit element of the source register. Only SSE 4.1
 * has a dedicated minpos instruction. One intermediate register is used
 * if SSE 4.1 is not available. This is a destructive operation and the
 * source register is overwritten.
 *
 * Input:
 * M0 - Packed unsigned 16-bit integers
 *
 * Output:
 * M0 - Minimum value placed in low 16-bit element
 */
#if defined(HAVE_SSE4_1) || defined(HAVE_SSE41)
#define SSE_MINPOS(M0, M1) \
{ \
	if (sse41_supported) { \
		M0 = _mm_minpos_epu16(M0); \
	} else { \
		M1 = _mm_shuffle_epi32(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
		M0 = _mm_min_epi16(M0, M1); \
		M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
		M0 = _mm_min_epi16(M0, M1); \
		M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 0, 1)); \
		M0 = _mm_min_epi16(M0, M1); \
	} \
}
#else
#define SSE_MINPOS(M0, M1) \
{ \
	M1 = _mm_shuffle_epi32(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
	M0 = _mm_min_epi16(M0, M1); \
	M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
	M0 = _mm_min_epi16(M0, M1); \
	M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 0, 1)); \
	M0 = _mm_min_epi16(M0, M1); \
}
#endif
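
/* Illustrative scalar reference (added sketch, not used by the decoder):
 * the non-SSE4.1 fallback above performs the same 8-lane minimum reduction
 * in three shuffle/min steps. */
static inline int16_t scalar_hmin8_sketch(const int16_t *m)
{
	int16_t min = m[0];
	int i;

	for (i = 1; i < 8; i++) {
		if (m[i] < min)
			min = m[i];
	}

	return min;
}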

/* Normalize state metrics K = 5:
 * Compute 16-wide normalization by subtracting the smallest value from
 * all values. Inputs are 16 packed 16-bit integers across 2 XMM registers.
 * Two intermediate registers are used and normalized results are placed
 * in the originating locations.
 *
 * Input:
 * M0:1 - Path metrics 0:1 (packed 16-bit integers)
 *
 * Output:
 * M0:1 - Normalized path metrics 0:1
 */
#define SSE_NORMALIZE_K5(M0, M1, M2, M3) \
{ \
	M2 = _mm_min_epi16(M0, M1); \
	SSE_MINPOS(M2, M3) \
	SSE_BROADCAST(M2) \
	M0 = _mm_subs_epi16(M0, M2); \
	M1 = _mm_subs_epi16(M1, M2); \
}
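
/* Illustrative scalar reference (added sketch, not used by the decoder):
 * normalization subtracts the current minimum (cf. the horizontal minimum
 * above) from every metric, so that the saturating 16-bit accumulation
 * does not clip over long code blocks. */
static inline void scalar_normalize_sketch(int16_t *sums, int num)
{
	int16_t min = sums[0];
	int i;

	for (i = 1; i < num; i++) {
		if (sums[i] < min)
			min = sums[i];
	}

	for (i = 0; i < num; i++)
		sums[i] -= min;
}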

/* Normalize state metrics K = 7:
 * Compute 64-wide normalization by subtracting the smallest value from
 * all values. Inputs are 8 registers of accumulated sums and 4 temporary
 * registers. Normalized results are returned in the originating locations.
 *
 * Input:
 * M0:7 - Path metrics 0:7 (packed 16-bit integers)
 *
 * Output:
 * M0:7 - Normalized path metrics 0:7
 */
#define SSE_NORMALIZE_K7(M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, M10, M11) \
{ \
	M8 = _mm_min_epi16(M0, M1); \
	M9 = _mm_min_epi16(M2, M3); \
	M10 = _mm_min_epi16(M4, M5); \
	M11 = _mm_min_epi16(M6, M7); \
	M8 = _mm_min_epi16(M8, M9); \
	M10 = _mm_min_epi16(M10, M11); \
	M8 = _mm_min_epi16(M8, M10); \
	SSE_MINPOS(M8, M9) \
	SSE_BROADCAST(M8) \
	M0 = _mm_subs_epi16(M0, M8); \
	M1 = _mm_subs_epi16(M1, M8); \
	M2 = _mm_subs_epi16(M2, M8); \
	M3 = _mm_subs_epi16(M3, M8); \
	M4 = _mm_subs_epi16(M4, M8); \
	M5 = _mm_subs_epi16(M5, M8); \
	M6 = _mm_subs_epi16(M6, M8); \
	M7 = _mm_subs_epi16(M7, M8); \
}

/* Combined BMU/PMU (K=5, N=2)
 * Compute branch metrics followed by path metrics for the half rate 16-state
 * trellis. 8 butterflies are computed. Accumulated path sums are not
 * preserved; they are read from and written back to the same memory
 * location. Normalize sums if required.
 */
__always_inline static void _sse_metrics_k5_n2(const int16_t *val,
	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	__m128i m0, m1, m2, m3, m4, m5, m6;

	/* (BMU) Load input sequence */
	m2 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

	/* (BMU) Load trellis outputs */
	m0 = _mm_load_si128((__m128i *) &out[0]);
	m1 = _mm_load_si128((__m128i *) &out[8]);

	/* (BMU) Compute branch metrics */
	m0 = _mm_sign_epi16(m2, m0);
	m1 = _mm_sign_epi16(m2, m1);
	m2 = _mm_hadds_epi16(m0, m1);

	/* (PMU) Load accumulated path metrics */
	m0 = _mm_load_si128((__m128i *) &sums[0]);
	m1 = _mm_load_si128((__m128i *) &sums[8]);

	SSE_DEINTERLEAVE_K5(m0, m1, m3, m4)

	/* (PMU) Butterflies: 0-7 */
	SSE_BUTTERFLY(m3, m4, m2, m5, m6)

	if (norm)
		SSE_NORMALIZE_K5(m2, m6, m0, m1)

	_mm_store_si128((__m128i *) &sums[0], m2);
	_mm_store_si128((__m128i *) &sums[8], m6);
	_mm_store_si128((__m128i *) &paths[0], m5);
	_mm_store_si128((__m128i *) &paths[8], m4);
}
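
/* Usage sketch (illustrative only; buffer names are hypothetical and sizes
 * are inferred from the loads and stores above): a K=5 trellis has 16 states,
 * so one step touches 16 path sums, 16 path decisions and 16 trellis output
 * values. The out, sums and paths buffers must be 16-byte aligned, since
 * _mm_load_si128()/_mm_store_si128() require aligned access, while val is
 * read with a 64-bit load (four 16-bit values), e.g.:
 *
 *	int16_t sums[16] __attribute__((aligned(16)));
 *	int16_t paths[16] __attribute__((aligned(16)));
 *	int16_t outputs[16] __attribute__((aligned(16)));
 *	int16_t symbols[4];
 *
 *	_sse_metrics_k5_n2(symbols, outputs, sums, paths, 1);
 *
 * The K=7 variants below follow the same pattern with 64 states per step. */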

/* Combined BMU/PMU (K=5, N=3 and N=4)
 * Compute branch metrics followed by path metrics for the 16-state trellis
 * at rates up to 1/4. 8 butterflies are computed. The input sequence is read
 * four 16-bit values at a time, and extra values should be set to zero for
 * rates other than 1/4. Normally only rates 1/3 and 1/4 are used, as there
 * is a dedicated implementation for rate 1/2.
 */
__always_inline static void _sse_metrics_k5_n4(const int16_t *val,
	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	__m128i m0, m1, m2, m3, m4, m5, m6;

	/* (BMU) Load input sequence */
	m4 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

	/* (BMU) Load trellis outputs */
	m0 = _mm_load_si128((__m128i *) &out[0]);
	m1 = _mm_load_si128((__m128i *) &out[8]);
	m2 = _mm_load_si128((__m128i *) &out[16]);
	m3 = _mm_load_si128((__m128i *) &out[24]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m4, m2)

	/* (PMU) Load accumulated path metrics */
	m0 = _mm_load_si128((__m128i *) &sums[0]);
	m1 = _mm_load_si128((__m128i *) &sums[8]);

	SSE_DEINTERLEAVE_K5(m0, m1, m3, m4)

	/* (PMU) Butterflies: 0-7 */
	SSE_BUTTERFLY(m3, m4, m2, m5, m6)

	if (norm)
		SSE_NORMALIZE_K5(m2, m6, m0, m1)

	_mm_store_si128((__m128i *) &sums[0], m2);
	_mm_store_si128((__m128i *) &sums[8], m6);
	_mm_store_si128((__m128i *) &paths[0], m5);
	_mm_store_si128((__m128i *) &paths[8], m4);
}

/* Combined BMU/PMU (K=7, N=2)
 * Compute branch metrics followed by path metrics for the half rate 64-state
 * trellis. 32 butterfly operations are computed. Deinterleaving the path
 * metrics requires the full SSE register file, so the sums are separated
 * before the branch metrics are computed to avoid register spilling.
 */
__always_inline static void _sse_metrics_k7_n2(const int16_t *val,
	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	__m128i m0, m1, m2, m3, m4, m5, m6, m7, m8,
		m9, m10, m11, m12, m13, m14, m15;

	/* (PMU) Load accumulated path metrics */
	m0 = _mm_load_si128((__m128i *) &sums[0]);
	m1 = _mm_load_si128((__m128i *) &sums[8]);
	m2 = _mm_load_si128((__m128i *) &sums[16]);
	m3 = _mm_load_si128((__m128i *) &sums[24]);
	m4 = _mm_load_si128((__m128i *) &sums[32]);
	m5 = _mm_load_si128((__m128i *) &sums[40]);
	m6 = _mm_load_si128((__m128i *) &sums[48]);
	m7 = _mm_load_si128((__m128i *) &sums[56]);

	/* (PMU) Deinterleave to even-odd registers */
	SSE_DEINTERLEAVE_K7(m0, m1, m2, m3, m4, m5, m6, m7,
			    m8, m9, m10, m11, m12, m13, m14, m15)

	/* (BMU) Load input symbols */
	m7 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

	/* (BMU) Load trellis outputs */
	m0 = _mm_load_si128((__m128i *) &out[0]);
	m1 = _mm_load_si128((__m128i *) &out[8]);
	m2 = _mm_load_si128((__m128i *) &out[16]);
	m3 = _mm_load_si128((__m128i *) &out[24]);

	SSE_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m4, m5)

	m0 = _mm_load_si128((__m128i *) &out[32]);
	m1 = _mm_load_si128((__m128i *) &out[40]);
	m2 = _mm_load_si128((__m128i *) &out[48]);
	m3 = _mm_load_si128((__m128i *) &out[56]);

	SSE_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m6, m7)

	/* (PMU) Butterflies: 0-15 */
	SSE_BUTTERFLY(m8, m9, m4, m0, m1)
	SSE_BUTTERFLY(m10, m11, m5, m2, m3)

	_mm_store_si128((__m128i *) &paths[0], m0);
	_mm_store_si128((__m128i *) &paths[8], m2);
	_mm_store_si128((__m128i *) &paths[32], m9);
	_mm_store_si128((__m128i *) &paths[40], m11);

	/* (PMU) Butterflies: 16-31 */
	SSE_BUTTERFLY(m12, m13, m6, m0, m2)
	SSE_BUTTERFLY(m14, m15, m7, m9, m11)

	_mm_store_si128((__m128i *) &paths[16], m0);
	_mm_store_si128((__m128i *) &paths[24], m9);
	_mm_store_si128((__m128i *) &paths[48], m13);
	_mm_store_si128((__m128i *) &paths[56], m15);

	if (norm)
		SSE_NORMALIZE_K7(m4, m1, m5, m3, m6, m2,
				 m7, m11, m0, m8, m9, m10)

	_mm_store_si128((__m128i *) &sums[0], m4);
	_mm_store_si128((__m128i *) &sums[8], m5);
	_mm_store_si128((__m128i *) &sums[16], m6);
	_mm_store_si128((__m128i *) &sums[24], m7);
	_mm_store_si128((__m128i *) &sums[32], m1);
	_mm_store_si128((__m128i *) &sums[40], m3);
	_mm_store_si128((__m128i *) &sums[48], m2);
	_mm_store_si128((__m128i *) &sums[56], m11);
}

/* Combined BMU/PMU (K=7, N=3 and N=4)
 * Compute branch metrics followed by path metrics for the 64-state trellis
 * at rates up to 1/4. 32 butterfly operations are computed. Deinterleave the
 * path metrics before computing the branch metrics, as in the half rate case.
 */
__always_inline static void _sse_metrics_k7_n4(const int16_t *val,
	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	__m128i m0, m1, m2, m3, m4, m5, m6, m7;
	__m128i m8, m9, m10, m11, m12, m13, m14, m15;

	/* (PMU) Load accumulated path metrics */
	m0 = _mm_load_si128((__m128i *) &sums[0]);
	m1 = _mm_load_si128((__m128i *) &sums[8]);
	m2 = _mm_load_si128((__m128i *) &sums[16]);
	m3 = _mm_load_si128((__m128i *) &sums[24]);
	m4 = _mm_load_si128((__m128i *) &sums[32]);
	m5 = _mm_load_si128((__m128i *) &sums[40]);
	m6 = _mm_load_si128((__m128i *) &sums[48]);
	m7 = _mm_load_si128((__m128i *) &sums[56]);

	/* (PMU) Deinterleave into even and odd packed registers */
	SSE_DEINTERLEAVE_K7(m0, m1, m2, m3, m4, m5, m6, m7,
			    m8, m9, m10, m11, m12, m13, m14, m15)

	/* (BMU) Load input sequence */
	m7 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

	/* (BMU) Load and compute branch metrics */
	m0 = _mm_load_si128((__m128i *) &out[0]);
	m1 = _mm_load_si128((__m128i *) &out[8]);
	m2 = _mm_load_si128((__m128i *) &out[16]);
	m3 = _mm_load_si128((__m128i *) &out[24]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m4)

	m0 = _mm_load_si128((__m128i *) &out[32]);
	m1 = _mm_load_si128((__m128i *) &out[40]);
	m2 = _mm_load_si128((__m128i *) &out[48]);
	m3 = _mm_load_si128((__m128i *) &out[56]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m5)

	m0 = _mm_load_si128((__m128i *) &out[64]);
	m1 = _mm_load_si128((__m128i *) &out[72]);
	m2 = _mm_load_si128((__m128i *) &out[80]);
	m3 = _mm_load_si128((__m128i *) &out[88]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m6)

	m0 = _mm_load_si128((__m128i *) &out[96]);
	m1 = _mm_load_si128((__m128i *) &out[104]);
	m2 = _mm_load_si128((__m128i *) &out[112]);
	m3 = _mm_load_si128((__m128i *) &out[120]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m7)

	/* (PMU) Butterflies: 0-15 */
	SSE_BUTTERFLY(m8, m9, m4, m0, m1)
	SSE_BUTTERFLY(m10, m11, m5, m2, m3)

	_mm_store_si128((__m128i *) &paths[0], m0);
	_mm_store_si128((__m128i *) &paths[8], m2);
	_mm_store_si128((__m128i *) &paths[32], m9);
	_mm_store_si128((__m128i *) &paths[40], m11);

	/* (PMU) Butterflies: 16-31 */
	SSE_BUTTERFLY(m12, m13, m6, m0, m2)
	SSE_BUTTERFLY(m14, m15, m7, m9, m11)

	_mm_store_si128((__m128i *) &paths[16], m0);
	_mm_store_si128((__m128i *) &paths[24], m9);
	_mm_store_si128((__m128i *) &paths[48], m13);
	_mm_store_si128((__m128i *) &paths[56], m15);

	if (norm)
		SSE_NORMALIZE_K7(m4, m1, m5, m3, m6, m2,
				 m7, m11, m0, m8, m9, m10)

	_mm_store_si128((__m128i *) &sums[0], m4);
	_mm_store_si128((__m128i *) &sums[8], m5);
	_mm_store_si128((__m128i *) &sums[16], m6);
	_mm_store_si128((__m128i *) &sums[24], m7);
	_mm_store_si128((__m128i *) &sums[32], m1);
	_mm_store_si128((__m128i *) &sums[40], m3);
	_mm_store_si128((__m128i *) &sums[48], m2);
	_mm_store_si128((__m128i *) &sums[56], m11);
}