/*
 * Intel SSE Viterbi decoder
 *
 * Copyright (C) 2013, 2014 Thomas Tsou <tom@tsou.cc>
 *
 * All Rights Reserved
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */

extern int sse41_supported;

/* Octo-Viterbi butterfly
 * Compute an 8-wide butterfly generating 16 path decisions and 16
 * accumulated sums. Inputs are all packed 16-bit integers in three 128-bit
 * XMM registers. Two intermediate registers are used and results are set
 * in the upper 4 registers.
 *
 * Input:
 * M0 - Path metrics 0 (packed 16-bit integers)
 * M1 - Path metrics 1 (packed 16-bit integers)
 * M2 - Branch metrics (packed 16-bit integers)
 *
 * Output:
 * M2 - Selected and accumulated path metrics 0
 * M4 - Selected and accumulated path metrics 1
 * M3 - Path selections 0
 * M1 - Path selections 1
 */
#define SSE_BUTTERFLY(M0, M1, M2, M3, M4) \
{ \
        M3 = _mm_adds_epi16(M0, M2); \
        M4 = _mm_subs_epi16(M1, M2); \
        M0 = _mm_subs_epi16(M0, M2); \
        M1 = _mm_adds_epi16(M1, M2); \
        M2 = _mm_max_epi16(M3, M4); \
        M3 = _mm_or_si128(_mm_cmpgt_epi16(M3, M4), _mm_cmpeq_epi16(M3, M4)); \
        M4 = _mm_max_epi16(M0, M1); \
        M1 = _mm_or_si128(_mm_cmpgt_epi16(M0, M1), _mm_cmpeq_epi16(M0, M1)); \
}

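/* Illustrative scalar reference (hypothetical, not used by the decoder):
 * what one of the 8 lanes of SSE_BUTTERFLY computes. The SIMD version above
 * additionally saturates the 16-bit additions and subtractions.
 */
static inline void _example_acs_butterfly_lane(int16_t m0, int16_t m1,
        int16_t bm, int16_t *new0, int16_t *new1, int16_t *dec0, int16_t *dec1)
{
        int16_t a = m0 + bm, b = m1 - bm;       /* candidates for new state 0 */
        int16_t c = m0 - bm, d = m1 + bm;       /* candidates for new state 1 */

        *new0 = a >= b ? a : b;                 /* _mm_max_epi16(M3, M4) */
        *dec0 = a >= b ? -1 : 0;                /* cmpgt | cmpeq selection mask */
        *new1 = c >= d ? c : d;                 /* _mm_max_epi16(M0, M1) */
        *dec1 = c >= d ? -1 : 0;
}
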
/* Two lane deinterleaving K = 5:
 * Take 16 interleaved 16-bit integers and deinterleave them into 2 packed
 * 128-bit registers. The operation is summarized below. Four registers are
 * used, with the lower 2 as input and the upper 2 as output.
 *
 * In - 10101010 10101010 10101010 10101010
 * Out - 00000000 11111111 00000000 11111111
 *
 * Input:
 * M0:1 - Packed 16-bit integers
 *
 * Output:
 * M2:3 - Deinterleaved packed 16-bit integers
 */
#define _I8_SHUFFLE_MASK 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0

#define SSE_DEINTERLEAVE_K5(M0, M1, M2, M3) \
{ \
        M2 = _mm_set_epi8(_I8_SHUFFLE_MASK); \
        M0 = _mm_shuffle_epi8(M0, M2); \
        M1 = _mm_shuffle_epi8(M1, M2); \
        M2 = _mm_unpacklo_epi64(M0, M1); \
        M3 = _mm_unpackhi_epi64(M0, M1); \
}

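/* Illustrative scalar reference (hypothetical, not part of the decoder):
 * the K = 5 deinterleave is equivalent to splitting 16 interleaved metrics
 * into their 8 even-indexed and 8 odd-indexed values, which is what the
 * shuffle/unpack sequence above produces in the two output registers.
 */
static inline void _example_deinterleave_k5(const int16_t in[16],
        int16_t even[8], int16_t odd[8])
{
        int i;

        for (i = 0; i < 8; i++) {
                even[i] = in[2 * i];
                odd[i] = in[2 * i + 1];
        }
}
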
/* Two lane deinterleaving K = 7:
 * Take 64 interleaved 16-bit integers and deinterleave them into 8 packed
 * 128-bit registers. The operation is summarized below. 16 registers are
 * used, with the lower 8 as input and the upper 8 as output.
 *
 * In - 10101010 10101010 10101010 10101010 ...
 * Out - 00000000 11111111 00000000 11111111 ...
 *
 * Input:
 * M0:7 - Packed 16-bit integers
 *
 * Output:
 * M8:15 - Deinterleaved packed 16-bit integers
 */
#define SSE_DEINTERLEAVE_K7(M0, M1, M2, M3, M4, M5, M6, M7, \
                M8, M9, M10, M11, M12, M13, M14, M15) \
{ \
        M8 = _mm_set_epi8(_I8_SHUFFLE_MASK); \
        M0 = _mm_shuffle_epi8(M0, M8); \
        M1 = _mm_shuffle_epi8(M1, M8); \
        M2 = _mm_shuffle_epi8(M2, M8); \
        M3 = _mm_shuffle_epi8(M3, M8); \
        M4 = _mm_shuffle_epi8(M4, M8); \
        M5 = _mm_shuffle_epi8(M5, M8); \
        M6 = _mm_shuffle_epi8(M6, M8); \
        M7 = _mm_shuffle_epi8(M7, M8); \
        M8 = _mm_unpacklo_epi64(M0, M1); \
        M9 = _mm_unpackhi_epi64(M0, M1); \
        M10 = _mm_unpacklo_epi64(M2, M3); \
        M11 = _mm_unpackhi_epi64(M2, M3); \
        M12 = _mm_unpacklo_epi64(M4, M5); \
        M13 = _mm_unpackhi_epi64(M4, M5); \
        M14 = _mm_unpacklo_epi64(M6, M7); \
        M15 = _mm_unpackhi_epi64(M6, M7); \
}

/* Generate branch metrics N = 2:
 * Compute 16 branch metrics from trellis outputs and input values.
 *
 * Input:
 * M0:3 - 16 x 2 packed 16-bit trellis outputs
 * M4 - Expanded and packed 16-bit input value
 *
 * Output:
 * M6:7 - 16 computed 16-bit branch metrics
 */
#define SSE_BRANCH_METRIC_N2(M0, M1, M2, M3, M4, M6, M7) \
{ \
        M0 = _mm_sign_epi16(M4, M0); \
        M1 = _mm_sign_epi16(M4, M1); \
        M2 = _mm_sign_epi16(M4, M2); \
        M3 = _mm_sign_epi16(M4, M3); \
        M6 = _mm_hadds_epi16(M0, M1); \
        M7 = _mm_hadds_epi16(M2, M3); \
}

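/* Illustrative scalar reference (hypothetical, not used by the decoder):
 * one N = 2 branch metric is the correlation of the two received soft
 * symbols with the corresponding pair of expanded trellis outputs
 * (nominally +1 or -1), which the sign/hadds sequence above computes with
 * saturation across all 16 metrics at once.
 */
static inline int16_t _example_branch_metric_n2(const int16_t in[2],
        const int16_t out[2])
{
        return (int16_t) (in[0] * out[0] + in[1] * out[1]);
}
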
/* Generate branch metrics N = 4:
 * Compute 8 branch metrics from trellis outputs and input values. This
 * macro is reused for N less than 4 where the extra soft input bits are
 * padded.
 *
 * Input:
 * M0:3 - 8 x 4 packed 16-bit trellis outputs
 * M4 - Expanded and packed 16-bit input value
 *
 * Output:
 * M5 - 8 computed 16-bit branch metrics
 */
#define SSE_BRANCH_METRIC_N4(M0, M1, M2, M3, M4, M5) \
{ \
        M0 = _mm_sign_epi16(M4, M0); \
        M1 = _mm_sign_epi16(M4, M1); \
        M2 = _mm_sign_epi16(M4, M2); \
        M3 = _mm_sign_epi16(M4, M3); \
        M0 = _mm_hadds_epi16(M0, M1); \
        M1 = _mm_hadds_epi16(M2, M3); \
        M5 = _mm_hadds_epi16(M0, M1); \
}

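/* Illustrative scalar reference (hypothetical): an N = 4 branch metric is
 * the correlation of four received soft symbols with the four expanded
 * trellis outputs, which the three hadds stages above accumulate for all
 * 8 metrics in parallel.
 */
static inline int16_t _example_branch_metric_n4(const int16_t in[4],
        const int16_t out[4])
{
        return (int16_t) (in[0] * out[0] + in[1] * out[1] +
                          in[2] * out[2] + in[3] * out[3]);
}
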
/* Horizontal minimum
 * Compute horizontal minimum of packed unsigned 16-bit integers and place
 * result in the low 16-bit element of the source register. Only SSE 4.1
 * has a dedicated minpos instruction. One intermediate register is used
 * if SSE 4.1 is not available. This is a destructive operation and the
 * source register is overwritten.
 *
 * Input:
 * M0 - Packed unsigned 16-bit integers
 *
 * Output:
 * M0 - Minimum value placed in low 16-bit element
 */
#if defined(HAVE_SSE4_1) || defined(HAVE_SSE41)
#define SSE_MINPOS(M0, M1) \
{ \
        if (sse41_supported) { \
                M0 = _mm_minpos_epu16(M0); \
        } else { \
                M1 = _mm_shuffle_epi32(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
                M0 = _mm_min_epi16(M0, M1); \
                M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
                M0 = _mm_min_epi16(M0, M1); \
                M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 0, 1)); \
                M0 = _mm_min_epi16(M0, M1); \
        } \
}
#else
#define SSE_MINPOS(M0, M1) \
{ \
        M1 = _mm_shuffle_epi32(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
        M0 = _mm_min_epi16(M0, M1); \
        M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
        M0 = _mm_min_epi16(M0, M1); \
        M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 0, 1)); \
        M0 = _mm_min_epi16(M0, M1); \
}
#endif

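/* Illustrative scalar reference (hypothetical, not part of the decoder):
 * the shuffle/min fallback above reduces 8 packed values to their minimum
 * in the low element, equivalent to this loop over one register's worth
 * of metrics.
 */
static inline int16_t _example_hmin8(const int16_t v[8])
{
        int16_t min = v[0];
        int i;

        for (i = 1; i < 8; i++) {
                if (v[i] < min)
                        min = v[i];
        }

        return min;
}
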
/* Normalize state metrics K = 5:
 * Compute 16-wide normalization by subtracting the smallest value from
 * all values. Inputs are 16 packed 16-bit integers across 2 XMM registers.
 * Two intermediate registers are used and normalized results are placed
 * in the originating locations.
 *
 * Input:
 * M0:1 - Path metrics 0:1 (packed 16-bit integers)
 *
 * Output:
 * M0:1 - Normalized path metrics 0:1
 */
#define SSE_NORMALIZE_K5(M0, M1, M2, M3) \
{ \
        M2 = _mm_min_epi16(M0, M1); \
        SSE_MINPOS(M2, M3) \
        SSE_BROADCAST(M2) \
        M0 = _mm_subs_epi16(M0, M2); \
        M1 = _mm_subs_epi16(M1, M2); \
}

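/* Illustrative scalar reference (hypothetical, not part of the decoder):
 * normalization subtracts the minimum metric from every metric so that the
 * accumulated sums stay within 16-bit range; the SIMD macros above do this
 * with a broadcast minimum and saturating subtraction.
 */
static inline void _example_normalize(int16_t *metrics, int num)
{
        int16_t min = metrics[0];
        int i;

        for (i = 1; i < num; i++) {
                if (metrics[i] < min)
                        min = metrics[i];
        }

        for (i = 0; i < num; i++)
                metrics[i] -= min;
}
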
/* Normalize state metrics K = 7:
 * Compute 64-wide normalization by subtracting the smallest value from
 * all values. Inputs are 8 registers of accumulated sums and 4 temporary
 * registers. Normalized results are returned in the originating locations.
 *
 * Input:
 * M0:7 - Path metrics 0:7 (packed 16-bit integers)
 *
 * Output:
 * M0:7 - Normalized path metrics 0:7
 */
#define SSE_NORMALIZE_K7(M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, M10, M11) \
{ \
        M8 = _mm_min_epi16(M0, M1); \
        M9 = _mm_min_epi16(M2, M3); \
        M10 = _mm_min_epi16(M4, M5); \
        M11 = _mm_min_epi16(M6, M7); \
        M8 = _mm_min_epi16(M8, M9); \
        M10 = _mm_min_epi16(M10, M11); \
        M8 = _mm_min_epi16(M8, M10); \
        SSE_MINPOS(M8, M9) \
        SSE_BROADCAST(M8) \
        M0 = _mm_subs_epi16(M0, M8); \
        M1 = _mm_subs_epi16(M1, M8); \
        M2 = _mm_subs_epi16(M2, M8); \
        M3 = _mm_subs_epi16(M3, M8); \
        M4 = _mm_subs_epi16(M4, M8); \
        M5 = _mm_subs_epi16(M5, M8); \
        M6 = _mm_subs_epi16(M6, M8); \
        M7 = _mm_subs_epi16(M7, M8); \
}

/* Combined BMU/PMU (K=5, N=2)
 * Compute branch metrics followed by path metrics for the half rate,
 * 16-state trellis. 8 butterflies are computed. Accumulated path sums are
 * not preserved; they are read from and written back to the same memory
 * location. Normalize the sums if required.
 */
__always_inline static void _sse_metrics_k5_n2(const int16_t *val,
        const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
        __m128i m0, m1, m2, m3, m4, m5, m6;

        /* (BMU) Load input sequence */
        m2 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

        /* (BMU) Load trellis outputs */
        m0 = _mm_load_si128((__m128i *) &out[0]);
        m1 = _mm_load_si128((__m128i *) &out[8]);

        /* (BMU) Compute branch metrics */
        m0 = _mm_sign_epi16(m2, m0);
        m1 = _mm_sign_epi16(m2, m1);
        m2 = _mm_hadds_epi16(m0, m1);

        /* (PMU) Load accumulated path metrics */
        m0 = _mm_load_si128((__m128i *) &sums[0]);
        m1 = _mm_load_si128((__m128i *) &sums[8]);

        SSE_DEINTERLEAVE_K5(m0, m1, m3, m4)

        /* (PMU) Butterflies: 0-7 */
        SSE_BUTTERFLY(m3, m4, m2, m5, m6)

        if (norm)
                SSE_NORMALIZE_K5(m2, m6, m0, m1)

        _mm_store_si128((__m128i *) &sums[0], m2);
        _mm_store_si128((__m128i *) &sums[8], m6);
        _mm_store_si128((__m128i *) &paths[0], m5);
        _mm_store_si128((__m128i *) &paths[8], m4);
}

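/* Hedged usage sketch (hypothetical, not from this library): one trellis
 * step of the K = 5, N = 2 kernel. The caller is assumed to advance the
 * soft input by N values and the path-decision output by the 16 trellis
 * states per decoded symbol; the normalization interval chosen here is
 * arbitrary.
 */
static inline void _example_k5_n2_step(const int16_t *val, const int16_t *out,
        int16_t *sums, int16_t *paths, int i)
{
        _sse_metrics_k5_n2(&val[2 * i], out, sums, &paths[16 * i],
                           (i % 32) == 0);
}
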
/* Combined BMU/PMU (K=5, N=3 and N=4)
 * Compute branch metrics followed by path metrics for the 16-state trellis
 * at rates up to 1/4. 8 butterflies are computed. The input sequence is
 * read four 16-bit values at a time, and extra values should be set to
 * zero for rates other than 1/4. Normally only rates 1/3 and 1/4 are used,
 * as there is a dedicated implementation for rate 1/2.
 */
__always_inline static void _sse_metrics_k5_n4(const int16_t *val,
        const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
        __m128i m0, m1, m2, m3, m4, m5, m6;

        /* (BMU) Load input sequence */
        m4 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

        /* (BMU) Load trellis outputs */
        m0 = _mm_load_si128((__m128i *) &out[0]);
        m1 = _mm_load_si128((__m128i *) &out[8]);
        m2 = _mm_load_si128((__m128i *) &out[16]);
        m3 = _mm_load_si128((__m128i *) &out[24]);

        SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m4, m2)

        /* (PMU) Load accumulated path metrics */
        m0 = _mm_load_si128((__m128i *) &sums[0]);
        m1 = _mm_load_si128((__m128i *) &sums[8]);

        SSE_DEINTERLEAVE_K5(m0, m1, m3, m4)

        /* (PMU) Butterflies: 0-7 */
        SSE_BUTTERFLY(m3, m4, m2, m5, m6)

        if (norm)
                SSE_NORMALIZE_K5(m2, m6, m0, m1)

        _mm_store_si128((__m128i *) &sums[0], m2);
        _mm_store_si128((__m128i *) &sums[8], m6);
        _mm_store_si128((__m128i *) &paths[0], m5);
        _mm_store_si128((__m128i *) &paths[8], m4);
}

/* Combined BMU/PMU (K=7, N=2)
 * Compute branch metrics followed by path metrics for the half rate,
 * 64-state trellis. 32 butterfly operations are computed. Deinterleaving
 * the path metrics requires use of the full SSE register file, so the sums
 * are separated before the branch metrics are computed to avoid register
 * spilling.
 */
__always_inline static void _sse_metrics_k7_n2(const int16_t *val,
        const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
        __m128i m0, m1, m2, m3, m4, m5, m6, m7, m8,
                m9, m10, m11, m12, m13, m14, m15;

        /* (PMU) Load accumulated path metrics */
        m0 = _mm_load_si128((__m128i *) &sums[0]);
        m1 = _mm_load_si128((__m128i *) &sums[8]);
        m2 = _mm_load_si128((__m128i *) &sums[16]);
        m3 = _mm_load_si128((__m128i *) &sums[24]);
        m4 = _mm_load_si128((__m128i *) &sums[32]);
        m5 = _mm_load_si128((__m128i *) &sums[40]);
        m6 = _mm_load_si128((__m128i *) &sums[48]);
        m7 = _mm_load_si128((__m128i *) &sums[56]);

        /* (PMU) Deinterleave to even-odd registers */
        SSE_DEINTERLEAVE_K7(m0, m1, m2, m3, m4, m5, m6, m7,
                        m8, m9, m10, m11, m12, m13, m14, m15)

        /* (BMU) Load input symbols */
        m7 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

        /* (BMU) Load trellis outputs */
        m0 = _mm_load_si128((__m128i *) &out[0]);
        m1 = _mm_load_si128((__m128i *) &out[8]);
        m2 = _mm_load_si128((__m128i *) &out[16]);
        m3 = _mm_load_si128((__m128i *) &out[24]);

        SSE_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m4, m5)

        m0 = _mm_load_si128((__m128i *) &out[32]);
        m1 = _mm_load_si128((__m128i *) &out[40]);
        m2 = _mm_load_si128((__m128i *) &out[48]);
        m3 = _mm_load_si128((__m128i *) &out[56]);

        SSE_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m6, m7)

        /* (PMU) Butterflies: 0-15 */
        SSE_BUTTERFLY(m8, m9, m4, m0, m1)
        SSE_BUTTERFLY(m10, m11, m5, m2, m3)

        _mm_store_si128((__m128i *) &paths[0], m0);
        _mm_store_si128((__m128i *) &paths[8], m2);
        _mm_store_si128((__m128i *) &paths[32], m9);
        _mm_store_si128((__m128i *) &paths[40], m11);

        /* (PMU) Butterflies: 16-31 */
        SSE_BUTTERFLY(m12, m13, m6, m0, m2)
        SSE_BUTTERFLY(m14, m15, m7, m9, m11)

        _mm_store_si128((__m128i *) &paths[16], m0);
        _mm_store_si128((__m128i *) &paths[24], m9);
        _mm_store_si128((__m128i *) &paths[48], m13);
        _mm_store_si128((__m128i *) &paths[56], m15);

        if (norm)
                SSE_NORMALIZE_K7(m4, m1, m5, m3, m6, m2,
                                m7, m11, m0, m8, m9, m10)

        _mm_store_si128((__m128i *) &sums[0], m4);
        _mm_store_si128((__m128i *) &sums[8], m5);
        _mm_store_si128((__m128i *) &sums[16], m6);
        _mm_store_si128((__m128i *) &sums[24], m7);
        _mm_store_si128((__m128i *) &sums[32], m1);
        _mm_store_si128((__m128i *) &sums[40], m3);
        _mm_store_si128((__m128i *) &sums[48], m2);
        _mm_store_si128((__m128i *) &sums[56], m11);
}

/* Combined BMU/PMU (K=7, N=3 and N=4)
 * Compute branch metrics followed by path metrics for the 64-state trellis
 * at rates up to 1/4. 32 butterfly operations are computed. Path metrics
 * are deinterleaved before the branch metrics are computed, as in the half
 * rate case.
 */
__always_inline static void _sse_metrics_k7_n4(const int16_t *val,
        const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
        __m128i m0, m1, m2, m3, m4, m5, m6, m7;
        __m128i m8, m9, m10, m11, m12, m13, m14, m15;

        /* (PMU) Load accumulated path metrics */
        m0 = _mm_load_si128((__m128i *) &sums[0]);
        m1 = _mm_load_si128((__m128i *) &sums[8]);
        m2 = _mm_load_si128((__m128i *) &sums[16]);
        m3 = _mm_load_si128((__m128i *) &sums[24]);
        m4 = _mm_load_si128((__m128i *) &sums[32]);
        m5 = _mm_load_si128((__m128i *) &sums[40]);
        m6 = _mm_load_si128((__m128i *) &sums[48]);
        m7 = _mm_load_si128((__m128i *) &sums[56]);

        /* (PMU) Deinterleave into even and odd packed registers */
        SSE_DEINTERLEAVE_K7(m0, m1, m2, m3, m4, m5, m6, m7,
                        m8, m9, m10, m11, m12, m13, m14, m15)

        /* (BMU) Load input symbols */
        m7 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

        /* (BMU) Load and compute branch metrics */
        m0 = _mm_load_si128((__m128i *) &out[0]);
        m1 = _mm_load_si128((__m128i *) &out[8]);
        m2 = _mm_load_si128((__m128i *) &out[16]);
        m3 = _mm_load_si128((__m128i *) &out[24]);

        SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m4)

        m0 = _mm_load_si128((__m128i *) &out[32]);
        m1 = _mm_load_si128((__m128i *) &out[40]);
        m2 = _mm_load_si128((__m128i *) &out[48]);
        m3 = _mm_load_si128((__m128i *) &out[56]);

        SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m5)

        m0 = _mm_load_si128((__m128i *) &out[64]);
        m1 = _mm_load_si128((__m128i *) &out[72]);
        m2 = _mm_load_si128((__m128i *) &out[80]);
        m3 = _mm_load_si128((__m128i *) &out[88]);

        SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m6)

        m0 = _mm_load_si128((__m128i *) &out[96]);
        m1 = _mm_load_si128((__m128i *) &out[104]);
        m2 = _mm_load_si128((__m128i *) &out[112]);
        m3 = _mm_load_si128((__m128i *) &out[120]);

        SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m7)

        /* (PMU) Butterflies: 0-15 */
        SSE_BUTTERFLY(m8, m9, m4, m0, m1)
        SSE_BUTTERFLY(m10, m11, m5, m2, m3)

        _mm_store_si128((__m128i *) &paths[0], m0);
        _mm_store_si128((__m128i *) &paths[8], m2);
        _mm_store_si128((__m128i *) &paths[32], m9);
        _mm_store_si128((__m128i *) &paths[40], m11);

        /* (PMU) Butterflies: 16-31 */
        SSE_BUTTERFLY(m12, m13, m6, m0, m2)
        SSE_BUTTERFLY(m14, m15, m7, m9, m11)

        _mm_store_si128((__m128i *) &paths[16], m0);
        _mm_store_si128((__m128i *) &paths[24], m9);
        _mm_store_si128((__m128i *) &paths[48], m13);
        _mm_store_si128((__m128i *) &paths[56], m15);

        if (norm)
                SSE_NORMALIZE_K7(m4, m1, m5, m3, m6, m2,
                                m7, m11, m0, m8, m9, m10)

        _mm_store_si128((__m128i *) &sums[0], m4);
        _mm_store_si128((__m128i *) &sums[8], m5);
        _mm_store_si128((__m128i *) &sums[16], m6);
        _mm_store_si128((__m128i *) &sums[24], m7);
        _mm_store_si128((__m128i *) &sums[32], m1);
        _mm_store_si128((__m128i *) &sums[40], m3);
        _mm_store_si128((__m128i *) &sums[48], m2);
        _mm_store_si128((__m128i *) &sums[56], m11);
}