blob: dbee3cc9b20620ccc2795c313953cb9d1480183b [file] [log] [blame]
Philipp Maiere8ae9fc2017-03-20 12:08:42 +01001/*
2 * SSE Convolution
3 * Copyright (C) 2012, 2013 Thomas Tsou <tom@tsou.cc>
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2.1 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
14 *
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 */
19
20#include <malloc.h>
21#include <string.h>
22#include <stdio.h>
23#include "convolve_sse_3.h"
24
25#ifdef HAVE_CONFIG_H
26#include "config.h"
27#endif
28
29#ifdef HAVE_SSE3
30#include <xmmintrin.h>
31#include <pmmintrin.h>
32
33/* 4-tap SSE complex-real convolution */
34void sse_conv_real4(const float *x, int x_len,
35 const float *h, int h_len,
36 float *y, int y_len,
37 int start, int len, int step, int offset)
38{
39 /* NOTE: The parameter list of this function has to match the parameter
40 * list of _base_convolve_real() in convolve_base.c. This specific
41 * implementation, ignores some of the parameters of
42 * _base_convolve_complex(), which are: x_len, y_len, offset, step */
43
44 __m128 m0, m1, m2, m3, m4, m5, m6, m7;
45
46 const float *_x = &x[2 * (-(h_len - 1) + start)];
47
48 /* Load (aligned) filter taps */
49 m0 = _mm_load_ps(&h[0]);
50 m1 = _mm_load_ps(&h[4]);
51 m7 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
52
53 for (int i = 0; i < len; i++) {
54 /* Load (unaligned) input data */
55 m0 = _mm_loadu_ps(&_x[2 * i + 0]);
56 m1 = _mm_loadu_ps(&_x[2 * i + 4]);
57 m2 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
58 m3 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
59
60 /* Quad multiply */
61 m4 = _mm_mul_ps(m2, m7);
62 m5 = _mm_mul_ps(m3, m7);
63
64 /* Sum and store */
65 m6 = _mm_hadd_ps(m4, m5);
66 m0 = _mm_hadd_ps(m6, m6);
67
68 _mm_store_ss(&y[2 * i + 0], m0);
69 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(0, 3, 2, 1));
70 _mm_store_ss(&y[2 * i + 1], m0);
71 }
72}
73
74/* 8-tap SSE complex-real convolution */
75void sse_conv_real8(const float *x, int x_len,
76 const float *h, int h_len,
77 float *y, int y_len,
78 int start, int len, int step, int offset)
79{
80 /* See NOTE in sse_conv_real4() */
81
82 __m128 m0, m1, m2, m3, m4, m5, m6, m7, m8, m9;
83
84 const float *_x = &x[2 * (-(h_len - 1) + start)];
85
86 /* Load (aligned) filter taps */
87 m0 = _mm_load_ps(&h[0]);
88 m1 = _mm_load_ps(&h[4]);
89 m2 = _mm_load_ps(&h[8]);
90 m3 = _mm_load_ps(&h[12]);
91
92 m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
93 m5 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
94
95 for (int i = 0; i < len; i++) {
96 /* Load (unaligned) input data */
97 m0 = _mm_loadu_ps(&_x[2 * i + 0]);
98 m1 = _mm_loadu_ps(&_x[2 * i + 4]);
99 m2 = _mm_loadu_ps(&_x[2 * i + 8]);
100 m3 = _mm_loadu_ps(&_x[2 * i + 12]);
101
102 m6 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
103 m7 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
104 m8 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
105 m9 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
106
107 /* Quad multiply */
108 m6 = _mm_mul_ps(m6, m4);
109 m7 = _mm_mul_ps(m7, m4);
110 m8 = _mm_mul_ps(m8, m5);
111 m9 = _mm_mul_ps(m9, m5);
112
113 /* Sum and store */
114 m6 = _mm_add_ps(m6, m8);
115 m7 = _mm_add_ps(m7, m9);
116 m6 = _mm_hadd_ps(m6, m7);
117 m6 = _mm_hadd_ps(m6, m6);
118
119 _mm_store_ss(&y[2 * i + 0], m6);
120 m6 = _mm_shuffle_ps(m6, m6, _MM_SHUFFLE(0, 3, 2, 1));
121 _mm_store_ss(&y[2 * i + 1], m6);
122 }
123}
124
125/* 12-tap SSE complex-real convolution */
126void sse_conv_real12(const float *x, int x_len,
127 const float *h, int h_len,
128 float *y, int y_len,
129 int start, int len, int step, int offset)
130{
131 /* See NOTE in sse_conv_real4() */
132
133 __m128 m0, m1, m2, m3, m4, m5, m6, m7;
134 __m128 m8, m9, m10, m11, m12, m13, m14;
135
136 const float *_x = &x[2 * (-(h_len - 1) + start)];
137
138 /* Load (aligned) filter taps */
139 m0 = _mm_load_ps(&h[0]);
140 m1 = _mm_load_ps(&h[4]);
141 m2 = _mm_load_ps(&h[8]);
142 m3 = _mm_load_ps(&h[12]);
143 m4 = _mm_load_ps(&h[16]);
144 m5 = _mm_load_ps(&h[20]);
145
146 m12 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
147 m13 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
148 m14 = _mm_shuffle_ps(m4, m5, _MM_SHUFFLE(0, 2, 0, 2));
149
150 for (int i = 0; i < len; i++) {
151 /* Load (unaligned) input data */
152 m0 = _mm_loadu_ps(&_x[2 * i + 0]);
153 m1 = _mm_loadu_ps(&_x[2 * i + 4]);
154 m2 = _mm_loadu_ps(&_x[2 * i + 8]);
155 m3 = _mm_loadu_ps(&_x[2 * i + 12]);
156
157 m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
158 m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
159 m6 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
160 m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
161
162 m0 = _mm_loadu_ps(&_x[2 * i + 16]);
163 m1 = _mm_loadu_ps(&_x[2 * i + 20]);
164
165 m8 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
166 m9 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
167
168 /* Quad multiply */
169 m0 = _mm_mul_ps(m4, m12);
170 m1 = _mm_mul_ps(m5, m12);
171 m2 = _mm_mul_ps(m6, m13);
172 m3 = _mm_mul_ps(m7, m13);
173 m4 = _mm_mul_ps(m8, m14);
174 m5 = _mm_mul_ps(m9, m14);
175
176 /* Sum and store */
177 m8 = _mm_add_ps(m0, m2);
178 m9 = _mm_add_ps(m1, m3);
179 m10 = _mm_add_ps(m8, m4);
180 m11 = _mm_add_ps(m9, m5);
181
182 m2 = _mm_hadd_ps(m10, m11);
183 m3 = _mm_hadd_ps(m2, m2);
184
185 _mm_store_ss(&y[2 * i + 0], m3);
186 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(0, 3, 2, 1));
187 _mm_store_ss(&y[2 * i + 1], m3);
188 }
189}
190
191/* 16-tap SSE complex-real convolution */
192void sse_conv_real16(const float *x, int x_len,
193 const float *h, int h_len,
194 float *y, int y_len,
195 int start, int len, int step, int offset)
196{
197 /* See NOTE in sse_conv_real4() */
198
199 __m128 m0, m1, m2, m3, m4, m5, m6, m7;
200 __m128 m8, m9, m10, m11, m12, m13, m14, m15;
201
202 const float *_x = &x[2 * (-(h_len - 1) + start)];
203
204 /* Load (aligned) filter taps */
205 m0 = _mm_load_ps(&h[0]);
206 m1 = _mm_load_ps(&h[4]);
207 m2 = _mm_load_ps(&h[8]);
208 m3 = _mm_load_ps(&h[12]);
209
210 m4 = _mm_load_ps(&h[16]);
211 m5 = _mm_load_ps(&h[20]);
212 m6 = _mm_load_ps(&h[24]);
213 m7 = _mm_load_ps(&h[28]);
214
215 m12 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
216 m13 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
217 m14 = _mm_shuffle_ps(m4, m5, _MM_SHUFFLE(0, 2, 0, 2));
218 m15 = _mm_shuffle_ps(m6, m7, _MM_SHUFFLE(0, 2, 0, 2));
219
220 for (int i = 0; i < len; i++) {
221 /* Load (unaligned) input data */
222 m0 = _mm_loadu_ps(&_x[2 * i + 0]);
223 m1 = _mm_loadu_ps(&_x[2 * i + 4]);
224 m2 = _mm_loadu_ps(&_x[2 * i + 8]);
225 m3 = _mm_loadu_ps(&_x[2 * i + 12]);
226
227 m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
228 m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
229 m6 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
230 m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
231
232 m0 = _mm_loadu_ps(&_x[2 * i + 16]);
233 m1 = _mm_loadu_ps(&_x[2 * i + 20]);
234 m2 = _mm_loadu_ps(&_x[2 * i + 24]);
235 m3 = _mm_loadu_ps(&_x[2 * i + 28]);
236
237 m8 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
238 m9 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
239 m10 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
240 m11 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
241
242 /* Quad multiply */
243 m0 = _mm_mul_ps(m4, m12);
244 m1 = _mm_mul_ps(m5, m12);
245 m2 = _mm_mul_ps(m6, m13);
246 m3 = _mm_mul_ps(m7, m13);
247
248 m4 = _mm_mul_ps(m8, m14);
249 m5 = _mm_mul_ps(m9, m14);
250 m6 = _mm_mul_ps(m10, m15);
251 m7 = _mm_mul_ps(m11, m15);
252
253 /* Sum and store */
254 m8 = _mm_add_ps(m0, m2);
255 m9 = _mm_add_ps(m1, m3);
256 m10 = _mm_add_ps(m4, m6);
257 m11 = _mm_add_ps(m5, m7);
258
259 m0 = _mm_add_ps(m8, m10);
260 m1 = _mm_add_ps(m9, m11);
261 m2 = _mm_hadd_ps(m0, m1);
262 m3 = _mm_hadd_ps(m2, m2);
263
264 _mm_store_ss(&y[2 * i + 0], m3);
265 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(0, 3, 2, 1));
266 _mm_store_ss(&y[2 * i + 1], m3);
267 }
268}
269
270/* 20-tap SSE complex-real convolution */
271void sse_conv_real20(const float *x, int x_len,
272 const float *h, int h_len,
273 float *y, int y_len,
274 int start, int len, int step, int offset)
275{
276 /* See NOTE in sse_conv_real4() */
277
278 __m128 m0, m1, m2, m3, m4, m5, m6, m7;
279 __m128 m8, m9, m11, m12, m13, m14, m15;
280
281 const float *_x = &x[2 * (-(h_len - 1) + start)];
282
283 /* Load (aligned) filter taps */
284 m0 = _mm_load_ps(&h[0]);
285 m1 = _mm_load_ps(&h[4]);
286 m2 = _mm_load_ps(&h[8]);
287 m3 = _mm_load_ps(&h[12]);
288 m4 = _mm_load_ps(&h[16]);
289 m5 = _mm_load_ps(&h[20]);
290 m6 = _mm_load_ps(&h[24]);
291 m7 = _mm_load_ps(&h[28]);
292 m8 = _mm_load_ps(&h[32]);
293 m9 = _mm_load_ps(&h[36]);
294
295 m11 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
296 m12 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
297 m13 = _mm_shuffle_ps(m4, m5, _MM_SHUFFLE(0, 2, 0, 2));
298 m14 = _mm_shuffle_ps(m6, m7, _MM_SHUFFLE(0, 2, 0, 2));
299 m15 = _mm_shuffle_ps(m8, m9, _MM_SHUFFLE(0, 2, 0, 2));
300
301 for (int i = 0; i < len; i++) {
302 /* Multiply-accumulate first 12 taps */
303 m0 = _mm_loadu_ps(&_x[2 * i + 0]);
304 m1 = _mm_loadu_ps(&_x[2 * i + 4]);
305 m2 = _mm_loadu_ps(&_x[2 * i + 8]);
306 m3 = _mm_loadu_ps(&_x[2 * i + 12]);
307 m4 = _mm_loadu_ps(&_x[2 * i + 16]);
308 m5 = _mm_loadu_ps(&_x[2 * i + 20]);
309
310 m6 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
311 m7 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
312 m8 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
313 m9 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
314 m0 = _mm_shuffle_ps(m4, m5, _MM_SHUFFLE(0, 2, 0, 2));
315 m1 = _mm_shuffle_ps(m4, m5, _MM_SHUFFLE(1, 3, 1, 3));
316
317 m2 = _mm_mul_ps(m6, m11);
318 m3 = _mm_mul_ps(m7, m11);
319 m4 = _mm_mul_ps(m8, m12);
320 m5 = _mm_mul_ps(m9, m12);
321 m6 = _mm_mul_ps(m0, m13);
322 m7 = _mm_mul_ps(m1, m13);
323
324 m0 = _mm_add_ps(m2, m4);
325 m1 = _mm_add_ps(m3, m5);
326 m8 = _mm_add_ps(m0, m6);
327 m9 = _mm_add_ps(m1, m7);
328
329 /* Multiply-accumulate last 8 taps */
330 m0 = _mm_loadu_ps(&_x[2 * i + 24]);
331 m1 = _mm_loadu_ps(&_x[2 * i + 28]);
332 m2 = _mm_loadu_ps(&_x[2 * i + 32]);
333 m3 = _mm_loadu_ps(&_x[2 * i + 36]);
334
335 m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
336 m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
337 m6 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
338 m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
339
340 m0 = _mm_mul_ps(m4, m14);
341 m1 = _mm_mul_ps(m5, m14);
342 m2 = _mm_mul_ps(m6, m15);
343 m3 = _mm_mul_ps(m7, m15);
344
345 m4 = _mm_add_ps(m0, m2);
346 m5 = _mm_add_ps(m1, m3);
347
348 /* Final sum and store */
349 m0 = _mm_add_ps(m8, m4);
350 m1 = _mm_add_ps(m9, m5);
351 m2 = _mm_hadd_ps(m0, m1);
352 m3 = _mm_hadd_ps(m2, m2);
353
354 _mm_store_ss(&y[2 * i + 0], m3);
355 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(0, 3, 2, 1));
356 _mm_store_ss(&y[2 * i + 1], m3);
357 }
358}
359
360/* 4*N-tap SSE complex-real convolution */
361void sse_conv_real4n(const float *x, int x_len,
362 const float *h, int h_len,
363 float *y, int y_len,
364 int start, int len, int step, int offset)
365{
366 /* See NOTE in sse_conv_real4() */
367
368 __m128 m0, m1, m2, m4, m5, m6, m7;
369
370 const float *_x = &x[2 * (-(h_len - 1) + start)];
371
372 for (int i = 0; i < len; i++) {
373 /* Zero */
374 m6 = _mm_setzero_ps();
375 m7 = _mm_setzero_ps();
376
377 for (int n = 0; n < h_len / 4; n++) {
378 /* Load (aligned) filter taps */
379 m0 = _mm_load_ps(&h[8 * n + 0]);
380 m1 = _mm_load_ps(&h[8 * n + 4]);
381 m2 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
382
383 /* Load (unaligned) input data */
384 m0 = _mm_loadu_ps(&_x[2 * i + 8 * n + 0]);
385 m1 = _mm_loadu_ps(&_x[2 * i + 8 * n + 4]);
386 m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
387 m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
388
389 /* Quad multiply */
390 m0 = _mm_mul_ps(m2, m4);
391 m1 = _mm_mul_ps(m2, m5);
392
393 /* Accumulate */
394 m6 = _mm_add_ps(m6, m0);
395 m7 = _mm_add_ps(m7, m1);
396 }
397
398 m0 = _mm_hadd_ps(m6, m7);
399 m0 = _mm_hadd_ps(m0, m0);
400
401 _mm_store_ss(&y[2 * i + 0], m0);
402 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(0, 3, 2, 1));
403 _mm_store_ss(&y[2 * i + 1], m0);
404 }
405}
406
407/* 4*N-tap SSE complex-complex convolution */
408void sse_conv_cmplx_4n(const float *x, int x_len,
409 const float *h, int h_len,
410 float *y, int y_len,
411 int start, int len, int step, int offset)
412{
413 /* NOTE: The parameter list of this function has to match the parameter
414 * list of _base_convolve_complex() in convolve_base.c. This specific
415 * implementation, ignores some of the parameters of
416 * _base_convolve_complex(), which are: x_len, y_len, offset, step. */
417
418 __m128 m0, m1, m2, m3, m4, m5, m6, m7;
419
420 const float *_x = &x[2 * (-(h_len - 1) + start)];
421
422 for (int i = 0; i < len; i++) {
423 /* Zero */
424 m6 = _mm_setzero_ps();
425 m7 = _mm_setzero_ps();
426
427 for (int n = 0; n < h_len / 4; n++) {
428 /* Load (aligned) filter taps */
429 m0 = _mm_load_ps(&h[8 * n + 0]);
430 m1 = _mm_load_ps(&h[8 * n + 4]);
431 m2 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
432 m3 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
433
434 /* Load (unaligned) input data */
435 m0 = _mm_loadu_ps(&_x[2 * i + 8 * n + 0]);
436 m1 = _mm_loadu_ps(&_x[2 * i + 8 * n + 4]);
437 m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
438 m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
439
440 /* Quad multiply */
441 m0 = _mm_mul_ps(m2, m4);
442 m1 = _mm_mul_ps(m3, m5);
443
444 m2 = _mm_mul_ps(m2, m5);
445 m3 = _mm_mul_ps(m3, m4);
446
447 /* Sum */
448 m0 = _mm_sub_ps(m0, m1);
449 m2 = _mm_add_ps(m2, m3);
450
451 /* Accumulate */
452 m6 = _mm_add_ps(m6, m0);
453 m7 = _mm_add_ps(m7, m2);
454 }
455
456 m0 = _mm_hadd_ps(m6, m7);
457 m0 = _mm_hadd_ps(m0, m0);
458
459 _mm_store_ss(&y[2 * i + 0], m0);
460 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(0, 3, 2, 1));
461 _mm_store_ss(&y[2 * i + 1], m0);
462 }
463}
464
465/* 8*N-tap SSE complex-complex convolution */
466void sse_conv_cmplx_8n(const float *x, int x_len,
467 const float *h, int h_len,
468 float *y, int y_len,
469 int start, int len, int step, int offset)
470{
471 /* See NOTE in sse_conv_cmplx_4n() */
472
473 __m128 m0, m1, m2, m3, m4, m5, m6, m7;
474 __m128 m8, m9, m10, m11, m12, m13, m14, m15;
475
476 const float *_x = &x[2 * (-(h_len - 1) + start)];
477
478 for (int i = 0; i < len; i++) {
479 /* Zero */
480 m12 = _mm_setzero_ps();
481 m13 = _mm_setzero_ps();
482 m14 = _mm_setzero_ps();
483 m15 = _mm_setzero_ps();
484
485 for (int n = 0; n < h_len / 8; n++) {
486 /* Load (aligned) filter taps */
487 m0 = _mm_load_ps(&h[16 * n + 0]);
488 m1 = _mm_load_ps(&h[16 * n + 4]);
489 m2 = _mm_load_ps(&h[16 * n + 8]);
490 m3 = _mm_load_ps(&h[16 * n + 12]);
491
492 m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
493 m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
494 m6 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
495 m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
496
497 /* Load (unaligned) input data */
498 m0 = _mm_loadu_ps(&_x[2 * i + 16 * n + 0]);
499 m1 = _mm_loadu_ps(&_x[2 * i + 16 * n + 4]);
500 m2 = _mm_loadu_ps(&_x[2 * i + 16 * n + 8]);
501 m3 = _mm_loadu_ps(&_x[2 * i + 16 * n + 12]);
502
503 m8 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
504 m9 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
505 m10 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
506 m11 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
507
508 /* Quad multiply */
509 m0 = _mm_mul_ps(m4, m8);
510 m1 = _mm_mul_ps(m5, m9);
511 m2 = _mm_mul_ps(m6, m10);
512 m3 = _mm_mul_ps(m7, m11);
513
514 m4 = _mm_mul_ps(m4, m9);
515 m5 = _mm_mul_ps(m5, m8);
516 m6 = _mm_mul_ps(m6, m11);
517 m7 = _mm_mul_ps(m7, m10);
518
519 /* Sum */
520 m0 = _mm_sub_ps(m0, m1);
521 m2 = _mm_sub_ps(m2, m3);
522 m4 = _mm_add_ps(m4, m5);
523 m6 = _mm_add_ps(m6, m7);
524
525 /* Accumulate */
526 m12 = _mm_add_ps(m12, m0);
527 m13 = _mm_add_ps(m13, m2);
528 m14 = _mm_add_ps(m14, m4);
529 m15 = _mm_add_ps(m15, m6);
530 }
531
532 m0 = _mm_add_ps(m12, m13);
533 m1 = _mm_add_ps(m14, m15);
534 m2 = _mm_hadd_ps(m0, m1);
535 m2 = _mm_hadd_ps(m2, m2);
536
537 _mm_store_ss(&y[2 * i + 0], m2);
538 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 3, 2, 1));
539 _mm_store_ss(&y[2 * i + 1], m2);
540 }
541}
542#endif