/*
 * SSE Convolution
 * Copyright (C) 2012, 2013 Thomas Tsou <tom@tsou.cc>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 */

#include <malloc.h>
#include <string.h>
#include <stdio.h>
#include "convolve_sse_3.h"

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#ifdef HAVE_SSE3
#include <xmmintrin.h>
#include <pmmintrin.h>

29/* 4-tap SSE complex-real convolution */
30void sse_conv_real4(const float *x, int x_len,
31 const float *h, int h_len,
32 float *y, int y_len,
Sylvain Munauta3934a12018-12-20 19:10:26 +010033 int start, int len)
Philipp Maiere8ae9fc2017-03-20 12:08:42 +010034{
35 /* NOTE: The parameter list of this function has to match the parameter
36 * list of _base_convolve_real() in convolve_base.c. This specific
37 * implementation, ignores some of the parameters of
Sylvain Munauta3934a12018-12-20 19:10:26 +010038 * _base_convolve_complex(), which are: x_len, y_len. */
Philipp Maiere8ae9fc2017-03-20 12:08:42 +010039
40 __m128 m0, m1, m2, m3, m4, m5, m6, m7;
41
42 const float *_x = &x[2 * (-(h_len - 1) + start)];
43
44 /* Load (aligned) filter taps */
45 m0 = _mm_load_ps(&h[0]);
46 m1 = _mm_load_ps(&h[4]);
47 m7 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
48
49 for (int i = 0; i < len; i++) {
50 /* Load (unaligned) input data */
51 m0 = _mm_loadu_ps(&_x[2 * i + 0]);
52 m1 = _mm_loadu_ps(&_x[2 * i + 4]);
53 m2 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
54 m3 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
55
56 /* Quad multiply */
57 m4 = _mm_mul_ps(m2, m7);
58 m5 = _mm_mul_ps(m3, m7);
59
60 /* Sum and store */
61 m6 = _mm_hadd_ps(m4, m5);
62 m0 = _mm_hadd_ps(m6, m6);
63
64 _mm_store_ss(&y[2 * i + 0], m0);
65 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(0, 3, 2, 1));
66 _mm_store_ss(&y[2 * i + 1], m0);
67 }
68}
69
70/* 8-tap SSE complex-real convolution */
71void sse_conv_real8(const float *x, int x_len,
72 const float *h, int h_len,
73 float *y, int y_len,
Sylvain Munauta3934a12018-12-20 19:10:26 +010074 int start, int len)
Philipp Maiere8ae9fc2017-03-20 12:08:42 +010075{
76 /* See NOTE in sse_conv_real4() */
77
78 __m128 m0, m1, m2, m3, m4, m5, m6, m7, m8, m9;
79
80 const float *_x = &x[2 * (-(h_len - 1) + start)];
81
82 /* Load (aligned) filter taps */
83 m0 = _mm_load_ps(&h[0]);
84 m1 = _mm_load_ps(&h[4]);
85 m2 = _mm_load_ps(&h[8]);
86 m3 = _mm_load_ps(&h[12]);
87
88 m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
89 m5 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
90
91 for (int i = 0; i < len; i++) {
92 /* Load (unaligned) input data */
93 m0 = _mm_loadu_ps(&_x[2 * i + 0]);
94 m1 = _mm_loadu_ps(&_x[2 * i + 4]);
95 m2 = _mm_loadu_ps(&_x[2 * i + 8]);
96 m3 = _mm_loadu_ps(&_x[2 * i + 12]);
97
98 m6 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
99 m7 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
100 m8 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
101 m9 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
102
103 /* Quad multiply */
104 m6 = _mm_mul_ps(m6, m4);
105 m7 = _mm_mul_ps(m7, m4);
106 m8 = _mm_mul_ps(m8, m5);
107 m9 = _mm_mul_ps(m9, m5);
108
109 /* Sum and store */
110 m6 = _mm_add_ps(m6, m8);
111 m7 = _mm_add_ps(m7, m9);
112 m6 = _mm_hadd_ps(m6, m7);
113 m6 = _mm_hadd_ps(m6, m6);
114
115 _mm_store_ss(&y[2 * i + 0], m6);
116 m6 = _mm_shuffle_ps(m6, m6, _MM_SHUFFLE(0, 3, 2, 1));
117 _mm_store_ss(&y[2 * i + 1], m6);
118 }
119}
120
121/* 12-tap SSE complex-real convolution */
122void sse_conv_real12(const float *x, int x_len,
123 const float *h, int h_len,
124 float *y, int y_len,
Sylvain Munauta3934a12018-12-20 19:10:26 +0100125 int start, int len)
Philipp Maiere8ae9fc2017-03-20 12:08:42 +0100126{
127 /* See NOTE in sse_conv_real4() */
128
129 __m128 m0, m1, m2, m3, m4, m5, m6, m7;
130 __m128 m8, m9, m10, m11, m12, m13, m14;
131
132 const float *_x = &x[2 * (-(h_len - 1) + start)];
133
134 /* Load (aligned) filter taps */
135 m0 = _mm_load_ps(&h[0]);
136 m1 = _mm_load_ps(&h[4]);
137 m2 = _mm_load_ps(&h[8]);
138 m3 = _mm_load_ps(&h[12]);
139 m4 = _mm_load_ps(&h[16]);
140 m5 = _mm_load_ps(&h[20]);
141
142 m12 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
143 m13 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
144 m14 = _mm_shuffle_ps(m4, m5, _MM_SHUFFLE(0, 2, 0, 2));
145
146 for (int i = 0; i < len; i++) {
147 /* Load (unaligned) input data */
148 m0 = _mm_loadu_ps(&_x[2 * i + 0]);
149 m1 = _mm_loadu_ps(&_x[2 * i + 4]);
150 m2 = _mm_loadu_ps(&_x[2 * i + 8]);
151 m3 = _mm_loadu_ps(&_x[2 * i + 12]);
152
153 m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
154 m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
155 m6 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
156 m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
157
158 m0 = _mm_loadu_ps(&_x[2 * i + 16]);
159 m1 = _mm_loadu_ps(&_x[2 * i + 20]);
160
161 m8 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
162 m9 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
163
164 /* Quad multiply */
165 m0 = _mm_mul_ps(m4, m12);
166 m1 = _mm_mul_ps(m5, m12);
167 m2 = _mm_mul_ps(m6, m13);
168 m3 = _mm_mul_ps(m7, m13);
169 m4 = _mm_mul_ps(m8, m14);
170 m5 = _mm_mul_ps(m9, m14);
171
172 /* Sum and store */
173 m8 = _mm_add_ps(m0, m2);
174 m9 = _mm_add_ps(m1, m3);
175 m10 = _mm_add_ps(m8, m4);
176 m11 = _mm_add_ps(m9, m5);
177
178 m2 = _mm_hadd_ps(m10, m11);
179 m3 = _mm_hadd_ps(m2, m2);
180
181 _mm_store_ss(&y[2 * i + 0], m3);
182 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(0, 3, 2, 1));
183 _mm_store_ss(&y[2 * i + 1], m3);
184 }
185}
186
187/* 16-tap SSE complex-real convolution */
188void sse_conv_real16(const float *x, int x_len,
189 const float *h, int h_len,
190 float *y, int y_len,
Sylvain Munauta3934a12018-12-20 19:10:26 +0100191 int start, int len)
Philipp Maiere8ae9fc2017-03-20 12:08:42 +0100192{
193 /* See NOTE in sse_conv_real4() */
194
195 __m128 m0, m1, m2, m3, m4, m5, m6, m7;
196 __m128 m8, m9, m10, m11, m12, m13, m14, m15;
197
198 const float *_x = &x[2 * (-(h_len - 1) + start)];
199
200 /* Load (aligned) filter taps */
201 m0 = _mm_load_ps(&h[0]);
202 m1 = _mm_load_ps(&h[4]);
203 m2 = _mm_load_ps(&h[8]);
204 m3 = _mm_load_ps(&h[12]);
205
206 m4 = _mm_load_ps(&h[16]);
207 m5 = _mm_load_ps(&h[20]);
208 m6 = _mm_load_ps(&h[24]);
209 m7 = _mm_load_ps(&h[28]);
210
211 m12 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
212 m13 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
213 m14 = _mm_shuffle_ps(m4, m5, _MM_SHUFFLE(0, 2, 0, 2));
214 m15 = _mm_shuffle_ps(m6, m7, _MM_SHUFFLE(0, 2, 0, 2));
215
216 for (int i = 0; i < len; i++) {
217 /* Load (unaligned) input data */
218 m0 = _mm_loadu_ps(&_x[2 * i + 0]);
219 m1 = _mm_loadu_ps(&_x[2 * i + 4]);
220 m2 = _mm_loadu_ps(&_x[2 * i + 8]);
221 m3 = _mm_loadu_ps(&_x[2 * i + 12]);
222
223 m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
224 m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
225 m6 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
226 m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
227
228 m0 = _mm_loadu_ps(&_x[2 * i + 16]);
229 m1 = _mm_loadu_ps(&_x[2 * i + 20]);
230 m2 = _mm_loadu_ps(&_x[2 * i + 24]);
231 m3 = _mm_loadu_ps(&_x[2 * i + 28]);
232
233 m8 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
234 m9 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
235 m10 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
236 m11 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
237
238 /* Quad multiply */
239 m0 = _mm_mul_ps(m4, m12);
240 m1 = _mm_mul_ps(m5, m12);
241 m2 = _mm_mul_ps(m6, m13);
242 m3 = _mm_mul_ps(m7, m13);
243
244 m4 = _mm_mul_ps(m8, m14);
245 m5 = _mm_mul_ps(m9, m14);
246 m6 = _mm_mul_ps(m10, m15);
247 m7 = _mm_mul_ps(m11, m15);
248
249 /* Sum and store */
250 m8 = _mm_add_ps(m0, m2);
251 m9 = _mm_add_ps(m1, m3);
252 m10 = _mm_add_ps(m4, m6);
253 m11 = _mm_add_ps(m5, m7);
254
255 m0 = _mm_add_ps(m8, m10);
256 m1 = _mm_add_ps(m9, m11);
257 m2 = _mm_hadd_ps(m0, m1);
258 m3 = _mm_hadd_ps(m2, m2);
259
260 _mm_store_ss(&y[2 * i + 0], m3);
261 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(0, 3, 2, 1));
262 _mm_store_ss(&y[2 * i + 1], m3);
263 }
264}
265
266/* 20-tap SSE complex-real convolution */
267void sse_conv_real20(const float *x, int x_len,
268 const float *h, int h_len,
269 float *y, int y_len,
Sylvain Munauta3934a12018-12-20 19:10:26 +0100270 int start, int len)
Philipp Maiere8ae9fc2017-03-20 12:08:42 +0100271{
272 /* See NOTE in sse_conv_real4() */
273
274 __m128 m0, m1, m2, m3, m4, m5, m6, m7;
275 __m128 m8, m9, m11, m12, m13, m14, m15;
276
277 const float *_x = &x[2 * (-(h_len - 1) + start)];
278
279 /* Load (aligned) filter taps */
280 m0 = _mm_load_ps(&h[0]);
281 m1 = _mm_load_ps(&h[4]);
282 m2 = _mm_load_ps(&h[8]);
283 m3 = _mm_load_ps(&h[12]);
284 m4 = _mm_load_ps(&h[16]);
285 m5 = _mm_load_ps(&h[20]);
286 m6 = _mm_load_ps(&h[24]);
287 m7 = _mm_load_ps(&h[28]);
288 m8 = _mm_load_ps(&h[32]);
289 m9 = _mm_load_ps(&h[36]);
290
291 m11 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
292 m12 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
293 m13 = _mm_shuffle_ps(m4, m5, _MM_SHUFFLE(0, 2, 0, 2));
294 m14 = _mm_shuffle_ps(m6, m7, _MM_SHUFFLE(0, 2, 0, 2));
295 m15 = _mm_shuffle_ps(m8, m9, _MM_SHUFFLE(0, 2, 0, 2));
296
297 for (int i = 0; i < len; i++) {
298 /* Multiply-accumulate first 12 taps */
299 m0 = _mm_loadu_ps(&_x[2 * i + 0]);
300 m1 = _mm_loadu_ps(&_x[2 * i + 4]);
301 m2 = _mm_loadu_ps(&_x[2 * i + 8]);
302 m3 = _mm_loadu_ps(&_x[2 * i + 12]);
303 m4 = _mm_loadu_ps(&_x[2 * i + 16]);
304 m5 = _mm_loadu_ps(&_x[2 * i + 20]);
305
306 m6 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
307 m7 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
308 m8 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
309 m9 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
310 m0 = _mm_shuffle_ps(m4, m5, _MM_SHUFFLE(0, 2, 0, 2));
311 m1 = _mm_shuffle_ps(m4, m5, _MM_SHUFFLE(1, 3, 1, 3));
312
313 m2 = _mm_mul_ps(m6, m11);
314 m3 = _mm_mul_ps(m7, m11);
315 m4 = _mm_mul_ps(m8, m12);
316 m5 = _mm_mul_ps(m9, m12);
317 m6 = _mm_mul_ps(m0, m13);
318 m7 = _mm_mul_ps(m1, m13);
319
320 m0 = _mm_add_ps(m2, m4);
321 m1 = _mm_add_ps(m3, m5);
322 m8 = _mm_add_ps(m0, m6);
323 m9 = _mm_add_ps(m1, m7);
324
325 /* Multiply-accumulate last 8 taps */
326 m0 = _mm_loadu_ps(&_x[2 * i + 24]);
327 m1 = _mm_loadu_ps(&_x[2 * i + 28]);
328 m2 = _mm_loadu_ps(&_x[2 * i + 32]);
329 m3 = _mm_loadu_ps(&_x[2 * i + 36]);
330
331 m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
332 m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
333 m6 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
334 m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
335
336 m0 = _mm_mul_ps(m4, m14);
337 m1 = _mm_mul_ps(m5, m14);
338 m2 = _mm_mul_ps(m6, m15);
339 m3 = _mm_mul_ps(m7, m15);
340
341 m4 = _mm_add_ps(m0, m2);
342 m5 = _mm_add_ps(m1, m3);
343
344 /* Final sum and store */
345 m0 = _mm_add_ps(m8, m4);
346 m1 = _mm_add_ps(m9, m5);
347 m2 = _mm_hadd_ps(m0, m1);
348 m3 = _mm_hadd_ps(m2, m2);
349
350 _mm_store_ss(&y[2 * i + 0], m3);
351 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(0, 3, 2, 1));
352 _mm_store_ss(&y[2 * i + 1], m3);
353 }
354}
355
356/* 4*N-tap SSE complex-real convolution */
357void sse_conv_real4n(const float *x, int x_len,
358 const float *h, int h_len,
359 float *y, int y_len,
Sylvain Munauta3934a12018-12-20 19:10:26 +0100360 int start, int len)
Philipp Maiere8ae9fc2017-03-20 12:08:42 +0100361{
362 /* See NOTE in sse_conv_real4() */
363
364 __m128 m0, m1, m2, m4, m5, m6, m7;
365
366 const float *_x = &x[2 * (-(h_len - 1) + start)];
367
368 for (int i = 0; i < len; i++) {
369 /* Zero */
370 m6 = _mm_setzero_ps();
371 m7 = _mm_setzero_ps();
372
373 for (int n = 0; n < h_len / 4; n++) {
374 /* Load (aligned) filter taps */
375 m0 = _mm_load_ps(&h[8 * n + 0]);
376 m1 = _mm_load_ps(&h[8 * n + 4]);
377 m2 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
378
379 /* Load (unaligned) input data */
380 m0 = _mm_loadu_ps(&_x[2 * i + 8 * n + 0]);
381 m1 = _mm_loadu_ps(&_x[2 * i + 8 * n + 4]);
382 m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
383 m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
384
385 /* Quad multiply */
386 m0 = _mm_mul_ps(m2, m4);
387 m1 = _mm_mul_ps(m2, m5);
388
389 /* Accumulate */
390 m6 = _mm_add_ps(m6, m0);
391 m7 = _mm_add_ps(m7, m1);
392 }
393
394 m0 = _mm_hadd_ps(m6, m7);
395 m0 = _mm_hadd_ps(m0, m0);
396
397 _mm_store_ss(&y[2 * i + 0], m0);
398 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(0, 3, 2, 1));
399 _mm_store_ss(&y[2 * i + 1], m0);
400 }
401}
402
403/* 4*N-tap SSE complex-complex convolution */
404void sse_conv_cmplx_4n(const float *x, int x_len,
405 const float *h, int h_len,
406 float *y, int y_len,
Sylvain Munauta3934a12018-12-20 19:10:26 +0100407 int start, int len)
Philipp Maiere8ae9fc2017-03-20 12:08:42 +0100408{
409 /* NOTE: The parameter list of this function has to match the parameter
410 * list of _base_convolve_complex() in convolve_base.c. This specific
411 * implementation, ignores some of the parameters of
Sylvain Munauta3934a12018-12-20 19:10:26 +0100412 * _base_convolve_complex(), which are: x_len, y_len. */
Philipp Maiere8ae9fc2017-03-20 12:08:42 +0100413
414 __m128 m0, m1, m2, m3, m4, m5, m6, m7;
415
416 const float *_x = &x[2 * (-(h_len - 1) + start)];
417
418 for (int i = 0; i < len; i++) {
419 /* Zero */
420 m6 = _mm_setzero_ps();
421 m7 = _mm_setzero_ps();
422
423 for (int n = 0; n < h_len / 4; n++) {
424 /* Load (aligned) filter taps */
425 m0 = _mm_load_ps(&h[8 * n + 0]);
426 m1 = _mm_load_ps(&h[8 * n + 4]);
427 m2 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
428 m3 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
429
430 /* Load (unaligned) input data */
431 m0 = _mm_loadu_ps(&_x[2 * i + 8 * n + 0]);
432 m1 = _mm_loadu_ps(&_x[2 * i + 8 * n + 4]);
433 m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
434 m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
435
436 /* Quad multiply */
437 m0 = _mm_mul_ps(m2, m4);
438 m1 = _mm_mul_ps(m3, m5);
439
440 m2 = _mm_mul_ps(m2, m5);
441 m3 = _mm_mul_ps(m3, m4);
442
443 /* Sum */
444 m0 = _mm_sub_ps(m0, m1);
445 m2 = _mm_add_ps(m2, m3);
446
447 /* Accumulate */
448 m6 = _mm_add_ps(m6, m0);
449 m7 = _mm_add_ps(m7, m2);
450 }
451
452 m0 = _mm_hadd_ps(m6, m7);
453 m0 = _mm_hadd_ps(m0, m0);
454
455 _mm_store_ss(&y[2 * i + 0], m0);
456 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(0, 3, 2, 1));
457 _mm_store_ss(&y[2 * i + 1], m0);
458 }
459}
460
461/* 8*N-tap SSE complex-complex convolution */
462void sse_conv_cmplx_8n(const float *x, int x_len,
463 const float *h, int h_len,
464 float *y, int y_len,
Sylvain Munauta3934a12018-12-20 19:10:26 +0100465 int start, int len)
Philipp Maiere8ae9fc2017-03-20 12:08:42 +0100466{
467 /* See NOTE in sse_conv_cmplx_4n() */
468
469 __m128 m0, m1, m2, m3, m4, m5, m6, m7;
470 __m128 m8, m9, m10, m11, m12, m13, m14, m15;
471
472 const float *_x = &x[2 * (-(h_len - 1) + start)];
473
474 for (int i = 0; i < len; i++) {
475 /* Zero */
476 m12 = _mm_setzero_ps();
477 m13 = _mm_setzero_ps();
478 m14 = _mm_setzero_ps();
479 m15 = _mm_setzero_ps();
480
481 for (int n = 0; n < h_len / 8; n++) {
482 /* Load (aligned) filter taps */
483 m0 = _mm_load_ps(&h[16 * n + 0]);
484 m1 = _mm_load_ps(&h[16 * n + 4]);
485 m2 = _mm_load_ps(&h[16 * n + 8]);
486 m3 = _mm_load_ps(&h[16 * n + 12]);
487
488 m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
489 m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
490 m6 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
491 m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
492
493 /* Load (unaligned) input data */
494 m0 = _mm_loadu_ps(&_x[2 * i + 16 * n + 0]);
495 m1 = _mm_loadu_ps(&_x[2 * i + 16 * n + 4]);
496 m2 = _mm_loadu_ps(&_x[2 * i + 16 * n + 8]);
497 m3 = _mm_loadu_ps(&_x[2 * i + 16 * n + 12]);
498
499 m8 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
500 m9 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
501 m10 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
502 m11 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
503
504 /* Quad multiply */
505 m0 = _mm_mul_ps(m4, m8);
506 m1 = _mm_mul_ps(m5, m9);
507 m2 = _mm_mul_ps(m6, m10);
508 m3 = _mm_mul_ps(m7, m11);
509
510 m4 = _mm_mul_ps(m4, m9);
511 m5 = _mm_mul_ps(m5, m8);
512 m6 = _mm_mul_ps(m6, m11);
513 m7 = _mm_mul_ps(m7, m10);
514
515 /* Sum */
516 m0 = _mm_sub_ps(m0, m1);
517 m2 = _mm_sub_ps(m2, m3);
518 m4 = _mm_add_ps(m4, m5);
519 m6 = _mm_add_ps(m6, m7);
520
521 /* Accumulate */
522 m12 = _mm_add_ps(m12, m0);
523 m13 = _mm_add_ps(m13, m2);
524 m14 = _mm_add_ps(m14, m4);
525 m15 = _mm_add_ps(m15, m6);
526 }
527
528 m0 = _mm_add_ps(m12, m13);
529 m1 = _mm_add_ps(m14, m15);
530 m2 = _mm_hadd_ps(m0, m1);
531 m2 = _mm_hadd_ps(m2, m2);
532
533 _mm_store_ss(&y[2 * i + 0], m2);
534 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 3, 2, 1));
535 _mm_store_ss(&y[2 * i + 1], m2);
536 }
537}
#endif