blob: 04923bcdcd300fcaa235aef9d9daf9d6ad413ebf [file] [log] [blame]
Thomas Tsou3eaae802013-08-20 19:31:14 -04001/*
2 * SSE Convolution
3 * Copyright (C) 2012, 2013 Thomas Tsou <tom@tsou.cc>
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2.1 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
14 *
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 */
19
20#include <malloc.h>
21#include <string.h>
22#include <stdio.h>
Thomas Tsou17bbb9b2013-10-30 21:24:40 -040023#include "convolve.h"
Thomas Tsou3eaae802013-08-20 19:31:14 -040024
25#ifdef HAVE_CONFIG_H
26#include "config.h"
27#endif
28
Thomas Tsou17bbb9b2013-10-30 21:24:40 -040029/* Forward declarations from base implementation */
Tom Tsouf147b172015-03-25 12:55:11 -070030int _base_convolve_real(const float *x, int x_len,
31 const float *h, int h_len,
Thomas Tsou17bbb9b2013-10-30 21:24:40 -040032 float *y, int y_len,
33 int start, int len,
34 int step, int offset);
35
Tom Tsouf147b172015-03-25 12:55:11 -070036int _base_convolve_complex(const float *x, int x_len,
37 const float *h, int h_len,
Thomas Tsou17bbb9b2013-10-30 21:24:40 -040038 float *y, int y_len,
39 int start, int len,
40 int step, int offset);
41
42int bounds_check(int x_len, int h_len, int y_len,
43 int start, int len, int step);
44
Thomas Tsou3eaae802013-08-20 19:31:14 -040045#ifdef HAVE_SSE3
46#include <xmmintrin.h>
47#include <pmmintrin.h>
48
49/* 4-tap SSE complex-real convolution */
Tom Tsouf147b172015-03-25 12:55:11 -070050static void sse_conv_real4(const float *restrict x,
51 const float *restrict h,
Thomas Tsou3eaae802013-08-20 19:31:14 -040052 float *restrict y,
53 int len)
54{
55 __m128 m0, m1, m2, m3, m4, m5, m6, m7;
56
57 /* Load (aligned) filter taps */
58 m0 = _mm_load_ps(&h[0]);
59 m1 = _mm_load_ps(&h[4]);
60 m7 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
61
62 for (int i = 0; i < len; i++) {
63 /* Load (unaligned) input data */
64 m0 = _mm_loadu_ps(&x[2 * i + 0]);
65 m1 = _mm_loadu_ps(&x[2 * i + 4]);
66 m2 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
67 m3 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
68
69 /* Quad multiply */
70 m4 = _mm_mul_ps(m2, m7);
71 m5 = _mm_mul_ps(m3, m7);
72
73 /* Sum and store */
74 m6 = _mm_hadd_ps(m4, m5);
75 m0 = _mm_hadd_ps(m6, m6);
76
77 _mm_store_ss(&y[2 * i + 0], m0);
78 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(0, 3, 2, 1));
79 _mm_store_ss(&y[2 * i + 1], m0);
80 }
81}
82
83/* 8-tap SSE complex-real convolution */
Tom Tsouf147b172015-03-25 12:55:11 -070084static void sse_conv_real8(const float *restrict x,
85 const float *restrict h,
Thomas Tsou3eaae802013-08-20 19:31:14 -040086 float *restrict y,
87 int len)
88{
89 __m128 m0, m1, m2, m3, m4, m5, m6, m7, m8, m9;
90
91 /* Load (aligned) filter taps */
92 m0 = _mm_load_ps(&h[0]);
93 m1 = _mm_load_ps(&h[4]);
94 m2 = _mm_load_ps(&h[8]);
95 m3 = _mm_load_ps(&h[12]);
96
97 m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
98 m5 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
99
100 for (int i = 0; i < len; i++) {
101 /* Load (unaligned) input data */
102 m0 = _mm_loadu_ps(&x[2 * i + 0]);
103 m1 = _mm_loadu_ps(&x[2 * i + 4]);
104 m2 = _mm_loadu_ps(&x[2 * i + 8]);
105 m3 = _mm_loadu_ps(&x[2 * i + 12]);
106
107 m6 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
108 m7 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
109 m8 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
110 m9 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
111
112 /* Quad multiply */
113 m6 = _mm_mul_ps(m6, m4);
114 m7 = _mm_mul_ps(m7, m4);
115 m8 = _mm_mul_ps(m8, m5);
116 m9 = _mm_mul_ps(m9, m5);
117
118 /* Sum and store */
119 m6 = _mm_add_ps(m6, m8);
120 m7 = _mm_add_ps(m7, m9);
121 m6 = _mm_hadd_ps(m6, m7);
122 m6 = _mm_hadd_ps(m6, m6);
123
124 _mm_store_ss(&y[2 * i + 0], m6);
125 m6 = _mm_shuffle_ps(m6, m6, _MM_SHUFFLE(0, 3, 2, 1));
126 _mm_store_ss(&y[2 * i + 1], m6);
127 }
128}
129
130/* 12-tap SSE complex-real convolution */
Tom Tsouf147b172015-03-25 12:55:11 -0700131static void sse_conv_real12(const float *restrict x,
132 const float *restrict h,
Thomas Tsou3eaae802013-08-20 19:31:14 -0400133 float *restrict y,
134 int len)
135{
136 __m128 m0, m1, m2, m3, m4, m5, m6, m7;
137 __m128 m8, m9, m10, m11, m12, m13, m14;
138
139 /* Load (aligned) filter taps */
140 m0 = _mm_load_ps(&h[0]);
141 m1 = _mm_load_ps(&h[4]);
142 m2 = _mm_load_ps(&h[8]);
143 m3 = _mm_load_ps(&h[12]);
144 m4 = _mm_load_ps(&h[16]);
145 m5 = _mm_load_ps(&h[20]);
146
147 m12 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
148 m13 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
149 m14 = _mm_shuffle_ps(m4, m5, _MM_SHUFFLE(0, 2, 0, 2));
150
151 for (int i = 0; i < len; i++) {
152 /* Load (unaligned) input data */
153 m0 = _mm_loadu_ps(&x[2 * i + 0]);
154 m1 = _mm_loadu_ps(&x[2 * i + 4]);
155 m2 = _mm_loadu_ps(&x[2 * i + 8]);
156 m3 = _mm_loadu_ps(&x[2 * i + 12]);
157
158 m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
159 m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
160 m6 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
161 m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
162
163 m0 = _mm_loadu_ps(&x[2 * i + 16]);
164 m1 = _mm_loadu_ps(&x[2 * i + 20]);
165
166 m8 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
167 m9 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
168
169 /* Quad multiply */
170 m0 = _mm_mul_ps(m4, m12);
171 m1 = _mm_mul_ps(m5, m12);
172 m2 = _mm_mul_ps(m6, m13);
173 m3 = _mm_mul_ps(m7, m13);
174 m4 = _mm_mul_ps(m8, m14);
175 m5 = _mm_mul_ps(m9, m14);
176
177 /* Sum and store */
178 m8 = _mm_add_ps(m0, m2);
179 m9 = _mm_add_ps(m1, m3);
180 m10 = _mm_add_ps(m8, m4);
181 m11 = _mm_add_ps(m9, m5);
182
183 m2 = _mm_hadd_ps(m10, m11);
184 m3 = _mm_hadd_ps(m2, m2);
185
186 _mm_store_ss(&y[2 * i + 0], m3);
187 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(0, 3, 2, 1));
188 _mm_store_ss(&y[2 * i + 1], m3);
189 }
190}
191
192/* 16-tap SSE complex-real convolution */
Tom Tsouf147b172015-03-25 12:55:11 -0700193static void sse_conv_real16(const float *restrict x,
194 const float *restrict h,
Thomas Tsou3eaae802013-08-20 19:31:14 -0400195 float *restrict y,
196 int len)
197{
198 __m128 m0, m1, m2, m3, m4, m5, m6, m7;
199 __m128 m8, m9, m10, m11, m12, m13, m14, m15;
200
201 /* Load (aligned) filter taps */
202 m0 = _mm_load_ps(&h[0]);
203 m1 = _mm_load_ps(&h[4]);
204 m2 = _mm_load_ps(&h[8]);
205 m3 = _mm_load_ps(&h[12]);
206
207 m4 = _mm_load_ps(&h[16]);
208 m5 = _mm_load_ps(&h[20]);
209 m6 = _mm_load_ps(&h[24]);
210 m7 = _mm_load_ps(&h[28]);
211
212 m12 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
213 m13 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
214 m14 = _mm_shuffle_ps(m4, m5, _MM_SHUFFLE(0, 2, 0, 2));
215 m15 = _mm_shuffle_ps(m6, m7, _MM_SHUFFLE(0, 2, 0, 2));
216
217 for (int i = 0; i < len; i++) {
218 /* Load (unaligned) input data */
219 m0 = _mm_loadu_ps(&x[2 * i + 0]);
220 m1 = _mm_loadu_ps(&x[2 * i + 4]);
221 m2 = _mm_loadu_ps(&x[2 * i + 8]);
222 m3 = _mm_loadu_ps(&x[2 * i + 12]);
223
224 m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
225 m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
226 m6 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
227 m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
228
229 m0 = _mm_loadu_ps(&x[2 * i + 16]);
230 m1 = _mm_loadu_ps(&x[2 * i + 20]);
231 m2 = _mm_loadu_ps(&x[2 * i + 24]);
232 m3 = _mm_loadu_ps(&x[2 * i + 28]);
233
234 m8 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
235 m9 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
236 m10 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
237 m11 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
238
239 /* Quad multiply */
240 m0 = _mm_mul_ps(m4, m12);
241 m1 = _mm_mul_ps(m5, m12);
242 m2 = _mm_mul_ps(m6, m13);
243 m3 = _mm_mul_ps(m7, m13);
244
245 m4 = _mm_mul_ps(m8, m14);
246 m5 = _mm_mul_ps(m9, m14);
247 m6 = _mm_mul_ps(m10, m15);
248 m7 = _mm_mul_ps(m11, m15);
249
250 /* Sum and store */
251 m8 = _mm_add_ps(m0, m2);
252 m9 = _mm_add_ps(m1, m3);
253 m10 = _mm_add_ps(m4, m6);
254 m11 = _mm_add_ps(m5, m7);
255
256 m0 = _mm_add_ps(m8, m10);
257 m1 = _mm_add_ps(m9, m11);
258 m2 = _mm_hadd_ps(m0, m1);
259 m3 = _mm_hadd_ps(m2, m2);
260
261 _mm_store_ss(&y[2 * i + 0], m3);
262 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(0, 3, 2, 1));
263 _mm_store_ss(&y[2 * i + 1], m3);
264 }
265}
266
267/* 20-tap SSE complex-real convolution */
Tom Tsouf147b172015-03-25 12:55:11 -0700268static void sse_conv_real20(const float *restrict x,
269 const float *restrict h,
Thomas Tsou3eaae802013-08-20 19:31:14 -0400270 float *restrict y,
271 int len)
272{
273 __m128 m0, m1, m2, m3, m4, m5, m6, m7;
274 __m128 m8, m9, m11, m12, m13, m14, m15;
275
276 /* Load (aligned) filter taps */
277 m0 = _mm_load_ps(&h[0]);
278 m1 = _mm_load_ps(&h[4]);
279 m2 = _mm_load_ps(&h[8]);
280 m3 = _mm_load_ps(&h[12]);
281 m4 = _mm_load_ps(&h[16]);
282 m5 = _mm_load_ps(&h[20]);
283 m6 = _mm_load_ps(&h[24]);
284 m7 = _mm_load_ps(&h[28]);
285 m8 = _mm_load_ps(&h[32]);
286 m9 = _mm_load_ps(&h[36]);
287
288 m11 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
289 m12 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
290 m13 = _mm_shuffle_ps(m4, m5, _MM_SHUFFLE(0, 2, 0, 2));
291 m14 = _mm_shuffle_ps(m6, m7, _MM_SHUFFLE(0, 2, 0, 2));
292 m15 = _mm_shuffle_ps(m8, m9, _MM_SHUFFLE(0, 2, 0, 2));
293
294 for (int i = 0; i < len; i++) {
295 /* Multiply-accumulate first 12 taps */
296 m0 = _mm_loadu_ps(&x[2 * i + 0]);
297 m1 = _mm_loadu_ps(&x[2 * i + 4]);
298 m2 = _mm_loadu_ps(&x[2 * i + 8]);
299 m3 = _mm_loadu_ps(&x[2 * i + 12]);
300 m4 = _mm_loadu_ps(&x[2 * i + 16]);
301 m5 = _mm_loadu_ps(&x[2 * i + 20]);
302
303 m6 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
304 m7 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
305 m8 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
306 m9 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
307 m0 = _mm_shuffle_ps(m4, m5, _MM_SHUFFLE(0, 2, 0, 2));
308 m1 = _mm_shuffle_ps(m4, m5, _MM_SHUFFLE(1, 3, 1, 3));
309
310 m2 = _mm_mul_ps(m6, m11);
311 m3 = _mm_mul_ps(m7, m11);
312 m4 = _mm_mul_ps(m8, m12);
313 m5 = _mm_mul_ps(m9, m12);
314 m6 = _mm_mul_ps(m0, m13);
315 m7 = _mm_mul_ps(m1, m13);
316
317 m0 = _mm_add_ps(m2, m4);
318 m1 = _mm_add_ps(m3, m5);
319 m8 = _mm_add_ps(m0, m6);
320 m9 = _mm_add_ps(m1, m7);
321
322 /* Multiply-accumulate last 8 taps */
323 m0 = _mm_loadu_ps(&x[2 * i + 24]);
324 m1 = _mm_loadu_ps(&x[2 * i + 28]);
325 m2 = _mm_loadu_ps(&x[2 * i + 32]);
326 m3 = _mm_loadu_ps(&x[2 * i + 36]);
327
328 m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
329 m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
330 m6 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
331 m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
332
333 m0 = _mm_mul_ps(m4, m14);
334 m1 = _mm_mul_ps(m5, m14);
335 m2 = _mm_mul_ps(m6, m15);
336 m3 = _mm_mul_ps(m7, m15);
337
338 m4 = _mm_add_ps(m0, m2);
339 m5 = _mm_add_ps(m1, m3);
340
341 /* Final sum and store */
342 m0 = _mm_add_ps(m8, m4);
343 m1 = _mm_add_ps(m9, m5);
344 m2 = _mm_hadd_ps(m0, m1);
345 m3 = _mm_hadd_ps(m2, m2);
346
347 _mm_store_ss(&y[2 * i + 0], m3);
348 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(0, 3, 2, 1));
349 _mm_store_ss(&y[2 * i + 1], m3);
350 }
351}
352
353/* 4*N-tap SSE complex-real convolution */
Tom Tsouf147b172015-03-25 12:55:11 -0700354static void sse_conv_real4n(const float *x,
355 const float *h,
356 float *y,
357 int h_len, int len)
Thomas Tsou3eaae802013-08-20 19:31:14 -0400358{
359 __m128 m0, m1, m2, m4, m5, m6, m7;
360
361 for (int i = 0; i < len; i++) {
362 /* Zero */
363 m6 = _mm_setzero_ps();
364 m7 = _mm_setzero_ps();
365
366 for (int n = 0; n < h_len / 4; n++) {
367 /* Load (aligned) filter taps */
368 m0 = _mm_load_ps(&h[8 * n + 0]);
369 m1 = _mm_load_ps(&h[8 * n + 4]);
370 m2 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
371
372 /* Load (unaligned) input data */
373 m0 = _mm_loadu_ps(&x[2 * i + 8 * n + 0]);
374 m1 = _mm_loadu_ps(&x[2 * i + 8 * n + 4]);
375 m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
376 m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
377
378 /* Quad multiply */
379 m0 = _mm_mul_ps(m2, m4);
380 m1 = _mm_mul_ps(m2, m5);
381
382 /* Accumulate */
383 m6 = _mm_add_ps(m6, m0);
384 m7 = _mm_add_ps(m7, m1);
385 }
386
387 m0 = _mm_hadd_ps(m6, m7);
388 m0 = _mm_hadd_ps(m0, m0);
389
390 _mm_store_ss(&y[2 * i + 0], m0);
391 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(0, 3, 2, 1));
392 _mm_store_ss(&y[2 * i + 1], m0);
393 }
394}
395
396/* 4*N-tap SSE complex-complex convolution */
Tom Tsouf147b172015-03-25 12:55:11 -0700397static void sse_conv_cmplx_4n(const float *x,
398 const float *h,
399 float *y,
400 int h_len, int len)
Thomas Tsou3eaae802013-08-20 19:31:14 -0400401{
402 __m128 m0, m1, m2, m3, m4, m5, m6, m7;
403
404 for (int i = 0; i < len; i++) {
405 /* Zero */
406 m6 = _mm_setzero_ps();
407 m7 = _mm_setzero_ps();
408
409 for (int n = 0; n < h_len / 4; n++) {
410 /* Load (aligned) filter taps */
411 m0 = _mm_load_ps(&h[8 * n + 0]);
412 m1 = _mm_load_ps(&h[8 * n + 4]);
413 m2 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
414 m3 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
415
416 /* Load (unaligned) input data */
417 m0 = _mm_loadu_ps(&x[2 * i + 8 * n + 0]);
418 m1 = _mm_loadu_ps(&x[2 * i + 8 * n + 4]);
419 m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
420 m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
421
422 /* Quad multiply */
423 m0 = _mm_mul_ps(m2, m4);
424 m1 = _mm_mul_ps(m3, m5);
425
426 m2 = _mm_mul_ps(m2, m5);
427 m3 = _mm_mul_ps(m3, m4);
428
429 /* Sum */
430 m0 = _mm_sub_ps(m0, m1);
431 m2 = _mm_add_ps(m2, m3);
432
433 /* Accumulate */
434 m6 = _mm_add_ps(m6, m0);
435 m7 = _mm_add_ps(m7, m2);
436 }
437
438 m0 = _mm_hadd_ps(m6, m7);
439 m0 = _mm_hadd_ps(m0, m0);
440
441 _mm_store_ss(&y[2 * i + 0], m0);
442 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(0, 3, 2, 1));
443 _mm_store_ss(&y[2 * i + 1], m0);
444 }
445}
446
447/* 8*N-tap SSE complex-complex convolution */
Tom Tsouf147b172015-03-25 12:55:11 -0700448static void sse_conv_cmplx_8n(const float *x,
449 const float *h,
450 float *y,
451 int h_len, int len)
Thomas Tsou3eaae802013-08-20 19:31:14 -0400452{
453 __m128 m0, m1, m2, m3, m4, m5, m6, m7;
454 __m128 m8, m9, m10, m11, m12, m13, m14, m15;
455
456 for (int i = 0; i < len; i++) {
457 /* Zero */
458 m12 = _mm_setzero_ps();
459 m13 = _mm_setzero_ps();
460 m14 = _mm_setzero_ps();
461 m15 = _mm_setzero_ps();
462
463 for (int n = 0; n < h_len / 8; n++) {
464 /* Load (aligned) filter taps */
465 m0 = _mm_load_ps(&h[16 * n + 0]);
466 m1 = _mm_load_ps(&h[16 * n + 4]);
467 m2 = _mm_load_ps(&h[16 * n + 8]);
468 m3 = _mm_load_ps(&h[16 * n + 12]);
469
470 m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
471 m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
Thomas Tsou187225c2014-05-01 02:15:51 -0400472 m6 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
Thomas Tsou3eaae802013-08-20 19:31:14 -0400473 m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
474
475 /* Load (unaligned) input data */
476 m0 = _mm_loadu_ps(&x[2 * i + 16 * n + 0]);
477 m1 = _mm_loadu_ps(&x[2 * i + 16 * n + 4]);
478 m2 = _mm_loadu_ps(&x[2 * i + 16 * n + 8]);
479 m3 = _mm_loadu_ps(&x[2 * i + 16 * n + 12]);
480
481 m8 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
482 m9 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
483 m10 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
484 m11 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
485
486 /* Quad multiply */
487 m0 = _mm_mul_ps(m4, m8);
488 m1 = _mm_mul_ps(m5, m9);
489 m2 = _mm_mul_ps(m6, m10);
490 m3 = _mm_mul_ps(m7, m11);
491
492 m4 = _mm_mul_ps(m4, m9);
493 m5 = _mm_mul_ps(m5, m8);
494 m6 = _mm_mul_ps(m6, m11);
495 m7 = _mm_mul_ps(m7, m10);
496
497 /* Sum */
498 m0 = _mm_sub_ps(m0, m1);
499 m2 = _mm_sub_ps(m2, m3);
500 m4 = _mm_add_ps(m4, m5);
501 m6 = _mm_add_ps(m6, m7);
502
503 /* Accumulate */
504 m12 = _mm_add_ps(m12, m0);
505 m13 = _mm_add_ps(m13, m2);
506 m14 = _mm_add_ps(m14, m4);
507 m15 = _mm_add_ps(m15, m6);
508 }
509
510 m0 = _mm_add_ps(m12, m13);
511 m1 = _mm_add_ps(m14, m15);
512 m2 = _mm_hadd_ps(m0, m1);
513 m2 = _mm_hadd_ps(m2, m2);
514
515 _mm_store_ss(&y[2 * i + 0], m2);
516 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 3, 2, 1));
517 _mm_store_ss(&y[2 * i + 1], m2);
518 }
519}
520#endif
521
Thomas Tsou3eaae802013-08-20 19:31:14 -0400522/* API: Aligned complex-real */
Tom Tsouf147b172015-03-25 12:55:11 -0700523int convolve_real(const float *x, int x_len,
524 const float *h, int h_len,
Thomas Tsou3eaae802013-08-20 19:31:14 -0400525 float *y, int y_len,
526 int start, int len,
527 int step, int offset)
528{
Tom Tsouf147b172015-03-25 12:55:11 -0700529 void (*conv_func)(const float *, const float *,
530 float *, int) = NULL;
531 void (*conv_func_n)(const float *, const float *,
532 float *, int, int) = NULL;
Thomas Tsou3eaae802013-08-20 19:31:14 -0400533
534 if (bounds_check(x_len, h_len, y_len, start, len, step) < 0)
535 return -1;
536
537 memset(y, 0, len * 2 * sizeof(float));
538
539#ifdef HAVE_SSE3
540 if (step <= 4) {
541 switch (h_len) {
542 case 4:
543 conv_func = sse_conv_real4;
544 break;
545 case 8:
546 conv_func = sse_conv_real8;
547 break;
548 case 12:
549 conv_func = sse_conv_real12;
550 break;
551 case 16:
552 conv_func = sse_conv_real16;
553 break;
554 case 20:
555 conv_func = sse_conv_real20;
556 break;
557 default:
558 if (!(h_len % 4))
559 conv_func_n = sse_conv_real4n;
560 }
561 }
562#endif
563 if (conv_func) {
564 conv_func(&x[2 * (-(h_len - 1) + start)],
565 h, y, len);
566 } else if (conv_func_n) {
567 conv_func_n(&x[2 * (-(h_len - 1) + start)],
568 h, y, h_len, len);
569 } else {
570 _base_convolve_real(x, x_len,
571 h, h_len,
572 y, y_len,
573 start, len, step, offset);
574 }
575
576 return len;
577}
578
579/* API: Aligned complex-complex */
Tom Tsouf147b172015-03-25 12:55:11 -0700580int convolve_complex(const float *x, int x_len,
581 const float *h, int h_len,
Thomas Tsou3eaae802013-08-20 19:31:14 -0400582 float *y, int y_len,
583 int start, int len,
584 int step, int offset)
585{
Tom Tsouf147b172015-03-25 12:55:11 -0700586 void (*conv_func)(const float *, const float *,
587 float *, int, int) = NULL;
Thomas Tsou3eaae802013-08-20 19:31:14 -0400588
589 if (bounds_check(x_len, h_len, y_len, start, len, step) < 0)
590 return -1;
591
592 memset(y, 0, len * 2 * sizeof(float));
593
594#ifdef HAVE_SSE3
595 if (step <= 4) {
596 if (!(h_len % 8))
597 conv_func = sse_conv_cmplx_8n;
598 else if (!(h_len % 4))
599 conv_func = sse_conv_cmplx_4n;
600 }
601#endif
602 if (conv_func) {
603 conv_func(&x[2 * (-(h_len - 1) + start)],
604 h, y, h_len, len);
605 } else {
606 _base_convolve_complex(x, x_len,
607 h, h_len,
608 y, y_len,
609 start, len, step, offset);
610 }
611
612 return len;
613}