/*
2 * SSE Convolution
3 * Copyright (C) 2012, 2013 Thomas Tsou <tom@tsou.cc>
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2.1 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
14 *
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 */
19
#include <malloc.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
23
24#ifdef HAVE_CONFIG_H
25#include "config.h"
26#endif
27
28#ifdef HAVE_SSE3
29#include <xmmintrin.h>
30#include <pmmintrin.h>
31
32/* 4-tap SSE complex-real convolution */
33static void sse_conv_real4(float *restrict x,
34 float *restrict h,
35 float *restrict y,
36 int len)
37{
38 __m128 m0, m1, m2, m3, m4, m5, m6, m7;
39
40 /* Load (aligned) filter taps */
41 m0 = _mm_load_ps(&h[0]);
42 m1 = _mm_load_ps(&h[4]);
43 m7 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
44
45 for (int i = 0; i < len; i++) {
46 /* Load (unaligned) input data */
47 m0 = _mm_loadu_ps(&x[2 * i + 0]);
48 m1 = _mm_loadu_ps(&x[2 * i + 4]);
49 m2 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
50 m3 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
51
52 /* Quad multiply */
53 m4 = _mm_mul_ps(m2, m7);
54 m5 = _mm_mul_ps(m3, m7);
55
56 /* Sum and store */
57 m6 = _mm_hadd_ps(m4, m5);
58 m0 = _mm_hadd_ps(m6, m6);
59
60 _mm_store_ss(&y[2 * i + 0], m0);
61 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(0, 3, 2, 1));
62 _mm_store_ss(&y[2 * i + 1], m0);
63 }
64}
65
66/* 8-tap SSE complex-real convolution */
67static void sse_conv_real8(float *restrict x,
68 float *restrict h,
69 float *restrict y,
70 int len)
71{
72 __m128 m0, m1, m2, m3, m4, m5, m6, m7, m8, m9;
73
74 /* Load (aligned) filter taps */
75 m0 = _mm_load_ps(&h[0]);
76 m1 = _mm_load_ps(&h[4]);
77 m2 = _mm_load_ps(&h[8]);
78 m3 = _mm_load_ps(&h[12]);
79
80 m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
81 m5 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
82
83 for (int i = 0; i < len; i++) {
84 /* Load (unaligned) input data */
85 m0 = _mm_loadu_ps(&x[2 * i + 0]);
86 m1 = _mm_loadu_ps(&x[2 * i + 4]);
87 m2 = _mm_loadu_ps(&x[2 * i + 8]);
88 m3 = _mm_loadu_ps(&x[2 * i + 12]);
89
90 m6 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
91 m7 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
92 m8 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
93 m9 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
94
95 /* Quad multiply */
96 m6 = _mm_mul_ps(m6, m4);
97 m7 = _mm_mul_ps(m7, m4);
98 m8 = _mm_mul_ps(m8, m5);
99 m9 = _mm_mul_ps(m9, m5);
100
101 /* Sum and store */
102 m6 = _mm_add_ps(m6, m8);
103 m7 = _mm_add_ps(m7, m9);
104 m6 = _mm_hadd_ps(m6, m7);
105 m6 = _mm_hadd_ps(m6, m6);
106
107 _mm_store_ss(&y[2 * i + 0], m6);
108 m6 = _mm_shuffle_ps(m6, m6, _MM_SHUFFLE(0, 3, 2, 1));
109 _mm_store_ss(&y[2 * i + 1], m6);
110 }
111}
112
113/* 12-tap SSE complex-real convolution */
114static void sse_conv_real12(float *restrict x,
115 float *restrict h,
116 float *restrict y,
117 int len)
118{
119 __m128 m0, m1, m2, m3, m4, m5, m6, m7;
120 __m128 m8, m9, m10, m11, m12, m13, m14;
121
122 /* Load (aligned) filter taps */
123 m0 = _mm_load_ps(&h[0]);
124 m1 = _mm_load_ps(&h[4]);
125 m2 = _mm_load_ps(&h[8]);
126 m3 = _mm_load_ps(&h[12]);
127 m4 = _mm_load_ps(&h[16]);
128 m5 = _mm_load_ps(&h[20]);
129
130 m12 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
131 m13 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
132 m14 = _mm_shuffle_ps(m4, m5, _MM_SHUFFLE(0, 2, 0, 2));
133
134 for (int i = 0; i < len; i++) {
135 /* Load (unaligned) input data */
136 m0 = _mm_loadu_ps(&x[2 * i + 0]);
137 m1 = _mm_loadu_ps(&x[2 * i + 4]);
138 m2 = _mm_loadu_ps(&x[2 * i + 8]);
139 m3 = _mm_loadu_ps(&x[2 * i + 12]);
140
141 m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
142 m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
143 m6 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
144 m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
145
146 m0 = _mm_loadu_ps(&x[2 * i + 16]);
147 m1 = _mm_loadu_ps(&x[2 * i + 20]);
148
149 m8 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
150 m9 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
151
152 /* Quad multiply */
153 m0 = _mm_mul_ps(m4, m12);
154 m1 = _mm_mul_ps(m5, m12);
155 m2 = _mm_mul_ps(m6, m13);
156 m3 = _mm_mul_ps(m7, m13);
157 m4 = _mm_mul_ps(m8, m14);
158 m5 = _mm_mul_ps(m9, m14);
159
160 /* Sum and store */
161 m8 = _mm_add_ps(m0, m2);
162 m9 = _mm_add_ps(m1, m3);
163 m10 = _mm_add_ps(m8, m4);
164 m11 = _mm_add_ps(m9, m5);
165
166 m2 = _mm_hadd_ps(m10, m11);
167 m3 = _mm_hadd_ps(m2, m2);
168
169 _mm_store_ss(&y[2 * i + 0], m3);
170 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(0, 3, 2, 1));
171 _mm_store_ss(&y[2 * i + 1], m3);
172 }
173}
174
175/* 16-tap SSE complex-real convolution */
176static void sse_conv_real16(float *restrict x,
177 float *restrict h,
178 float *restrict y,
179 int len)
180{
181 __m128 m0, m1, m2, m3, m4, m5, m6, m7;
182 __m128 m8, m9, m10, m11, m12, m13, m14, m15;
183
184 /* Load (aligned) filter taps */
185 m0 = _mm_load_ps(&h[0]);
186 m1 = _mm_load_ps(&h[4]);
187 m2 = _mm_load_ps(&h[8]);
188 m3 = _mm_load_ps(&h[12]);
189
190 m4 = _mm_load_ps(&h[16]);
191 m5 = _mm_load_ps(&h[20]);
192 m6 = _mm_load_ps(&h[24]);
193 m7 = _mm_load_ps(&h[28]);
194
195 m12 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
196 m13 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
197 m14 = _mm_shuffle_ps(m4, m5, _MM_SHUFFLE(0, 2, 0, 2));
198 m15 = _mm_shuffle_ps(m6, m7, _MM_SHUFFLE(0, 2, 0, 2));
199
200 for (int i = 0; i < len; i++) {
201 /* Load (unaligned) input data */
202 m0 = _mm_loadu_ps(&x[2 * i + 0]);
203 m1 = _mm_loadu_ps(&x[2 * i + 4]);
204 m2 = _mm_loadu_ps(&x[2 * i + 8]);
205 m3 = _mm_loadu_ps(&x[2 * i + 12]);
206
207 m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
208 m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
209 m6 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
210 m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
211
212 m0 = _mm_loadu_ps(&x[2 * i + 16]);
213 m1 = _mm_loadu_ps(&x[2 * i + 20]);
214 m2 = _mm_loadu_ps(&x[2 * i + 24]);
215 m3 = _mm_loadu_ps(&x[2 * i + 28]);
216
217 m8 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
218 m9 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
219 m10 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
220 m11 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
221
222 /* Quad multiply */
223 m0 = _mm_mul_ps(m4, m12);
224 m1 = _mm_mul_ps(m5, m12);
225 m2 = _mm_mul_ps(m6, m13);
226 m3 = _mm_mul_ps(m7, m13);
227
228 m4 = _mm_mul_ps(m8, m14);
229 m5 = _mm_mul_ps(m9, m14);
230 m6 = _mm_mul_ps(m10, m15);
231 m7 = _mm_mul_ps(m11, m15);
232
233 /* Sum and store */
234 m8 = _mm_add_ps(m0, m2);
235 m9 = _mm_add_ps(m1, m3);
236 m10 = _mm_add_ps(m4, m6);
237 m11 = _mm_add_ps(m5, m7);
238
239 m0 = _mm_add_ps(m8, m10);
240 m1 = _mm_add_ps(m9, m11);
241 m2 = _mm_hadd_ps(m0, m1);
242 m3 = _mm_hadd_ps(m2, m2);
243
244 _mm_store_ss(&y[2 * i + 0], m3);
245 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(0, 3, 2, 1));
246 _mm_store_ss(&y[2 * i + 1], m3);
247 }
248}
249
250/* 20-tap SSE complex-real convolution */
251static void sse_conv_real20(float *restrict x,
252 float *restrict h,
253 float *restrict y,
254 int len)
255{
256 __m128 m0, m1, m2, m3, m4, m5, m6, m7;
257 __m128 m8, m9, m11, m12, m13, m14, m15;
258
259 /* Load (aligned) filter taps */
260 m0 = _mm_load_ps(&h[0]);
261 m1 = _mm_load_ps(&h[4]);
262 m2 = _mm_load_ps(&h[8]);
263 m3 = _mm_load_ps(&h[12]);
264 m4 = _mm_load_ps(&h[16]);
265 m5 = _mm_load_ps(&h[20]);
266 m6 = _mm_load_ps(&h[24]);
267 m7 = _mm_load_ps(&h[28]);
268 m8 = _mm_load_ps(&h[32]);
269 m9 = _mm_load_ps(&h[36]);
270
271 m11 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
272 m12 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
273 m13 = _mm_shuffle_ps(m4, m5, _MM_SHUFFLE(0, 2, 0, 2));
274 m14 = _mm_shuffle_ps(m6, m7, _MM_SHUFFLE(0, 2, 0, 2));
275 m15 = _mm_shuffle_ps(m8, m9, _MM_SHUFFLE(0, 2, 0, 2));
276
277 for (int i = 0; i < len; i++) {
278 /* Multiply-accumulate first 12 taps */
279 m0 = _mm_loadu_ps(&x[2 * i + 0]);
280 m1 = _mm_loadu_ps(&x[2 * i + 4]);
281 m2 = _mm_loadu_ps(&x[2 * i + 8]);
282 m3 = _mm_loadu_ps(&x[2 * i + 12]);
283 m4 = _mm_loadu_ps(&x[2 * i + 16]);
284 m5 = _mm_loadu_ps(&x[2 * i + 20]);
285
286 m6 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
287 m7 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
288 m8 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
289 m9 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
290 m0 = _mm_shuffle_ps(m4, m5, _MM_SHUFFLE(0, 2, 0, 2));
291 m1 = _mm_shuffle_ps(m4, m5, _MM_SHUFFLE(1, 3, 1, 3));
292
293 m2 = _mm_mul_ps(m6, m11);
294 m3 = _mm_mul_ps(m7, m11);
295 m4 = _mm_mul_ps(m8, m12);
296 m5 = _mm_mul_ps(m9, m12);
297 m6 = _mm_mul_ps(m0, m13);
298 m7 = _mm_mul_ps(m1, m13);
299
300 m0 = _mm_add_ps(m2, m4);
301 m1 = _mm_add_ps(m3, m5);
302 m8 = _mm_add_ps(m0, m6);
303 m9 = _mm_add_ps(m1, m7);
304
305 /* Multiply-accumulate last 8 taps */
306 m0 = _mm_loadu_ps(&x[2 * i + 24]);
307 m1 = _mm_loadu_ps(&x[2 * i + 28]);
308 m2 = _mm_loadu_ps(&x[2 * i + 32]);
309 m3 = _mm_loadu_ps(&x[2 * i + 36]);
310
311 m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
312 m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
313 m6 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
314 m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
315
316 m0 = _mm_mul_ps(m4, m14);
317 m1 = _mm_mul_ps(m5, m14);
318 m2 = _mm_mul_ps(m6, m15);
319 m3 = _mm_mul_ps(m7, m15);
320
321 m4 = _mm_add_ps(m0, m2);
322 m5 = _mm_add_ps(m1, m3);
323
324 /* Final sum and store */
325 m0 = _mm_add_ps(m8, m4);
326 m1 = _mm_add_ps(m9, m5);
327 m2 = _mm_hadd_ps(m0, m1);
328 m3 = _mm_hadd_ps(m2, m2);
329
330 _mm_store_ss(&y[2 * i + 0], m3);
331 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(0, 3, 2, 1));
332 _mm_store_ss(&y[2 * i + 1], m3);
333 }
334}
335
336/* 4*N-tap SSE complex-real convolution */
337static void sse_conv_real4n(float *x, float *h, float *y, int h_len, int len)
338{
339 __m128 m0, m1, m2, m4, m5, m6, m7;
340
341 for (int i = 0; i < len; i++) {
342 /* Zero */
343 m6 = _mm_setzero_ps();
344 m7 = _mm_setzero_ps();
345
346 for (int n = 0; n < h_len / 4; n++) {
347 /* Load (aligned) filter taps */
348 m0 = _mm_load_ps(&h[8 * n + 0]);
349 m1 = _mm_load_ps(&h[8 * n + 4]);
350 m2 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
351
352 /* Load (unaligned) input data */
353 m0 = _mm_loadu_ps(&x[2 * i + 8 * n + 0]);
354 m1 = _mm_loadu_ps(&x[2 * i + 8 * n + 4]);
355 m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
356 m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
357
358 /* Quad multiply */
359 m0 = _mm_mul_ps(m2, m4);
360 m1 = _mm_mul_ps(m2, m5);
361
362 /* Accumulate */
363 m6 = _mm_add_ps(m6, m0);
364 m7 = _mm_add_ps(m7, m1);
365 }
366
367 m0 = _mm_hadd_ps(m6, m7);
368 m0 = _mm_hadd_ps(m0, m0);
369
370 _mm_store_ss(&y[2 * i + 0], m0);
371 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(0, 3, 2, 1));
372 _mm_store_ss(&y[2 * i + 1], m0);
373 }
374}
375
376/* 4*N-tap SSE complex-complex convolution */
377static void sse_conv_cmplx_4n(float *x, float *h, float *y, int h_len, int len)
378{
379 __m128 m0, m1, m2, m3, m4, m5, m6, m7;
380
381 for (int i = 0; i < len; i++) {
382 /* Zero */
383 m6 = _mm_setzero_ps();
384 m7 = _mm_setzero_ps();
385
386 for (int n = 0; n < h_len / 4; n++) {
387 /* Load (aligned) filter taps */
388 m0 = _mm_load_ps(&h[8 * n + 0]);
389 m1 = _mm_load_ps(&h[8 * n + 4]);
390 m2 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
391 m3 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
392
393 /* Load (unaligned) input data */
394 m0 = _mm_loadu_ps(&x[2 * i + 8 * n + 0]);
395 m1 = _mm_loadu_ps(&x[2 * i + 8 * n + 4]);
396 m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
397 m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
398
399 /* Quad multiply */
400 m0 = _mm_mul_ps(m2, m4);
401 m1 = _mm_mul_ps(m3, m5);
402
403 m2 = _mm_mul_ps(m2, m5);
404 m3 = _mm_mul_ps(m3, m4);
405
406 /* Sum */
407 m0 = _mm_sub_ps(m0, m1);
408 m2 = _mm_add_ps(m2, m3);
409
410 /* Accumulate */
411 m6 = _mm_add_ps(m6, m0);
412 m7 = _mm_add_ps(m7, m2);
413 }
414
415 m0 = _mm_hadd_ps(m6, m7);
416 m0 = _mm_hadd_ps(m0, m0);
417
418 _mm_store_ss(&y[2 * i + 0], m0);
419 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(0, 3, 2, 1));
420 _mm_store_ss(&y[2 * i + 1], m0);
421 }
422}
423
424/* 8*N-tap SSE complex-complex convolution */
425static void sse_conv_cmplx_8n(float *x, float *h, float *y, int h_len, int len)
426{
427 __m128 m0, m1, m2, m3, m4, m5, m6, m7;
428 __m128 m8, m9, m10, m11, m12, m13, m14, m15;
429
430 for (int i = 0; i < len; i++) {
431 /* Zero */
432 m12 = _mm_setzero_ps();
433 m13 = _mm_setzero_ps();
434 m14 = _mm_setzero_ps();
435 m15 = _mm_setzero_ps();
436
437 for (int n = 0; n < h_len / 8; n++) {
438 /* Load (aligned) filter taps */
439 m0 = _mm_load_ps(&h[16 * n + 0]);
440 m1 = _mm_load_ps(&h[16 * n + 4]);
441 m2 = _mm_load_ps(&h[16 * n + 8]);
442 m3 = _mm_load_ps(&h[16 * n + 12]);
443
444 m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
445 m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
446 m6 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 2, 0, 2));
447 m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
448
449 /* Load (unaligned) input data */
450 m0 = _mm_loadu_ps(&x[2 * i + 16 * n + 0]);
451 m1 = _mm_loadu_ps(&x[2 * i + 16 * n + 4]);
452 m2 = _mm_loadu_ps(&x[2 * i + 16 * n + 8]);
453 m3 = _mm_loadu_ps(&x[2 * i + 16 * n + 12]);
454
455 m8 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
456 m9 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
457 m10 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
458 m11 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
459
460 /* Quad multiply */
461 m0 = _mm_mul_ps(m4, m8);
462 m1 = _mm_mul_ps(m5, m9);
463 m2 = _mm_mul_ps(m6, m10);
464 m3 = _mm_mul_ps(m7, m11);
465
466 m4 = _mm_mul_ps(m4, m9);
467 m5 = _mm_mul_ps(m5, m8);
468 m6 = _mm_mul_ps(m6, m11);
469 m7 = _mm_mul_ps(m7, m10);
470
471 /* Sum */
472 m0 = _mm_sub_ps(m0, m1);
473 m2 = _mm_sub_ps(m2, m3);
474 m4 = _mm_add_ps(m4, m5);
475 m6 = _mm_add_ps(m6, m7);
476
477 /* Accumulate */
478 m12 = _mm_add_ps(m12, m0);
479 m13 = _mm_add_ps(m13, m2);
480 m14 = _mm_add_ps(m14, m4);
481 m15 = _mm_add_ps(m15, m6);
482 }
483
484 m0 = _mm_add_ps(m12, m13);
485 m1 = _mm_add_ps(m14, m15);
486 m2 = _mm_hadd_ps(m0, m1);
487 m2 = _mm_hadd_ps(m2, m2);
488
489 _mm_store_ss(&y[2 * i + 0], m2);
490 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 3, 2, 1));
491 _mm_store_ss(&y[2 * i + 1], m2);
492 }
493}
494#endif
495
/* Base multiply and accumulate complex-real:
 * y += x * h[0], where x and y are interleaved complex. */
static void mac_real(float *x, float *h, float *y)
{
	const float w = h[0];

	y[0] += w * x[0];
	y[1] += w * x[1];
}
502
/* Base multiply and accumulate complex-complex:
 * y += x * h (full complex product, no conjugation). */
static void mac_cmplx(float *x, float *h, float *y)
{
	const float xr = x[0], xi = x[1];
	const float hr = h[0], hi = h[1];

	y[0] += xr * hr - xi * hi;
	y[1] += xr * hi + xi * hr;
}
509
/* Base vector complex-real multiply and accumulate: reduce every
 * step-th complex sample (starting at offset) of x against the real
 * part of the corresponding tap, accumulating into y[0]/y[1]. */
static void mac_real_vec_n(float *x, float *h, float *y,
			   int len, int step, int offset)
{
	for (int i = offset; i < len; i += step) {
		y[0] += x[2 * i] * h[2 * i];
		y[1] += x[2 * i + 1] * h[2 * i];
	}
}
517
/* Base vector complex-complex multiply and accumulate: reduce every
 * step-th complex sample (starting at offset) of x against the
 * corresponding complex tap, accumulating into y[0]/y[1]. */
static void mac_cmplx_vec_n(float *x, float *h, float *y,
			    int len, int step, int offset)
{
	for (int i = offset; i < len; i += step) {
		y[0] += x[2 * i] * h[2 * i] - x[2 * i + 1] * h[2 * i + 1];
		y[1] += x[2 * i] * h[2 * i + 1] + x[2 * i + 1] * h[2 * i];
	}
}
525
/* Base complex-real convolution (scalar fallback).
 * x_len and y_len are unused here; callers bounds-check first.
 * Returns the number of output samples written. */
static int _base_convolve_real(float *x, int x_len,
			       float *h, int h_len,
			       float *y, int y_len,
			       int start, int len,
			       int step, int offset)
{
	for (int i = 0; i < len; i++) {
		/* Input window for output sample i ends at start + i */
		float *xi = &x[2 * (start + i - (h_len - 1))];

		mac_real_vec_n(xi, h, &y[2 * i], h_len, step, offset);
	}

	return len;
}
542
/* Base complex-complex convolution (scalar fallback).
 * x_len and y_len are unused here; callers bounds-check first.
 * Returns the number of output samples written. */
static int _base_convolve_complex(float *x, int x_len,
				  float *h, int h_len,
				  float *y, int y_len,
				  int start, int len,
				  int step, int offset)
{
	for (int i = 0; i < len; i++) {
		/* Input window for output sample i ends at start + i */
		float *xi = &x[2 * (start + i - (h_len - 1))];

		mac_cmplx_vec_n(xi, h, &y[2 * i], h_len, step, offset);
	}

	return len;
}
559
/* Buffer validity checks.
 * Returns 0 when all lengths are positive and the requested span fits
 * inside the supplied buffers; -1 (with a message on stderr) otherwise. */
static int bounds_check(int x_len, int h_len, int y_len,
			int start, int len, int step)
{
	if (x_len < 1 || h_len < 1 || y_len < 1 || len < 1 || step < 1) {
		fprintf(stderr, "Convolve: Invalid input\n");
		return -1;
	}

	if (start + len > x_len || len > y_len || x_len < h_len) {
		fprintf(stderr, "Convolve: Boundary exception\n");
		fprintf(stderr, "start: %i, len: %i, x: %i, h: %i, y: %i\n",
			start, len, x_len, h_len, y_len);
		return -1;
	}

	return 0;
}
579
/* API: Aligned complex-real convolution.
 * Dispatches to a fixed-tap or generic SSE kernel when available and
 * applicable, otherwise falls back to the scalar implementation.
 * Returns len on success, -1 on bounds failure. */
int convolve_real(float *x, int x_len,
		  float *h, int h_len,
		  float *y, int y_len,
		  int start, int len,
		  int step, int offset)
{
	void (*fixed_tap)(float *, float *, float *, int) = NULL;
	void (*multiple_of_4)(float *, float *, float *, int, int) = NULL;

	if (bounds_check(x_len, h_len, y_len, start, len, step) < 0)
		return -1;

	/* Output is accumulated into, so clear it first */
	memset(y, 0, len * 2 * sizeof(float));

#ifdef HAVE_SSE3
	if (step <= 4) {
		if (h_len == 4)
			fixed_tap = sse_conv_real4;
		else if (h_len == 8)
			fixed_tap = sse_conv_real8;
		else if (h_len == 12)
			fixed_tap = sse_conv_real12;
		else if (h_len == 16)
			fixed_tap = sse_conv_real16;
		else if (h_len == 20)
			fixed_tap = sse_conv_real20;
		else if (!(h_len % 4))
			multiple_of_4 = sse_conv_real4n;
	}
#endif
	if (fixed_tap)
		fixed_tap(&x[2 * (start - (h_len - 1))], h, y, len);
	else if (multiple_of_4)
		multiple_of_4(&x[2 * (start - (h_len - 1))],
			      h, y, h_len, len);
	else
		_base_convolve_real(x, x_len, h, h_len, y, y_len,
				    start, len, step, offset);

	return len;
}
634
/* API: Aligned complex-complex convolution.
 * Dispatches to the 8N- or 4N-tap SSE kernel when available and
 * applicable, otherwise falls back to the scalar implementation.
 * Returns len on success, -1 on bounds failure. */
int convolve_complex(float *x, int x_len,
		     float *h, int h_len,
		     float *y, int y_len,
		     int start, int len,
		     int step, int offset)
{
	void (*conv_func)(float *, float *, float *, int, int) = NULL;

	if (bounds_check(x_len, h_len, y_len, start, len, step) < 0)
		return -1;

	/* Output is accumulated into, so clear it first */
	memset(y, 0, len * 2 * sizeof(float));

#ifdef HAVE_SSE3
	if (step <= 4) {
		conv_func = !(h_len % 8) ? sse_conv_cmplx_8n :
			    !(h_len % 4) ? sse_conv_cmplx_4n : NULL;
	}
#endif
	if (!conv_func) {
		_base_convolve_complex(x, x_len, h, h_len, y, y_len,
				       start, len, step, offset);
		return len;
	}

	conv_func(&x[2 * (start - (h_len - 1))], h, y, h_len, len);

	return len;
}
669
/* API: Non-aligned (no SSE) complex-real convolution.
 * Always uses the scalar path; returns len on success, -1 on bounds
 * failure. */
int base_convolve_real(float *x, int x_len,
		       float *h, int h_len,
		       float *y, int y_len,
		       int start, int len,
		       int step, int offset)
{
	if (bounds_check(x_len, h_len, y_len, start, len, step) < 0)
		return -1;

	/* Output is accumulated into, so clear it first */
	memset(y, 0, sizeof(float) * 2 * len);

	return _base_convolve_real(x, x_len, h, h_len, y, y_len,
				   start, len, step, offset);
}
687
/* API: Non-aligned (no SSE) complex-complex convolution.
 * Always uses the scalar path; returns len on success, -1 on bounds
 * failure. */
int base_convolve_complex(float *x, int x_len,
			  float *h, int h_len,
			  float *y, int y_len,
			  int start, int len,
			  int step, int offset)
{
	if (bounds_check(x_len, h_len, y_len, start, len, step) < 0)
		return -1;

	/* Output is accumulated into, so clear it first */
	memset(y, 0, sizeof(float) * 2 * len);

	return _base_convolve_complex(x, x_len, h, h_len, y, y_len,
				      start, len, step, offset);
}
705
/* Aligned filter tap allocation.
 *
 * Allocates a buffer for 'len' complex (interleaved I/Q) float taps.
 * When SSE is enabled the buffer is 16-byte aligned as required by
 * _mm_load_ps(). Returns NULL on failure or non-positive len; the
 * caller releases the buffer with free().
 *
 * Replaces the obsolete, non-standard memalign() with C11
 * aligned_alloc(). */
void *convolve_h_alloc(int len)
{
	size_t size;

	if (len < 1)
		return NULL;

	size = (size_t)len * 2 * sizeof(float);

#ifdef HAVE_SSE3
	/* aligned_alloc() requires size to be a multiple of the
	 * alignment; round up to the next 16-byte boundary. */
	return aligned_alloc(16, (size + 15) & ~(size_t)15);
#else
	return malloc(size);
#endif
}