blob: e2a1dea79b713ce3f44242aef7e7d90e7c324f55 [file] [log] [blame]
Thomas Tsou3eaae802013-08-20 19:31:14 -04001/*
2 * SSE Convolution
3 * Copyright (C) 2012, 2013 Thomas Tsou <tom@tsou.cc>
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2.1 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
14 *
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 */
19
20#include <malloc.h>
21#include <string.h>
22#include <stdio.h>
Thomas Tsou17bbb9b2013-10-30 21:24:40 -040023#include "convolve.h"
Thomas Tsou3eaae802013-08-20 19:31:14 -040024
25#ifdef HAVE_CONFIG_H
26#include "config.h"
27#endif
28
Thomas Tsou17bbb9b2013-10-30 21:24:40 -040029/* Forward declarations from base implementation */
Tom Tsouf147b172015-03-25 12:55:11 -070030int _base_convolve_real(const float *x, int x_len,
31 const float *h, int h_len,
Thomas Tsou17bbb9b2013-10-30 21:24:40 -040032 float *y, int y_len,
33 int start, int len,
34 int step, int offset);
35
Tom Tsouf147b172015-03-25 12:55:11 -070036int _base_convolve_complex(const float *x, int x_len,
37 const float *h, int h_len,
Thomas Tsou17bbb9b2013-10-30 21:24:40 -040038 float *y, int y_len,
39 int start, int len,
40 int step, int offset);
41
42int bounds_check(int x_len, int h_len, int y_len,
43 int start, int len, int step);
44
Thomas Tsou3eaae802013-08-20 19:31:14 -040045#ifdef HAVE_SSE3
46#include <xmmintrin.h>
47#include <pmmintrin.h>
48
49/* 4-tap SSE complex-real convolution */
Philipp Maier131f82b2017-03-15 12:39:25 +010050static void sse_conv_real4(const float *x, int x_len,
51 const float *h, int h_len,
52 float *y, int y_len,
53 int start, int len,
54 int step, int offset)
Thomas Tsou3eaae802013-08-20 19:31:14 -040055{
Philipp Maier131f82b2017-03-15 12:39:25 +010056 /* NOTE: The parameter list of this function has to match the parameter
57 * list of _base_convolve_real() in convolve_base.c. This specific
58 * implementation, ignores some of the parameters of
59 * _base_convolve_complex(), which are: x_len, y_len, offset, step */
60
Thomas Tsou3eaae802013-08-20 19:31:14 -040061 __m128 m0, m1, m2, m3, m4, m5, m6, m7;
62
Philipp Maier131f82b2017-03-15 12:39:25 +010063 const float *_x = &x[2 * (-(h_len - 1) + start)];
64
Thomas Tsou3eaae802013-08-20 19:31:14 -040065 /* Load (aligned) filter taps */
66 m0 = _mm_load_ps(&h[0]);
67 m1 = _mm_load_ps(&h[4]);
68 m7 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
69
70 for (int i = 0; i < len; i++) {
71 /* Load (unaligned) input data */
Philipp Maier131f82b2017-03-15 12:39:25 +010072 m0 = _mm_loadu_ps(&_x[2 * i + 0]);
73 m1 = _mm_loadu_ps(&_x[2 * i + 4]);
Thomas Tsou3eaae802013-08-20 19:31:14 -040074 m2 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
75 m3 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
76
77 /* Quad multiply */
78 m4 = _mm_mul_ps(m2, m7);
79 m5 = _mm_mul_ps(m3, m7);
80
81 /* Sum and store */
82 m6 = _mm_hadd_ps(m4, m5);
83 m0 = _mm_hadd_ps(m6, m6);
84
85 _mm_store_ss(&y[2 * i + 0], m0);
86 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(0, 3, 2, 1));
87 _mm_store_ss(&y[2 * i + 1], m0);
88 }
89}
90
91/* 8-tap SSE complex-real convolution */
Philipp Maier131f82b2017-03-15 12:39:25 +010092static void sse_conv_real8(const float *x, int x_len,
93 const float *h, int h_len,
94 float *y, int y_len,
95 int start, int len,
96 int step, int offset)
Thomas Tsou3eaae802013-08-20 19:31:14 -040097{
Philipp Maier131f82b2017-03-15 12:39:25 +010098 /* See NOTE in sse_conv_real4() */
99
Thomas Tsou3eaae802013-08-20 19:31:14 -0400100 __m128 m0, m1, m2, m3, m4, m5, m6, m7, m8, m9;
101
Philipp Maier131f82b2017-03-15 12:39:25 +0100102 const float *_x = &x[2 * (-(h_len - 1) + start)];
103
Thomas Tsou3eaae802013-08-20 19:31:14 -0400104 /* Load (aligned) filter taps */
105 m0 = _mm_load_ps(&h[0]);
106 m1 = _mm_load_ps(&h[4]);
107 m2 = _mm_load_ps(&h[8]);
108 m3 = _mm_load_ps(&h[12]);
109
110 m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
111 m5 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
112
113 for (int i = 0; i < len; i++) {
114 /* Load (unaligned) input data */
Philipp Maier131f82b2017-03-15 12:39:25 +0100115 m0 = _mm_loadu_ps(&_x[2 * i + 0]);
116 m1 = _mm_loadu_ps(&_x[2 * i + 4]);
117 m2 = _mm_loadu_ps(&_x[2 * i + 8]);
118 m3 = _mm_loadu_ps(&_x[2 * i + 12]);
Thomas Tsou3eaae802013-08-20 19:31:14 -0400119
120 m6 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
121 m7 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
122 m8 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
123 m9 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
124
125 /* Quad multiply */
126 m6 = _mm_mul_ps(m6, m4);
127 m7 = _mm_mul_ps(m7, m4);
128 m8 = _mm_mul_ps(m8, m5);
129 m9 = _mm_mul_ps(m9, m5);
130
131 /* Sum and store */
132 m6 = _mm_add_ps(m6, m8);
133 m7 = _mm_add_ps(m7, m9);
134 m6 = _mm_hadd_ps(m6, m7);
135 m6 = _mm_hadd_ps(m6, m6);
136
137 _mm_store_ss(&y[2 * i + 0], m6);
138 m6 = _mm_shuffle_ps(m6, m6, _MM_SHUFFLE(0, 3, 2, 1));
139 _mm_store_ss(&y[2 * i + 1], m6);
140 }
141}
142
143/* 12-tap SSE complex-real convolution */
Philipp Maier131f82b2017-03-15 12:39:25 +0100144static void sse_conv_real12(const float *x, int x_len,
145 const float *h, int h_len,
146 float *y, int y_len,
147 int start, int len,
148 int step, int offset)
Thomas Tsou3eaae802013-08-20 19:31:14 -0400149{
Philipp Maier131f82b2017-03-15 12:39:25 +0100150 /* See NOTE in sse_conv_real4() */
151
Thomas Tsou3eaae802013-08-20 19:31:14 -0400152 __m128 m0, m1, m2, m3, m4, m5, m6, m7;
153 __m128 m8, m9, m10, m11, m12, m13, m14;
154
Philipp Maier131f82b2017-03-15 12:39:25 +0100155 const float *_x = &x[2 * (-(h_len - 1) + start)];
156
Thomas Tsou3eaae802013-08-20 19:31:14 -0400157 /* Load (aligned) filter taps */
158 m0 = _mm_load_ps(&h[0]);
159 m1 = _mm_load_ps(&h[4]);
160 m2 = _mm_load_ps(&h[8]);
161 m3 = _mm_load_ps(&h[12]);
162 m4 = _mm_load_ps(&h[16]);
163 m5 = _mm_load_ps(&h[20]);
164
165 m12 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
166 m13 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
167 m14 = _mm_shuffle_ps(m4, m5, _MM_SHUFFLE(0, 2, 0, 2));
168
169 for (int i = 0; i < len; i++) {
170 /* Load (unaligned) input data */
Philipp Maier131f82b2017-03-15 12:39:25 +0100171 m0 = _mm_loadu_ps(&_x[2 * i + 0]);
172 m1 = _mm_loadu_ps(&_x[2 * i + 4]);
173 m2 = _mm_loadu_ps(&_x[2 * i + 8]);
174 m3 = _mm_loadu_ps(&_x[2 * i + 12]);
Thomas Tsou3eaae802013-08-20 19:31:14 -0400175
176 m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
177 m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
178 m6 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
179 m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
180
Philipp Maier131f82b2017-03-15 12:39:25 +0100181 m0 = _mm_loadu_ps(&_x[2 * i + 16]);
182 m1 = _mm_loadu_ps(&_x[2 * i + 20]);
Thomas Tsou3eaae802013-08-20 19:31:14 -0400183
184 m8 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
185 m9 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
186
187 /* Quad multiply */
188 m0 = _mm_mul_ps(m4, m12);
189 m1 = _mm_mul_ps(m5, m12);
190 m2 = _mm_mul_ps(m6, m13);
191 m3 = _mm_mul_ps(m7, m13);
192 m4 = _mm_mul_ps(m8, m14);
193 m5 = _mm_mul_ps(m9, m14);
194
195 /* Sum and store */
196 m8 = _mm_add_ps(m0, m2);
197 m9 = _mm_add_ps(m1, m3);
198 m10 = _mm_add_ps(m8, m4);
199 m11 = _mm_add_ps(m9, m5);
200
201 m2 = _mm_hadd_ps(m10, m11);
202 m3 = _mm_hadd_ps(m2, m2);
203
204 _mm_store_ss(&y[2 * i + 0], m3);
205 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(0, 3, 2, 1));
206 _mm_store_ss(&y[2 * i + 1], m3);
207 }
208}
209
210/* 16-tap SSE complex-real convolution */
Philipp Maier131f82b2017-03-15 12:39:25 +0100211static void sse_conv_real16(const float *x, int x_len,
212 const float *h, int h_len,
213 float *y, int y_len,
214 int start, int len,
215 int step, int offset)
Thomas Tsou3eaae802013-08-20 19:31:14 -0400216{
Philipp Maier131f82b2017-03-15 12:39:25 +0100217 /* See NOTE in sse_conv_real4() */
218
Thomas Tsou3eaae802013-08-20 19:31:14 -0400219 __m128 m0, m1, m2, m3, m4, m5, m6, m7;
220 __m128 m8, m9, m10, m11, m12, m13, m14, m15;
221
Philipp Maier131f82b2017-03-15 12:39:25 +0100222 const float *_x = &x[2 * (-(h_len - 1) + start)];
223
Thomas Tsou3eaae802013-08-20 19:31:14 -0400224 /* Load (aligned) filter taps */
225 m0 = _mm_load_ps(&h[0]);
226 m1 = _mm_load_ps(&h[4]);
227 m2 = _mm_load_ps(&h[8]);
228 m3 = _mm_load_ps(&h[12]);
229
230 m4 = _mm_load_ps(&h[16]);
231 m5 = _mm_load_ps(&h[20]);
232 m6 = _mm_load_ps(&h[24]);
233 m7 = _mm_load_ps(&h[28]);
234
235 m12 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
236 m13 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
237 m14 = _mm_shuffle_ps(m4, m5, _MM_SHUFFLE(0, 2, 0, 2));
238 m15 = _mm_shuffle_ps(m6, m7, _MM_SHUFFLE(0, 2, 0, 2));
239
240 for (int i = 0; i < len; i++) {
241 /* Load (unaligned) input data */
Philipp Maier131f82b2017-03-15 12:39:25 +0100242 m0 = _mm_loadu_ps(&_x[2 * i + 0]);
243 m1 = _mm_loadu_ps(&_x[2 * i + 4]);
244 m2 = _mm_loadu_ps(&_x[2 * i + 8]);
245 m3 = _mm_loadu_ps(&_x[2 * i + 12]);
Thomas Tsou3eaae802013-08-20 19:31:14 -0400246
247 m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
248 m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
249 m6 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
250 m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
251
Philipp Maier131f82b2017-03-15 12:39:25 +0100252 m0 = _mm_loadu_ps(&_x[2 * i + 16]);
253 m1 = _mm_loadu_ps(&_x[2 * i + 20]);
254 m2 = _mm_loadu_ps(&_x[2 * i + 24]);
255 m3 = _mm_loadu_ps(&_x[2 * i + 28]);
Thomas Tsou3eaae802013-08-20 19:31:14 -0400256
257 m8 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
258 m9 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
259 m10 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
260 m11 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
261
262 /* Quad multiply */
263 m0 = _mm_mul_ps(m4, m12);
264 m1 = _mm_mul_ps(m5, m12);
265 m2 = _mm_mul_ps(m6, m13);
266 m3 = _mm_mul_ps(m7, m13);
267
268 m4 = _mm_mul_ps(m8, m14);
269 m5 = _mm_mul_ps(m9, m14);
270 m6 = _mm_mul_ps(m10, m15);
271 m7 = _mm_mul_ps(m11, m15);
272
273 /* Sum and store */
274 m8 = _mm_add_ps(m0, m2);
275 m9 = _mm_add_ps(m1, m3);
276 m10 = _mm_add_ps(m4, m6);
277 m11 = _mm_add_ps(m5, m7);
278
279 m0 = _mm_add_ps(m8, m10);
280 m1 = _mm_add_ps(m9, m11);
281 m2 = _mm_hadd_ps(m0, m1);
282 m3 = _mm_hadd_ps(m2, m2);
283
284 _mm_store_ss(&y[2 * i + 0], m3);
285 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(0, 3, 2, 1));
286 _mm_store_ss(&y[2 * i + 1], m3);
287 }
288}
289
290/* 20-tap SSE complex-real convolution */
Philipp Maier131f82b2017-03-15 12:39:25 +0100291static void sse_conv_real20(const float *x, int x_len,
292 const float *h, int h_len,
293 float *y, int y_len,
294 int start, int len,
295 int step, int offset)
Thomas Tsou3eaae802013-08-20 19:31:14 -0400296{
Philipp Maier131f82b2017-03-15 12:39:25 +0100297 /* See NOTE in sse_conv_real4() */
298
Thomas Tsou3eaae802013-08-20 19:31:14 -0400299 __m128 m0, m1, m2, m3, m4, m5, m6, m7;
300 __m128 m8, m9, m11, m12, m13, m14, m15;
301
Philipp Maier131f82b2017-03-15 12:39:25 +0100302 const float *_x = &x[2 * (-(h_len - 1) + start)];
303
Thomas Tsou3eaae802013-08-20 19:31:14 -0400304 /* Load (aligned) filter taps */
305 m0 = _mm_load_ps(&h[0]);
306 m1 = _mm_load_ps(&h[4]);
307 m2 = _mm_load_ps(&h[8]);
308 m3 = _mm_load_ps(&h[12]);
309 m4 = _mm_load_ps(&h[16]);
310 m5 = _mm_load_ps(&h[20]);
311 m6 = _mm_load_ps(&h[24]);
312 m7 = _mm_load_ps(&h[28]);
313 m8 = _mm_load_ps(&h[32]);
314 m9 = _mm_load_ps(&h[36]);
315
316 m11 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
317 m12 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
318 m13 = _mm_shuffle_ps(m4, m5, _MM_SHUFFLE(0, 2, 0, 2));
319 m14 = _mm_shuffle_ps(m6, m7, _MM_SHUFFLE(0, 2, 0, 2));
320 m15 = _mm_shuffle_ps(m8, m9, _MM_SHUFFLE(0, 2, 0, 2));
321
322 for (int i = 0; i < len; i++) {
323 /* Multiply-accumulate first 12 taps */
Philipp Maier131f82b2017-03-15 12:39:25 +0100324 m0 = _mm_loadu_ps(&_x[2 * i + 0]);
325 m1 = _mm_loadu_ps(&_x[2 * i + 4]);
326 m2 = _mm_loadu_ps(&_x[2 * i + 8]);
327 m3 = _mm_loadu_ps(&_x[2 * i + 12]);
328 m4 = _mm_loadu_ps(&_x[2 * i + 16]);
329 m5 = _mm_loadu_ps(&_x[2 * i + 20]);
Thomas Tsou3eaae802013-08-20 19:31:14 -0400330
331 m6 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
332 m7 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
333 m8 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
334 m9 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
335 m0 = _mm_shuffle_ps(m4, m5, _MM_SHUFFLE(0, 2, 0, 2));
336 m1 = _mm_shuffle_ps(m4, m5, _MM_SHUFFLE(1, 3, 1, 3));
337
338 m2 = _mm_mul_ps(m6, m11);
339 m3 = _mm_mul_ps(m7, m11);
340 m4 = _mm_mul_ps(m8, m12);
341 m5 = _mm_mul_ps(m9, m12);
342 m6 = _mm_mul_ps(m0, m13);
343 m7 = _mm_mul_ps(m1, m13);
344
345 m0 = _mm_add_ps(m2, m4);
346 m1 = _mm_add_ps(m3, m5);
347 m8 = _mm_add_ps(m0, m6);
348 m9 = _mm_add_ps(m1, m7);
349
350 /* Multiply-accumulate last 8 taps */
Philipp Maier131f82b2017-03-15 12:39:25 +0100351 m0 = _mm_loadu_ps(&_x[2 * i + 24]);
352 m1 = _mm_loadu_ps(&_x[2 * i + 28]);
353 m2 = _mm_loadu_ps(&_x[2 * i + 32]);
354 m3 = _mm_loadu_ps(&_x[2 * i + 36]);
Thomas Tsou3eaae802013-08-20 19:31:14 -0400355
356 m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
357 m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
358 m6 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
359 m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
360
361 m0 = _mm_mul_ps(m4, m14);
362 m1 = _mm_mul_ps(m5, m14);
363 m2 = _mm_mul_ps(m6, m15);
364 m3 = _mm_mul_ps(m7, m15);
365
366 m4 = _mm_add_ps(m0, m2);
367 m5 = _mm_add_ps(m1, m3);
368
369 /* Final sum and store */
370 m0 = _mm_add_ps(m8, m4);
371 m1 = _mm_add_ps(m9, m5);
372 m2 = _mm_hadd_ps(m0, m1);
373 m3 = _mm_hadd_ps(m2, m2);
374
375 _mm_store_ss(&y[2 * i + 0], m3);
376 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(0, 3, 2, 1));
377 _mm_store_ss(&y[2 * i + 1], m3);
378 }
379}
380
381/* 4*N-tap SSE complex-real convolution */
Philipp Maier131f82b2017-03-15 12:39:25 +0100382static void sse_conv_real4n(const float *x, int x_len,
383 const float *h, int h_len,
384 float *y, int y_len,
385 int start, int len,
386 int step, int offset)
Thomas Tsou3eaae802013-08-20 19:31:14 -0400387{
Philipp Maier131f82b2017-03-15 12:39:25 +0100388 /* See NOTE in sse_conv_real4() */
389
Thomas Tsou3eaae802013-08-20 19:31:14 -0400390 __m128 m0, m1, m2, m4, m5, m6, m7;
391
Philipp Maier131f82b2017-03-15 12:39:25 +0100392 const float *_x = &x[2 * (-(h_len - 1) + start)];
393
Thomas Tsou3eaae802013-08-20 19:31:14 -0400394 for (int i = 0; i < len; i++) {
395 /* Zero */
396 m6 = _mm_setzero_ps();
397 m7 = _mm_setzero_ps();
398
399 for (int n = 0; n < h_len / 4; n++) {
400 /* Load (aligned) filter taps */
401 m0 = _mm_load_ps(&h[8 * n + 0]);
402 m1 = _mm_load_ps(&h[8 * n + 4]);
403 m2 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
404
405 /* Load (unaligned) input data */
Philipp Maier131f82b2017-03-15 12:39:25 +0100406 m0 = _mm_loadu_ps(&_x[2 * i + 8 * n + 0]);
407 m1 = _mm_loadu_ps(&_x[2 * i + 8 * n + 4]);
Thomas Tsou3eaae802013-08-20 19:31:14 -0400408 m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
409 m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
410
411 /* Quad multiply */
412 m0 = _mm_mul_ps(m2, m4);
413 m1 = _mm_mul_ps(m2, m5);
414
415 /* Accumulate */
416 m6 = _mm_add_ps(m6, m0);
417 m7 = _mm_add_ps(m7, m1);
418 }
419
420 m0 = _mm_hadd_ps(m6, m7);
421 m0 = _mm_hadd_ps(m0, m0);
422
423 _mm_store_ss(&y[2 * i + 0], m0);
424 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(0, 3, 2, 1));
425 _mm_store_ss(&y[2 * i + 1], m0);
426 }
427}
428
429/* 4*N-tap SSE complex-complex convolution */
Philipp Maier131f82b2017-03-15 12:39:25 +0100430static void sse_conv_cmplx_4n(const float *x, int x_len,
431 const float *h, int h_len,
432 float *y, int y_len,
433 int start, int len,
434 int step, int offset)
Thomas Tsou3eaae802013-08-20 19:31:14 -0400435{
Philipp Maier131f82b2017-03-15 12:39:25 +0100436 /* NOTE: The parameter list of this function has to match the parameter
437 * list of _base_convolve_complex() in convolve_base.c. This specific
438 * implementation, ignores some of the parameters of
439 * _base_convolve_complex(), which are: x_len, y_len, offset, step. */
440
Thomas Tsou3eaae802013-08-20 19:31:14 -0400441 __m128 m0, m1, m2, m3, m4, m5, m6, m7;
442
Philipp Maier131f82b2017-03-15 12:39:25 +0100443 const float *_x = &x[2 * (-(h_len - 1) + start)];
444
Thomas Tsou3eaae802013-08-20 19:31:14 -0400445 for (int i = 0; i < len; i++) {
446 /* Zero */
447 m6 = _mm_setzero_ps();
448 m7 = _mm_setzero_ps();
449
450 for (int n = 0; n < h_len / 4; n++) {
451 /* Load (aligned) filter taps */
452 m0 = _mm_load_ps(&h[8 * n + 0]);
453 m1 = _mm_load_ps(&h[8 * n + 4]);
454 m2 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
455 m3 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
456
457 /* Load (unaligned) input data */
Philipp Maier131f82b2017-03-15 12:39:25 +0100458 m0 = _mm_loadu_ps(&_x[2 * i + 8 * n + 0]);
459 m1 = _mm_loadu_ps(&_x[2 * i + 8 * n + 4]);
Thomas Tsou3eaae802013-08-20 19:31:14 -0400460 m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
461 m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
462
463 /* Quad multiply */
464 m0 = _mm_mul_ps(m2, m4);
465 m1 = _mm_mul_ps(m3, m5);
466
467 m2 = _mm_mul_ps(m2, m5);
468 m3 = _mm_mul_ps(m3, m4);
469
470 /* Sum */
471 m0 = _mm_sub_ps(m0, m1);
472 m2 = _mm_add_ps(m2, m3);
473
474 /* Accumulate */
475 m6 = _mm_add_ps(m6, m0);
476 m7 = _mm_add_ps(m7, m2);
477 }
478
479 m0 = _mm_hadd_ps(m6, m7);
480 m0 = _mm_hadd_ps(m0, m0);
481
482 _mm_store_ss(&y[2 * i + 0], m0);
483 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(0, 3, 2, 1));
484 _mm_store_ss(&y[2 * i + 1], m0);
485 }
486}
487
488/* 8*N-tap SSE complex-complex convolution */
Philipp Maier131f82b2017-03-15 12:39:25 +0100489static void sse_conv_cmplx_8n(const float *x, int x_len,
490 const float *h, int h_len,
491 float *y, int y_len,
492 int start, int len,
493 int step, int offset)
Thomas Tsou3eaae802013-08-20 19:31:14 -0400494{
Philipp Maier131f82b2017-03-15 12:39:25 +0100495 /* See NOTE in sse_conv_cmplx_4n() */
496
Thomas Tsou3eaae802013-08-20 19:31:14 -0400497 __m128 m0, m1, m2, m3, m4, m5, m6, m7;
498 __m128 m8, m9, m10, m11, m12, m13, m14, m15;
499
Philipp Maier131f82b2017-03-15 12:39:25 +0100500 const float *_x = &x[2 * (-(h_len - 1) + start)];
501
Thomas Tsou3eaae802013-08-20 19:31:14 -0400502 for (int i = 0; i < len; i++) {
503 /* Zero */
504 m12 = _mm_setzero_ps();
505 m13 = _mm_setzero_ps();
506 m14 = _mm_setzero_ps();
507 m15 = _mm_setzero_ps();
508
509 for (int n = 0; n < h_len / 8; n++) {
510 /* Load (aligned) filter taps */
511 m0 = _mm_load_ps(&h[16 * n + 0]);
512 m1 = _mm_load_ps(&h[16 * n + 4]);
513 m2 = _mm_load_ps(&h[16 * n + 8]);
514 m3 = _mm_load_ps(&h[16 * n + 12]);
515
516 m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
517 m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
Thomas Tsou187225c2014-05-01 02:15:51 -0400518 m6 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
Thomas Tsou3eaae802013-08-20 19:31:14 -0400519 m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
520
521 /* Load (unaligned) input data */
Philipp Maier131f82b2017-03-15 12:39:25 +0100522 m0 = _mm_loadu_ps(&_x[2 * i + 16 * n + 0]);
523 m1 = _mm_loadu_ps(&_x[2 * i + 16 * n + 4]);
524 m2 = _mm_loadu_ps(&_x[2 * i + 16 * n + 8]);
525 m3 = _mm_loadu_ps(&_x[2 * i + 16 * n + 12]);
Thomas Tsou3eaae802013-08-20 19:31:14 -0400526
527 m8 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2));
528 m9 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3));
529 m10 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2));
530 m11 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3));
531
532 /* Quad multiply */
533 m0 = _mm_mul_ps(m4, m8);
534 m1 = _mm_mul_ps(m5, m9);
535 m2 = _mm_mul_ps(m6, m10);
536 m3 = _mm_mul_ps(m7, m11);
537
538 m4 = _mm_mul_ps(m4, m9);
539 m5 = _mm_mul_ps(m5, m8);
540 m6 = _mm_mul_ps(m6, m11);
541 m7 = _mm_mul_ps(m7, m10);
542
543 /* Sum */
544 m0 = _mm_sub_ps(m0, m1);
545 m2 = _mm_sub_ps(m2, m3);
546 m4 = _mm_add_ps(m4, m5);
547 m6 = _mm_add_ps(m6, m7);
548
549 /* Accumulate */
550 m12 = _mm_add_ps(m12, m0);
551 m13 = _mm_add_ps(m13, m2);
552 m14 = _mm_add_ps(m14, m4);
553 m15 = _mm_add_ps(m15, m6);
554 }
555
556 m0 = _mm_add_ps(m12, m13);
557 m1 = _mm_add_ps(m14, m15);
558 m2 = _mm_hadd_ps(m0, m1);
559 m2 = _mm_hadd_ps(m2, m2);
560
561 _mm_store_ss(&y[2 * i + 0], m2);
562 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 3, 2, 1));
563 _mm_store_ss(&y[2 * i + 1], m2);
564 }
565}
566#endif
567
Thomas Tsou3eaae802013-08-20 19:31:14 -0400568/* API: Aligned complex-real */
Tom Tsouf147b172015-03-25 12:55:11 -0700569int convolve_real(const float *x, int x_len,
570 const float *h, int h_len,
Philipp Maier131f82b2017-03-15 12:39:25 +0100571 float *y, int y_len, int start, int len, int step, int offset)
Thomas Tsou3eaae802013-08-20 19:31:14 -0400572{
Philipp Maier131f82b2017-03-15 12:39:25 +0100573 void (*conv_func) (const float *, int, const float *, int, float *, int,
574 int, int, int, int) = (void *)_base_convolve_real;
Thomas Tsou3eaae802013-08-20 19:31:14 -0400575
576 if (bounds_check(x_len, h_len, y_len, start, len, step) < 0)
577 return -1;
578
579 memset(y, 0, len * 2 * sizeof(float));
580
581#ifdef HAVE_SSE3
582 if (step <= 4) {
583 switch (h_len) {
584 case 4:
585 conv_func = sse_conv_real4;
586 break;
587 case 8:
588 conv_func = sse_conv_real8;
589 break;
590 case 12:
591 conv_func = sse_conv_real12;
592 break;
593 case 16:
594 conv_func = sse_conv_real16;
595 break;
596 case 20:
597 conv_func = sse_conv_real20;
598 break;
599 default:
600 if (!(h_len % 4))
Philipp Maier131f82b2017-03-15 12:39:25 +0100601 conv_func = sse_conv_real4n;
Thomas Tsou3eaae802013-08-20 19:31:14 -0400602 }
603 }
604#endif
Philipp Maier131f82b2017-03-15 12:39:25 +0100605
606 conv_func(x, x_len, h, h_len, y, y_len, start, len, step, offset);
Thomas Tsou3eaae802013-08-20 19:31:14 -0400607
608 return len;
609}
610
611/* API: Aligned complex-complex */
Tom Tsouf147b172015-03-25 12:55:11 -0700612int convolve_complex(const float *x, int x_len,
613 const float *h, int h_len,
Thomas Tsou3eaae802013-08-20 19:31:14 -0400614 float *y, int y_len,
Philipp Maier131f82b2017-03-15 12:39:25 +0100615 int start, int len, int step, int offset)
Thomas Tsou3eaae802013-08-20 19:31:14 -0400616{
Philipp Maier131f82b2017-03-15 12:39:25 +0100617 void (*conv_func) (const float *, int, const float *, int, float *, int,
618 int, int, int, int) =
619 (void *)_base_convolve_complex;
Thomas Tsou3eaae802013-08-20 19:31:14 -0400620
621 if (bounds_check(x_len, h_len, y_len, start, len, step) < 0)
622 return -1;
623
624 memset(y, 0, len * 2 * sizeof(float));
625
626#ifdef HAVE_SSE3
627 if (step <= 4) {
628 if (!(h_len % 8))
629 conv_func = sse_conv_cmplx_8n;
630 else if (!(h_len % 4))
631 conv_func = sse_conv_cmplx_4n;
632 }
633#endif
Philipp Maier131f82b2017-03-15 12:39:25 +0100634
635 conv_func(x, x_len, h, h_len, y, y_len, start, len, step, offset);
Thomas Tsou3eaae802013-08-20 19:31:14 -0400636
637 return len;
638}