blob: 3f76b6561696720cd708e460e958966fdd2f318c [file] [log] [blame]
/*
 * SSE type conversions
 * Copyright (C) 2013 Thomas Tsou <tom@tsou.cc>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 */
19
20#include <malloc.h>
21#include <string.h>
Thomas Tsou17bbb9b2013-10-30 21:24:40 -040022#include "convert.h"
Thomas Tsou9471d762013-08-20 21:24:24 -040023
24#ifdef HAVE_CONFIG_H
25#include "config.h"
26#endif
27
/*
 * Architecture dependent function pointers.
 *
 * convert_init() fills in every slot; the public convert_*() entry points
 * then pick a slot based on the divisibility of the requested length.
 */
struct convert_cpu_context {
	/* short -> float, used when len is a multiple of 16 */
	void (*convert_si16_ps_16n) (float *, const short *, int);
	/* short -> float, any other len */
	void (*convert_si16_ps) (float *, const short *, int);
	/* float * scale -> short, used when len is a multiple of 16 */
	void (*convert_scale_ps_si16_16n)(short *, const float *, float, int);
	/* float * scale -> short, used when len is a multiple of 8 (but not 16) */
	void (*convert_scale_ps_si16_8n)(short *, const float *, float, int);
	/* float * scale -> short, any other len */
	void (*convert_scale_ps_si16)(short *, const float *, float, int);
};

/* File-local dispatch table; all slots are null until convert_init() runs */
static struct convert_cpu_context c;
38
Thomas Tsou9471d762013-08-20 21:24:24 -040039#ifdef HAVE_SSE3
40#include <xmmintrin.h>
41#include <emmintrin.h>
42
43#ifdef HAVE_SSE4_1
44#include <smmintrin.h>
45
/*
 * 16*N 16-bit signed integers converted to single precision floats.
 *
 * out: destination, receives one float per converted input sample
 * in:  source samples
 * len: number of samples; only complete groups of 16 are converted, the
 *      trailing len % 16 samples are neither read nor written
 *
 * Neither buffer needs to be 16-byte aligned. The target attribute enables
 * SSE4.1 code generation for this function alone, so the intrinsics below do
 * not depend on the whole file being built with -msse4.1; callers are
 * expected to check CPU support first, as convert_init() does.
 */
__attribute__((target("sse4.1")))
static void _sse_convert_si16_ps_16n(float *restrict out,
				     const short *restrict in,
				     int len)
{
	for (int i = 0; i < len / 16; i++) {
		const short *src = &in[16 * i];
		float *dst = &out[16 * i];

		/* Load (unaligned) packed 16-bit integers, 8 per register */
		__m128i lo = _mm_loadu_si128((const __m128i *) (src + 0));
		__m128i hi = _mm_loadu_si128((const __m128i *) (src + 8));

		/*
		 * Unpack: the sign-extending widen only reads the low four
		 * shorts of a register, so swap the 64-bit halves to reach
		 * the upper four.
		 */
		__m128i w0 = _mm_cvtepi16_epi32(lo);
		__m128i w1 = _mm_cvtepi16_epi32(
				_mm_shuffle_epi32(lo, _MM_SHUFFLE(1, 0, 3, 2)));
		__m128i w2 = _mm_cvtepi16_epi32(hi);
		__m128i w3 = _mm_cvtepi16_epi32(
				_mm_shuffle_epi32(hi, _MM_SHUFFLE(1, 0, 3, 2)));

		/* Convert and store (unaligned) */
		_mm_storeu_ps(dst + 0, _mm_cvtepi32_ps(w0));
		_mm_storeu_ps(dst + 4, _mm_cvtepi32_ps(w1));
		_mm_storeu_ps(dst + 8, _mm_cvtepi32_ps(w2));
		_mm_storeu_ps(dst + 12, _mm_cvtepi32_ps(w3));
	}
}
80
/*
 * 16-bit signed integer to float conversion for arbitrary lengths.
 *
 * out: destination, receives len floats
 * in:  source samples
 * len: number of samples
 *
 * Complete groups of 16 go through _sse_convert_si16_ps_16n(); the remaining
 * len % 16 samples are converted one by one. Every short is exactly
 * representable as a float, so both paths give identical values.
 */
static void _sse_convert_si16_ps(float *restrict out,
				 const short *restrict in,
				 int len)
{
	/* Index of the first sample not covered by the vector kernel */
	int start = len / 16 * 16;

	_sse_convert_si16_ps_16n(out, in, len);

	for (int i = start; i < len; i++)
		out[i] = in[i];
}
93#endif /* HAVE_SSE4_1 */
94
/*
 * 8*N single precision floats scaled and converted to 16-bit signed integer.
 *
 * out:   destination, receives one short per converted input sample
 * in:    source samples
 * scale: factor applied to every sample before conversion
 * len:   number of samples; only complete groups of 8 are converted, the
 *        trailing len % 8 samples are neither read nor written
 *
 * The float to integer step rounds according to the current SSE rounding
 * mode (round-to-nearest-even unless changed) and the pack step saturates to
 * the range of a 16-bit signed integer. Neither buffer needs to be aligned.
 */
static void _sse_convert_scale_ps_si16_8n(short *restrict out,
					  const float *restrict in,
					  float scale, int len)
{
	/* Broadcast the scale factor once; it does not change per iteration */
	const __m128 factor = _mm_load1_ps(&scale);

	for (int i = 0; i < len / 8; i++) {
		/* Load (unaligned) packed floats and scale */
		__m128 lo = _mm_mul_ps(_mm_loadu_ps(&in[8 * i + 0]), factor);
		__m128 hi = _mm_mul_ps(_mm_loadu_ps(&in[8 * i + 4]), factor);

		/* Convert, pack with saturation and store (unaligned) */
		__m128i packed = _mm_packs_epi32(_mm_cvtps_epi32(lo),
						 _mm_cvtps_epi32(hi));
		_mm_storeu_si128((__m128i *) &out[8 * i], packed);
	}
}

/*
 * Single precision floats scaled and converted to 16-bit signed integer,
 * arbitrary length.
 *
 * out:   destination, receives len shorts
 * in:    source samples
 * scale: factor applied to every sample before conversion
 * len:   number of samples
 *
 * Complete groups of 8 are handled in place by the 8*N kernel. The remaining
 * len % 8 samples are staged through zero-padded scratch buffers and run
 * through the same kernel, so they get the same rounding and saturation as
 * the rest of the buffer. A plain C float-to-short assignment would truncate
 * toward zero and is undefined for out-of-range values.
 */
static void _sse_convert_scale_ps_si16(short *restrict out,
				       const float *restrict in,
				       float scale, int len)
{
	int start = len / 8 * 8;
	int rem = len % 8;

	_sse_convert_scale_ps_si16_8n(out, in, scale, len);

	if (rem > 0) {
		float pad_in[8] = { 0.0f };
		short pad_out[8];

		for (int i = 0; i < rem; i++)
			pad_in[i] = in[start + i];

		_sse_convert_scale_ps_si16_8n(pad_out, pad_in, scale, 8);

		/* Only the real samples are copied back; padding is dropped */
		for (int i = 0; i < rem; i++)
			out[start + i] = pad_out[i];
	}
}
135
/*
 * 16*N single precision floats scaled and converted to 16-bit signed integer.
 *
 * out:   destination, receives one short per converted input sample
 * in:    source samples
 * scale: factor applied to every sample before conversion
 * len:   number of samples; only complete groups of 16 are converted, the
 *        trailing len % 16 samples are neither read nor written
 *
 * The float to integer step rounds according to the current SSE rounding
 * mode (round-to-nearest-even unless changed) and the pack step saturates to
 * the range of a 16-bit signed integer. Neither buffer needs to be aligned.
 */
static void _sse_convert_scale_ps_si16_16n(short *restrict out,
					   const float *restrict in,
					   float scale, int len)
{
	/* Broadcast the scale factor once; it does not change per iteration */
	const __m128 factor = _mm_load1_ps(&scale);

	for (int i = 0; i < len / 16; i++) {
		const float *src = &in[16 * i];
		short *dst = &out[16 * i];

		/* Load (unaligned) packed floats and scale */
		__m128 f0 = _mm_mul_ps(_mm_loadu_ps(src + 0), factor);
		__m128 f1 = _mm_mul_ps(_mm_loadu_ps(src + 4), factor);
		__m128 f2 = _mm_mul_ps(_mm_loadu_ps(src + 8), factor);
		__m128 f3 = _mm_mul_ps(_mm_loadu_ps(src + 12), factor);

		/* Convert and pack with saturation */
		__m128i lo = _mm_packs_epi32(_mm_cvtps_epi32(f0),
					     _mm_cvtps_epi32(f1));
		__m128i hi = _mm_packs_epi32(_mm_cvtps_epi32(f2),
					     _mm_cvtps_epi32(f3));

		/* Store (unaligned) */
		_mm_storeu_si128((__m128i *) (dst + 0), lo);
		_mm_storeu_si128((__m128i *) (dst + 8), hi);
	}
}
Philipp Maier7e07cf22017-03-15 18:09:35 +0100171#endif
172
Philipp Maier7e07cf22017-03-15 18:09:35 +0100173void convert_init(void)
174{
Philipp Maierfe976982017-03-16 14:50:25 +0100175 c.convert_scale_ps_si16_16n = base_convert_float_short;
176 c.convert_scale_ps_si16_8n = base_convert_float_short;
177 c.convert_scale_ps_si16 = base_convert_float_short;
178 c.convert_si16_ps_16n = base_convert_short_float;
179 c.convert_si16_ps = base_convert_short_float;
Philipp Maier7e07cf22017-03-15 18:09:35 +0100180
181#ifdef HAVE_SSE4_1
182 if (__builtin_cpu_supports("sse4.1")) {
183 c.convert_si16_ps_16n = &_sse_convert_si16_ps_16n;
184 c.convert_si16_ps = &_sse_convert_si16_ps;
185 }
Thomas Tsou9471d762013-08-20 21:24:24 -0400186#endif
187
Philipp Maier7e07cf22017-03-15 18:09:35 +0100188#ifdef HAVE_SSE3
189 if (__builtin_cpu_supports("sse3")) {
190 c.convert_scale_ps_si16_16n = _sse_convert_scale_ps_si16_16n;
191 c.convert_scale_ps_si16_8n = _sse_convert_scale_ps_si16_8n;
192 c.convert_scale_ps_si16 = _sse_convert_scale_ps_si16;
193 }
194#endif
195}
196
Tom Tsouf147b172015-03-25 12:55:11 -0700197void convert_float_short(short *out, const float *in, float scale, int len)
Thomas Tsou9471d762013-08-20 21:24:24 -0400198{
Thomas Tsou9471d762013-08-20 21:24:24 -0400199 if (!(len % 16))
Philipp Maier7e07cf22017-03-15 18:09:35 +0100200 c.convert_scale_ps_si16_16n(out, in, scale, len);
Thomas Tsou9471d762013-08-20 21:24:24 -0400201 else if (!(len % 8))
Philipp Maier7e07cf22017-03-15 18:09:35 +0100202 c.convert_scale_ps_si16_8n(out, in, scale, len);
Thomas Tsou9471d762013-08-20 21:24:24 -0400203 else
Philipp Maier7e07cf22017-03-15 18:09:35 +0100204 c.convert_scale_ps_si16(out, in, scale, len);
Thomas Tsou9471d762013-08-20 21:24:24 -0400205}
206
Tom Tsouf147b172015-03-25 12:55:11 -0700207void convert_short_float(float *out, const short *in, int len)
Thomas Tsou9471d762013-08-20 21:24:24 -0400208{
Thomas Tsou9471d762013-08-20 21:24:24 -0400209 if (!(len % 16))
Philipp Maier7e07cf22017-03-15 18:09:35 +0100210 c.convert_si16_ps_16n(out, in, len);
Thomas Tsou9471d762013-08-20 21:24:24 -0400211 else
Philipp Maier7e07cf22017-03-15 18:09:35 +0100212 c.convert_si16_ps(out, in, len);
Thomas Tsou9471d762013-08-20 21:24:24 -0400213}