/*
 * SSE type conversions
 * Copyright (C) 2013 Thomas Tsou <tom@tsou.cc>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <malloc.h>
#include <string.h>

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#ifdef HAVE_SSE3
#include <xmmintrin.h>
#include <emmintrin.h>

#ifdef HAVE_SSE4_1
#include <smmintrin.h>

/* 16*N 16-bit signed integer converted to single precision floats */
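/*
 * Note: _mm_cvtepi16_epi32 (SSE4.1) sign-extends only the four 16-bit
 * values in the low 64 bits of its source register, so each 128-bit load
 * is widened in two steps: convert the low half, rotate the high half
 * down with _mm_shuffle_epi32(v, _MM_SHUFFLE(1, 0, 3, 2)), then convert
 * again before _mm_cvtepi32_ps turns the 32-bit integers into floats.
 */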
static void _sse_convert_si16_ps_16n(float *restrict out,
                                     short *restrict in,
                                     int len)
{
        __m128i m0, m1, m2, m3, m4, m5;
        __m128 m6, m7, m8, m9;

        for (int i = 0; i < len / 16; i++) {
                /* Load (unaligned) packed 16-bit integers */
                m0 = _mm_loadu_si128((__m128i *) &in[16 * i + 0]);
                m1 = _mm_loadu_si128((__m128i *) &in[16 * i + 8]);

                /* Unpack: sign-extend each half to 32-bit integers */
                m2 = _mm_cvtepi16_epi32(m0);
                m4 = _mm_cvtepi16_epi32(m1);
                m0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2));
                m1 = _mm_shuffle_epi32(m1, _MM_SHUFFLE(1, 0, 3, 2));
                m3 = _mm_cvtepi16_epi32(m0);
                m5 = _mm_cvtepi16_epi32(m1);

                /* Convert to single precision */
                m6 = _mm_cvtepi32_ps(m2);
                m7 = _mm_cvtepi32_ps(m3);
                m8 = _mm_cvtepi32_ps(m4);
                m9 = _mm_cvtepi32_ps(m5);

                /* Store */
                _mm_storeu_ps(&out[16 * i + 0], m6);
                _mm_storeu_ps(&out[16 * i + 4], m7);
                _mm_storeu_ps(&out[16 * i + 8], m8);
                _mm_storeu_ps(&out[16 * i + 12], m9);
        }
}

/* 16-bit signed integer conversion with remainder handling */
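/*
 * Run the 16-wide kernel over the largest multiple-of-16 prefix of the
 * buffer, then convert the remaining 0-15 samples with a scalar loop.
 */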
static void _sse_convert_si16_ps(float *restrict out,
                                 short *restrict in,
                                 int len)
{
        int start = len / 16 * 16;

        _sse_convert_si16_ps_16n(out, in, len);

        for (int i = 0; i < len % 16; i++)
                out[start + i] = in[start + i];
}
#endif /* HAVE_SSE4_1 */

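/*
 * Float-to-int16 path: the scale factor is broadcast with _mm_load1_ps and
 * applied with a packed multiply; _mm_cvtps_epi32 rounds to 32-bit integers
 * using the current MXCSR rounding mode (round-to-nearest by default), and
 * _mm_packs_epi32 narrows to 16 bits with signed saturation, so out-of-range
 * results clamp to -32768/32767 instead of wrapping.
 */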
/* 8*N single precision floats scaled and converted to 16-bit signed integer */
static void _sse_convert_scale_ps_si16_8n(short *restrict out,
                                           float *restrict in,
                                           float scale, int len)
{
        __m128 m0, m1, m2;
        __m128i m4, m5;

        for (int i = 0; i < len / 8; i++) {
                /* Load (unaligned) packed floats */
                m0 = _mm_loadu_ps(&in[8 * i + 0]);
                m1 = _mm_loadu_ps(&in[8 * i + 4]);
                m2 = _mm_load1_ps(&scale);

                /* Scale */
                m0 = _mm_mul_ps(m0, m2);
                m1 = _mm_mul_ps(m1, m2);

                /* Convert */
                m4 = _mm_cvtps_epi32(m0);
                m5 = _mm_cvtps_epi32(m1);

                /* Pack and store */
                m5 = _mm_packs_epi32(m4, m5);
                _mm_storeu_si128((__m128i *) &out[8 * i], m5);
        }
}

/* Single precision floats scaled and converted with remainder handling */
static void _sse_convert_scale_ps_si16(short *restrict out,
                                       float *restrict in,
                                       float scale, int len)
{
        int start = len / 8 * 8;

        _sse_convert_scale_ps_si16_8n(out, in, scale, len);

        for (int i = 0; i < len % 8; i++)
                out[start + i] = in[start + i] * scale;
}

/* 16*N single precision floats scaled and converted to 16-bit signed integer */
static void _sse_convert_scale_ps_si16_16n(short *restrict out,
                                            float *restrict in,
                                            float scale, int len)
{
        __m128 m0, m1, m2, m3, m4;
        __m128i m5, m6, m7, m8;

        for (int i = 0; i < len / 16; i++) {
                /* Load (unaligned) packed floats */
                m0 = _mm_loadu_ps(&in[16 * i + 0]);
                m1 = _mm_loadu_ps(&in[16 * i + 4]);
                m2 = _mm_loadu_ps(&in[16 * i + 8]);
                m3 = _mm_loadu_ps(&in[16 * i + 12]);
                m4 = _mm_load1_ps(&scale);

                /* Scale */
                m0 = _mm_mul_ps(m0, m4);
                m1 = _mm_mul_ps(m1, m4);
                m2 = _mm_mul_ps(m2, m4);
                m3 = _mm_mul_ps(m3, m4);

                /* Convert */
                m5 = _mm_cvtps_epi32(m0);
                m6 = _mm_cvtps_epi32(m1);
                m7 = _mm_cvtps_epi32(m2);
                m8 = _mm_cvtps_epi32(m3);

                /* Pack and store */
                m5 = _mm_packs_epi32(m5, m6);
                m7 = _mm_packs_epi32(m7, m8);
                _mm_storeu_si128((__m128i *) &out[16 * i + 0], m5);
                _mm_storeu_si128((__m128i *) &out[16 * i + 8], m7);
        }
}
#else /* HAVE_SSE3 */
/*
 * Scalar fallback when SSE3 is unavailable.  Note that, unlike the SSE
 * path, the plain cast does not saturate out-of-range values.
 */
static void convert_scale_ps_si16(short *out, float *in, float scale, int len)
{
        for (int i = 0; i < len; i++)
                out[i] = in[i] * scale;
}
#endif

#ifndef HAVE_SSE4_1
/* Scalar fallback when SSE4.1 is unavailable */
static void convert_si16_ps(float *out, short *in, int len)
{
        for (int i = 0; i < len; i++)
                out[i] = in[i];
}
#endif

void convert_float_short(short *out, float *in, float scale, int len)
{
#ifdef HAVE_SSE3
        if (!(len % 16))
                _sse_convert_scale_ps_si16_16n(out, in, scale, len);
        else if (!(len % 8))
                _sse_convert_scale_ps_si16_8n(out, in, scale, len);
        else
                _sse_convert_scale_ps_si16(out, in, scale, len);
#else
        convert_scale_ps_si16(out, in, scale, len);
#endif
}

void convert_short_float(float *out, short *in, int len)
{
#ifdef HAVE_SSE4_1
        if (!(len % 16))
                _sse_convert_si16_ps_16n(out, in, len);
        else
                _sse_convert_si16_ps(out, in, len);
#else
        convert_si16_ps(out, in, len);
#endif
}
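
/*
 * Example usage (illustrative sketch only; the buffer names and sizes below
 * are hypothetical and not part of this file):
 *
 *        float samples_f[1024];
 *        short samples_i16[1024];
 *
 *        // Scale floats in roughly [-1.0, 1.0] to the full 16-bit range.
 *        // In the SSE path, values that overflow after scaling saturate.
 *        convert_float_short(samples_i16, samples_f, 32767.0f, 1024);
 *
 *        // Widen 16-bit samples back to floats; no scaling is applied.
 *        convert_short_float(samples_f, samples_i16, 1024);
 */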