/*
 * SSE type conversions
 * Copyright (C) 2013 Thomas Tsou <tom@tsou.cc>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 */
15
16#include <malloc.h>
17#include <string.h>
18#include "convert_sse_3.h"
19
20#ifdef HAVE_CONFIG_H
21#include "config.h"
22#endif
23
24#ifdef HAVE_SSE3
25#include <xmmintrin.h>
26#include <emmintrin.h>
27
/*
 * Scale single precision floats and convert them to 16-bit signed
 * integers, eight samples per iteration.
 *
 * out   - destination buffer; (len / 8) * 8 samples are written
 * in    - source buffer of floats (loaded unaligned)
 * scale - factor every sample is multiplied by before conversion
 * len   - number of input samples; a trailing group of fewer than
 *         eight samples is neither read nor written
 *
 * The 32-bit intermediate results are narrowed with a signed
 * saturating pack, so out-of-range values clamp to the 16-bit limits.
 */
void _sse_convert_scale_ps_si16_8n(short *restrict out,
				   const float *restrict in,
				   float scale, int len)
{
	const int groups = len / 8;
	/* Same scale factor in all four lanes */
	const __m128 factor = _mm_set1_ps(scale);

	for (int g = 0; g < groups; g++) {
		const float *src = in + 8 * g;
		short *dst = out + 8 * g;

		/* Unaligned load, scale, convert to 32-bit integers */
		__m128i lo = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src), factor));
		__m128i hi = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src + 4), factor));

		/* Narrow to 16 bits and store (unaligned) */
		_mm_storeu_si128((__m128i *) dst, _mm_packs_epi32(lo, hi));
	}
}
55
/*
 * Scale single precision floats and convert them to 16-bit signed
 * integers, for any sample count.
 *
 * out   - destination buffer of at least len samples
 * in    - source buffer of at least len floats
 * scale - factor every sample is multiplied by before conversion
 * len   - number of samples to convert; values <= 0 convert nothing
 *
 * Whole groups of eight go straight through the 8-wide helper. The
 * remaining len % 8 samples are staged through a zero-padded scratch
 * group and run through the same helper, so the tail is converted by
 * exactly the same code as the body. A plain C float-to-short
 * assignment would instead truncate toward zero and is undefined when
 * the scaled value does not fit in a short.
 */
void _sse_convert_scale_ps_si16(short *restrict out,
				const float *restrict in, float scale, int len)
{
	const int start = len / 8 * 8;
	const int rem = len % 8;

	_sse_convert_scale_ps_si16_8n(out, in, scale, len);

	/* Nothing left over (or a non-positive length) */
	if (rem <= 0)
		return;

	float tail_in[8] = { 0 };
	short tail_out[8];

	/* Only the first rem lanes carry real data; the rest stay zero */
	memcpy(tail_in, &in[start], (size_t) rem * sizeof(*in));
	_sse_convert_scale_ps_si16_8n(tail_out, tail_in, scale, 8);
	/* Copy back just the real samples so out[len..] is never touched */
	memcpy(&out[start], tail_out, (size_t) rem * sizeof(*out));
}
67
/*
 * Scale single precision floats and convert them to 16-bit signed
 * integers, sixteen samples per iteration.
 *
 * out   - destination buffer; (len / 16) * 16 samples are written
 * in    - source buffer of floats (loaded unaligned)
 * scale - factor every sample is multiplied by before conversion
 * len   - number of input samples; a trailing group of fewer than
 *         sixteen samples is neither read nor written
 *
 * The 32-bit intermediate results are narrowed with a signed
 * saturating pack, so out-of-range values clamp to the 16-bit limits.
 */
void _sse_convert_scale_ps_si16_16n(short *restrict out,
				    const float *restrict in,
				    float scale, int len)
{
	const int groups = len / 16;
	/* Same scale factor in all four lanes */
	const __m128 factor = _mm_set1_ps(scale);

	for (int g = 0; g < groups; g++) {
		const float *src = in + 16 * g;
		short *dst = out + 16 * g;

		/* Unaligned load, scale, convert to 32-bit integers */
		__m128i q0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src + 0), factor));
		__m128i q1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src + 4), factor));
		__m128i q2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src + 8), factor));
		__m128i q3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src + 12), factor));

		/* Narrow each pair to 16 bits and store (unaligned) */
		_mm_storeu_si128((__m128i *) (dst + 0), _mm_packs_epi32(q0, q1));
		_mm_storeu_si128((__m128i *) (dst + 8), _mm_packs_epi32(q2, q3));
	}
}
103#endif