Blame - Transceiver52M/arch/x86/convert_sse_4_1.c - osmo-trx

blob: 42a235c4ae2a15a4e6b57ae1ab5cd82bf976e9f3 [file] [log] [blame]

Philipp Maier	e8ae9fc	2017-03-20 12:08:42 +0100	[diff] [blame]	1	/*
				2	* SSE type conversions
				3	* Copyright (C) 2013 Thomas Tsou <tom@tsou.cc>
				4	*
				5	* This library is free software; you can redistribute it and/or
				6	* modify it under the terms of the GNU Lesser General Public
				7	* License as published by the Free Software Foundation; either
				8	* version 2.1 of the License, or (at your option) any later version.
				9	*
				10	* This library is distributed in the hope that it will be useful,
				11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				13	* Lesser General Public License for more details.
				14	*
				15	* You should have received a copy of the GNU Lesser General Public
				16	* License along with this library; if not, write to the Free Software
				17	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
				18	*/
				19
				20	#include <malloc.h>
				21	#include <string.h>
				22	#include "convert_sse_4_1.h"
				23
				24	#ifdef HAVE_CONFIG_H
				25	#include "config.h"
				26	#endif
				27
				28	#ifdef HAVE_SSE4_1
				29	#include <smmintrin.h>
				30
				/*
				 * 16N 16-bit signed integers converted to single precision floats.
				 *
				 * Converts the first (len / 16) * 16 samples of 'in' into 'out', sixteen
				 * samples per loop iteration. Any trailing len % 16 samples are NOT
				 * converted and the corresponding 'out' elements are left untouched.
				 * Loads and stores are unaligned, so neither buffer needs 16-byte
				 * alignment.
				 *
				 *   out - destination, room for at least (len / 16) * 16 floats
				 *   in  - source, at least (len / 16) * 16 shorts
				 *   len - number of samples; values below 16 convert nothing
				 */
				void _sse_convert_si16_ps_16n(float *restrict out,
							      const short *restrict in, int len)
				{
					__m128i m0, m1, m2, m3, m4, m5;
					__m128 m6, m7, m8, m9;
					const int blocks = len / 16;

					for (int i = 0; i < blocks; i++) {
						/* Load (unaligned) packed 16-bit integers, 8 per register */
						m0 = _mm_loadu_si128((const __m128i *) &in[16 * i + 0]);
						m1 = _mm_loadu_si128((const __m128i *) &in[16 * i + 8]);

						/*
						 * Unpack: _mm_cvtepi16_epi32 sign-extends only the low
						 * four 16-bit lanes, so swap the 64-bit halves with a
						 * shuffle to bring the upper four lanes down before
						 * widening them as well.
						 */
						m2 = _mm_cvtepi16_epi32(m0);
						m4 = _mm_cvtepi16_epi32(m1);
						m0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2));
						m1 = _mm_shuffle_epi32(m1, _MM_SHUFFLE(1, 0, 3, 2));
						m3 = _mm_cvtepi16_epi32(m0);
						m5 = _mm_cvtepi16_epi32(m1);

						/* Convert 32-bit integers to single precision floats */
						m6 = _mm_cvtepi32_ps(m2);
						m7 = _mm_cvtepi32_ps(m3);
						m8 = _mm_cvtepi32_ps(m4);
						m9 = _mm_cvtepi32_ps(m5);

						/* Store (unaligned) in the original sample order */
						_mm_storeu_ps(&out[16 * i + 0], m6);
						_mm_storeu_ps(&out[16 * i + 4], m7);
						_mm_storeu_ps(&out[16 * i + 8], m8);
						_mm_storeu_ps(&out[16 * i + 12], m9);
					}
				}
				64
				/*
				 * 16N 16-bit signed integer conversion with remainder.
				 *
				 * Converts all 'len' samples of 'in' to single precision floats in
				 * 'out'. The leading multiple-of-16 samples are handed to
				 * _sse_convert_si16_ps_16n(); the remaining len % 16 samples are
				 * converted one at a time.
				 *
				 *   out - destination, room for at least 'len' floats
				 *   in  - source, at least 'len' shorts
				 *   len - number of samples
				 */
				void _sse_convert_si16_ps(float *restrict out,
							  const short *restrict in, int len)
				{
					/* Index of the first sample not covered by the 16N kernel */
					int start = len / 16 * 16;

					_sse_convert_si16_ps_16n(out, in, len);

					/* Scalar tail for the last len % 16 samples */
					for (int i = 0; i < len % 16; i++) {
						out[start + i] = in[start + i];
					}
				}
				76
				77	#endif