Blame - Transceiver52M/arch/x86/convert_sse_4_1.c - osmo-trx

blob: 736a3769e74060b2501b88c9397eff3f914e382b [file] [log] [blame]

Philipp Maier	e8ae9fc	2017-03-20 12:08:42 +0100	[diff] [blame]	1	/*
				2	* SSE type conversions
				3	* Copyright (C) 2013 Thomas Tsou <tom@tsou.cc>
				4	*
				5	* This library is free software; you can redistribute it and/or
				6	* modify it under the terms of the GNU Lesser General Public
				7	* License as published by the Free Software Foundation; either
				8	* version 2.1 of the License, or (at your option) any later version.
				9	*
				10	* This library is distributed in the hope that it will be useful,
				11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				13	* Lesser General Public License for more details.
Philipp Maier	e8ae9fc	2017-03-20 12:08:42 +0100	[diff] [blame]	14	*/
				15
				16	#include <malloc.h>
				17	#include <string.h>
				18	#include "convert_sse_4_1.h"
				19
				20	#ifdef HAVE_CONFIG_H
				21	#include "config.h"
				22	#endif
				23
				24	#ifdef HAVE_SSE4_1
				25	#include <smmintrin.h>
				26
				27	/* 16N 16-bit signed integer converted to single precision floats /
				28	void _sse_convert_si16_ps_16n(float *restrict out,
				29	const short *restrict in, int len)
				30	{
				31	__m128i m0, m1, m2, m3, m4, m5;
				32	__m128 m6, m7, m8, m9;
				33
				34	for (int i = 0; i < len / 16; i++) {
				35	/* Load (unaligned) packed floats */
				36	m0 = _mm_loadu_si128((__m128i ) & in[16 i + 0]);
				37	m1 = _mm_loadu_si128((__m128i ) & in[16 i + 8]);
				38
				39	/* Unpack */
				40	m2 = _mm_cvtepi16_epi32(m0);
				41	m4 = _mm_cvtepi16_epi32(m1);
				42	m0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2));
				43	m1 = _mm_shuffle_epi32(m1, _MM_SHUFFLE(1, 0, 3, 2));
				44	m3 = _mm_cvtepi16_epi32(m0);
				45	m5 = _mm_cvtepi16_epi32(m1);
				46
				47	/* Convert */
				48	m6 = _mm_cvtepi32_ps(m2);
				49	m7 = _mm_cvtepi32_ps(m3);
				50	m8 = _mm_cvtepi32_ps(m4);
				51	m9 = _mm_cvtepi32_ps(m5);
				52
				53	/* Store */
				54	_mm_storeu_ps(&out[16 * i + 0], m6);
				55	_mm_storeu_ps(&out[16 * i + 4], m7);
				56	_mm_storeu_ps(&out[16 * i + 8], m8);
				57	_mm_storeu_ps(&out[16 * i + 12], m9);
				58	}
				59	}
				60
				61	/* 16N 16-bit signed integer conversion with remainder /
				62	void _sse_convert_si16_ps(float *restrict out,
				63	const short *restrict in, int len)
				64	{
				65	int start = len / 16 * 16;
				66
				67	_sse_convert_si16_ps_16n(out, in, len);
				68
				69	for (int i = 0; i < len % 16; i++)
				70	out[start + i] = in[start + i];
				71	}
				72
				73	#endif