Blame - Transceiver52M/x86/convert_sse_3.c - osmo-trx

blob: 255db674c9bd43f9bac1e8658c219176e1452104 [file] [log] [blame]

Philipp Maier	e8ae9fc	2017-03-20 12:08:42 +0100	[diff] [blame]	1	/*
				2	* SSE type conversions
				3	* Copyright (C) 2013 Thomas Tsou <tom@tsou.cc>
				4	*
				5	* This library is free software; you can redistribute it and/or
				6	* modify it under the terms of the GNU Lesser General Public
				7	* License as published by the Free Software Foundation; either
				8	* version 2.1 of the License, or (at your option) any later version.
				9	*
				10	* This library is distributed in the hope that it will be useful,
				11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				13	* Lesser General Public License for more details.
				14	*
				15	* You should have received a copy of the GNU Lesser General Public
				16	* License along with this library; if not, write to the Free Software
				17	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
				18	*/
				19
				20	#include <malloc.h>
				21	#include <string.h>
				22	#include "convert_sse_3.h"
				23
				24	#ifdef HAVE_CONFIG_H
				25	#include "config.h"
				26	#endif
				27
				28	#ifdef HAVE_SSE3
				29	#include <xmmintrin.h>
				30	#include <emmintrin.h>
				31
				32	/* 8N single precision floats scaled and converted to 16-bit signed integer /
				33	void _sse_convert_scale_ps_si16_8n(short *restrict out,
				34	const float *restrict in,
				35	float scale, int len)
				36	{
				37	__m128 m0, m1, m2;
				38	__m128i m4, m5;
				39
				40	for (int i = 0; i < len / 8; i++) {
				41	/* Load (unaligned) packed floats */
				42	m0 = _mm_loadu_ps(&in[8 * i + 0]);
				43	m1 = _mm_loadu_ps(&in[8 * i + 4]);
				44	m2 = _mm_load1_ps(&scale);
				45
				46	/* Scale */
				47	m0 = _mm_mul_ps(m0, m2);
				48	m1 = _mm_mul_ps(m1, m2);
				49
				50	/* Convert */
				51	m4 = _mm_cvtps_epi32(m0);
				52	m5 = _mm_cvtps_epi32(m1);
				53
				54	/* Pack and store */
				55	m5 = _mm_packs_epi32(m4, m5);
				56	_mm_storeu_si128((__m128i ) & out[8 i], m5);
				57	}
				58	}
				59
				60	/* 8N single precision floats scaled and converted with remainder /
				61	void _sse_convert_scale_ps_si16(short *restrict out,
				62	const float *restrict in, float scale, int len)
				63	{
				64	int start = len / 8 * 8;
				65
				66	_sse_convert_scale_ps_si16_8n(out, in, scale, len);
				67
				68	for (int i = 0; i < len % 8; i++)
				69	out[start + i] = in[start + i] * scale;
				70	}
				71
				72	/* 16N single precision floats scaled and converted to 16-bit signed integer /
				73	void _sse_convert_scale_ps_si16_16n(short *restrict out,
				74	const float *restrict in,
				75	float scale, int len)
				76	{
				77	__m128 m0, m1, m2, m3, m4;
				78	__m128i m5, m6, m7, m8;
				79
				80	for (int i = 0; i < len / 16; i++) {
				81	/* Load (unaligned) packed floats */
				82	m0 = _mm_loadu_ps(&in[16 * i + 0]);
				83	m1 = _mm_loadu_ps(&in[16 * i + 4]);
				84	m2 = _mm_loadu_ps(&in[16 * i + 8]);
				85	m3 = _mm_loadu_ps(&in[16 * i + 12]);
				86	m4 = _mm_load1_ps(&scale);
				87
				88	/* Scale */
				89	m0 = _mm_mul_ps(m0, m4);
				90	m1 = _mm_mul_ps(m1, m4);
				91	m2 = _mm_mul_ps(m2, m4);
				92	m3 = _mm_mul_ps(m3, m4);
				93
				94	/* Convert */
				95	m5 = _mm_cvtps_epi32(m0);
				96	m6 = _mm_cvtps_epi32(m1);
				97	m7 = _mm_cvtps_epi32(m2);
				98	m8 = _mm_cvtps_epi32(m3);
				99
				100	/* Pack and store */
				101	m5 = _mm_packs_epi32(m5, m6);
				102	m7 = _mm_packs_epi32(m7, m8);
				103	_mm_storeu_si128((__m128i ) & out[16 i + 0], m5);
				104	_mm_storeu_si128((__m128i ) & out[16 i + 8], m7);
				105	}
				106	}
				107	#endif