Blame - Transceiver52M/arch/x86/convert_sse_3.c - osmo-trx

blob: f00ecf5b10bba970d225c790aac9712813593c45 [file] [log] [blame]

Philipp Maier	e8ae9fc	2017-03-20 12:08:42 +0100	[diff] [blame]	1	/*
				2	* SSE type conversions
				3	* Copyright (C) 2013 Thomas Tsou <tom@tsou.cc>
				4	*
				5	* This library is free software; you can redistribute it and/or
				6	* modify it under the terms of the GNU Lesser General Public
				7	* License as published by the Free Software Foundation; either
				8	* version 2.1 of the License, or (at your option) any later version.
				9	*
				10	* This library is distributed in the hope that it will be useful,
				11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				13	* Lesser General Public License for more details.
Philipp Maier	e8ae9fc	2017-03-20 12:08:42 +0100	[diff] [blame]	14	*/
				15
				16	#include <malloc.h>
				17	#include <string.h>
				18	#include "convert_sse_3.h"
				19
				20	#ifdef HAVE_CONFIG_H
				21	#include "config.h"
				22	#endif
				23
				24	#ifdef HAVE_SSE3
				25	#include <xmmintrin.h>
				26	#include <emmintrin.h>
				27
				28	/* 8N single precision floats scaled and converted to 16-bit signed integer /
				29	void _sse_convert_scale_ps_si16_8n(short *restrict out,
				30	const float *restrict in,
				31	float scale, int len)
				32	{
				33	__m128 m0, m1, m2;
				34	__m128i m4, m5;
				35
				36	for (int i = 0; i < len / 8; i++) {
				37	/* Load (unaligned) packed floats */
				38	m0 = _mm_loadu_ps(&in[8 * i + 0]);
				39	m1 = _mm_loadu_ps(&in[8 * i + 4]);
				40	m2 = _mm_load1_ps(&scale);
				41
				42	/* Scale */
				43	m0 = _mm_mul_ps(m0, m2);
				44	m1 = _mm_mul_ps(m1, m2);
				45
				46	/* Convert */
				47	m4 = _mm_cvtps_epi32(m0);
				48	m5 = _mm_cvtps_epi32(m1);
				49
				50	/* Pack and store */
				51	m5 = _mm_packs_epi32(m4, m5);
				52	_mm_storeu_si128((__m128i ) & out[8 i], m5);
				53	}
				54	}
				55
				56	/* 8N single precision floats scaled and converted with remainder /
				57	void _sse_convert_scale_ps_si16(short *restrict out,
				58	const float *restrict in, float scale, int len)
				59	{
				60	int start = len / 8 * 8;
				61
				62	_sse_convert_scale_ps_si16_8n(out, in, scale, len);
				63
				64	for (int i = 0; i < len % 8; i++)
				65	out[start + i] = in[start + i] * scale;
				66	}
				67
				68	/* 16N single precision floats scaled and converted to 16-bit signed integer /
				69	void _sse_convert_scale_ps_si16_16n(short *restrict out,
				70	const float *restrict in,
				71	float scale, int len)
				72	{
				73	__m128 m0, m1, m2, m3, m4;
				74	__m128i m5, m6, m7, m8;
				75
				76	for (int i = 0; i < len / 16; i++) {
				77	/* Load (unaligned) packed floats */
				78	m0 = _mm_loadu_ps(&in[16 * i + 0]);
				79	m1 = _mm_loadu_ps(&in[16 * i + 4]);
				80	m2 = _mm_loadu_ps(&in[16 * i + 8]);
				81	m3 = _mm_loadu_ps(&in[16 * i + 12]);
				82	m4 = _mm_load1_ps(&scale);
				83
				84	/* Scale */
				85	m0 = _mm_mul_ps(m0, m4);
				86	m1 = _mm_mul_ps(m1, m4);
				87	m2 = _mm_mul_ps(m2, m4);
				88	m3 = _mm_mul_ps(m3, m4);
				89
				90	/* Convert */
				91	m5 = _mm_cvtps_epi32(m0);
				92	m6 = _mm_cvtps_epi32(m1);
				93	m7 = _mm_cvtps_epi32(m2);
				94	m8 = _mm_cvtps_epi32(m3);
				95
				96	/* Pack and store */
				97	m5 = _mm_packs_epi32(m5, m6);
				98	m7 = _mm_packs_epi32(m7, m8);
				99	_mm_storeu_si128((__m128i ) & out[16 i + 0], m5);
				100	_mm_storeu_si128((__m128i ) & out[16 i + 8], m7);
				101	}
				102	}
				103	#endif