Blame - src/conv_acc_sse_avx.c - libosmocore

blob: 5ac3c163ced76c93d1afc47de24d9fd3a2fe8b79 [file] [log] [blame]

Neels Hofmeyr	17518fe	2017-06-20 04:35:06 +0200	[diff] [blame]	1	/*! \file conv_acc_sse_avx.c
Vadim Yanitskiy	46e533c	2017-06-19 18:21:02 +0700	[diff] [blame]	2	* Accelerated Viterbi decoder implementation
Harald Welte	b93f60f	2017-11-17 11:41:34 +0100	[diff] [blame]	3	* for architectures with both SSSE3 and AVX2 support. */
Neels Hofmeyr	17518fe	2017-06-20 04:35:06 +0200	[diff] [blame]	4	/*
Vadim Yanitskiy	0d49f47	2017-05-28 18:20:02 +0700	[diff] [blame]	5	* Copyright (C) 2013, 2014 Thomas Tsou <tom@tsou.cc>
				6	*
				7	* All Rights Reserved
				8	*
Harald Welte	e08da97	2017-11-13 01:00:26 +0900	[diff] [blame]	9	* SPDX-License-Identifier: GPL-2.0+
				10	*
Vadim Yanitskiy	0d49f47	2017-05-28 18:20:02 +0700	[diff] [blame]	11	* This program is free software; you can redistribute it and/or modify
				12	* it under the terms of the GNU General Public License as published by
				13	* the Free Software Foundation; either version 2 of the License, or
				14	* (at your option) any later version.
				15	*
				16	* This program is distributed in the hope that it will be useful,
				17	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
				19	* GNU General Public License for more details.
				20	*
				21	* You should have received a copy of the GNU General Public License along
				22	* with this program; if not, write to the Free Software Foundation, Inc.,
				23	* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
				24	*/
				25
				26	#include <stdint.h>
				27	#include "config.h"
				28
				29	#include <emmintrin.h>
				30	#include <tmmintrin.h>
				31	#include <xmmintrin.h>
				32	#include <immintrin.h>
				33
				34	#if defined(HAVE_SSE4_1)
				35	#include <smmintrin.h>
				36	#endif
				37
				38	#define SSE_ALIGN 16
				39
				40
				41	/* Broadcast 16-bit integer
				42	* Repeat the low 16-bit integer to all elements of the 128-bit SSE
				43	* register. This AVX2 build uses the dedicated broadcast instruction
				44	* (_mm_broadcastw_epi16); SSE-only builds use repeated unpacks instead.
				45	* The operation is destructive: M0 is overwritten with the result.
				46	*
				47	* Input:
				48	* M0 - Low 16-bit element is read
				49	*
				50	* Output:
				51	* M0 - Contains the broadcast value in every 16-bit element
				52	*/
				53	#define SSE_BROADCAST(M0) \
				54	{ \
				55	M0 = _mm_broadcastw_epi16(M0); \
				56	}
				57
				58	/**
				59	* Include common SSE implementation
				60	*/
Vadim Yanitskiy	e4fe71c	2017-06-19 17:59:48 +0700	[diff] [blame]	61	#include <conv_acc_sse_impl.h>
Vadim Yanitskiy	0d49f47	2017-05-28 18:20:02 +0700	[diff] [blame]	62
				63	/* Aligned Memory Allocator
				64	* SSE requires 16-byte memory alignment. We store relevant trellis values
				65	* (accumulated sums, outputs, and path decisions) as 16 bit signed integers
				66	* so the allocated memory is casted as such.
				67	*/
				68	__attribute__ ((visibility("hidden")))
				69	int16_t *osmo_conv_sse_avx_vdec_malloc(size_t n)
				70	{
				71	return (int16_t ) _mm_malloc(sizeof(int16_t) n, SSE_ALIGN);
				72	}
				73
				/* Aligned Memory Release
				 * Hands ptr unchanged to _mm_free(). Intended as the counterpart of
				 * osmo_conv_sse_avx_vdec_malloc(), which allocates with _mm_malloc().
				 *
				 * ptr - buffer to release
				 */
				__attribute__ ((visibility("hidden")))
				void osmo_conv_sse_avx_vdec_free(int16_t *ptr)
				{
					_mm_free(ptr);
				}
				79
				/* Metrics wrapper around _sse_metrics_k5_n2().
				 * (K=5 / N=2 naming is taken from the function and helper identifiers.)
				 *
				 * val   - input values; only val[0] and val[1] are read
				 * out   - forwarded unchanged to the helper
				 * sums  - forwarded unchanged to the helper
				 * paths - forwarded unchanged to the helper
				 * norm  - forwarded unchanged to the helper
				 *
				 * The two int8_t inputs are widened to int16_t and the pair is repeated
				 * so that the four-element array handed to the helper is fully populated.
				 */
				__attribute__ ((visibility("hidden")))
				void osmo_conv_sse_avx_metrics_k5_n2(const int8_t *val,
					const int16_t *out, int16_t *sums, int16_t *paths, int norm)
				{
					const int16_t _val[4] = { val[0], val[1], val[0], val[1] };

					_sse_metrics_k5_n2(_val, out, sums, paths, norm);
				}
				88
				/* Metrics wrapper for three inputs, implemented on top of
				 * _sse_metrics_k5_n4().
				 * (K=5 / N=3 naming is taken from the function identifier.)
				 *
				 * val   - input values; val[0], val[1] and val[2] are read
				 * out   - forwarded unchanged to the helper
				 * sums  - forwarded unchanged to the helper
				 * paths - forwarded unchanged to the helper
				 * norm  - forwarded unchanged to the helper
				 *
				 * The three int8_t inputs are widened to int16_t and padded with a
				 * trailing zero so the four-input helper can be reused.
				 */
				__attribute__ ((visibility("hidden")))
				void osmo_conv_sse_avx_metrics_k5_n3(const int8_t *val,
					const int16_t *out, int16_t *sums, int16_t *paths, int norm)
				{
					const int16_t _val[4] = { val[0], val[1], val[2], 0 };

					_sse_metrics_k5_n4(_val, out, sums, paths, norm);
				}
				97
				/* Metrics wrapper around _sse_metrics_k5_n4().
				 * (K=5 / N=4 naming is taken from the function and helper identifiers.)
				 *
				 * val   - input values; val[0] through val[3] are read
				 * out   - forwarded unchanged to the helper
				 * sums  - forwarded unchanged to the helper
				 * paths - forwarded unchanged to the helper
				 * norm  - forwarded unchanged to the helper
				 *
				 * The four int8_t inputs are widened to int16_t before being handed
				 * to the helper.
				 */
				__attribute__ ((visibility("hidden")))
				void osmo_conv_sse_avx_metrics_k5_n4(const int8_t *val,
					const int16_t *out, int16_t *sums, int16_t *paths, int norm)
				{
					const int16_t _val[4] = { val[0], val[1], val[2], val[3] };

					_sse_metrics_k5_n4(_val, out, sums, paths, norm);
				}
				106
				/* Metrics wrapper around _sse_metrics_k7_n2().
				 * (K=7 / N=2 naming is taken from the function and helper identifiers.)
				 *
				 * val   - input values; only val[0] and val[1] are read
				 * out   - forwarded unchanged to the helper
				 * sums  - forwarded unchanged to the helper
				 * paths - forwarded unchanged to the helper
				 * norm  - forwarded unchanged to the helper
				 *
				 * The two int8_t inputs are widened to int16_t and the pair is repeated
				 * so that the four-element array handed to the helper is fully populated.
				 */
				__attribute__ ((visibility("hidden")))
				void osmo_conv_sse_avx_metrics_k7_n2(const int8_t *val,
					const int16_t *out, int16_t *sums, int16_t *paths, int norm)
				{
					const int16_t _val[4] = { val[0], val[1], val[0], val[1] };

					_sse_metrics_k7_n2(_val, out, sums, paths, norm);
				}
				115
				/* Metrics wrapper for three inputs, implemented on top of
				 * _sse_metrics_k7_n4().
				 * (K=7 / N=3 naming is taken from the function identifier.)
				 *
				 * val   - input values; val[0], val[1] and val[2] are read
				 * out   - forwarded unchanged to the helper
				 * sums  - forwarded unchanged to the helper
				 * paths - forwarded unchanged to the helper
				 * norm  - forwarded unchanged to the helper
				 *
				 * The three int8_t inputs are widened to int16_t and padded with a
				 * trailing zero so the four-input helper can be reused.
				 */
				__attribute__ ((visibility("hidden")))
				void osmo_conv_sse_avx_metrics_k7_n3(const int8_t *val,
					const int16_t *out, int16_t *sums, int16_t *paths, int norm)
				{
					const int16_t _val[4] = { val[0], val[1], val[2], 0 };

					_sse_metrics_k7_n4(_val, out, sums, paths, norm);
				}
				124
				/* Metrics wrapper around _sse_metrics_k7_n4().
				 * (K=7 / N=4 naming is taken from the function and helper identifiers.)
				 *
				 * val   - input values; val[0] through val[3] are read
				 * out   - forwarded unchanged to the helper
				 * sums  - forwarded unchanged to the helper
				 * paths - forwarded unchanged to the helper
				 * norm  - forwarded unchanged to the helper
				 *
				 * The four int8_t inputs are widened to int16_t before being handed
				 * to the helper.
				 */
				__attribute__ ((visibility("hidden")))
				void osmo_conv_sse_avx_metrics_k7_n4(const int8_t *val,
					const int16_t *out, int16_t *sums, int16_t *paths, int norm)
				{
					const int16_t _val[4] = { val[0], val[1], val[2], val[3] };

					_sse_metrics_k7_n4(_val, out, sums, paths, norm);
				}