blob: 63d8722acc00e963874507173371c5c697cc8446 [file] [log] [blame]
/*! \file conv_acc_sse.c
 * Accelerated Viterbi decoder implementation
 * for architectures with only SSSE3 available. */
/*
 * Copyright (C) 2013, 2014 Thomas Tsou <tom@tsou.cc>
 *
 * All Rights Reserved
 *
 * SPDX-License-Identifier: GPL-2.0+
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */
25
26#include <stdint.h>
Vadim Yanitskiy0d49f472017-05-28 18:20:02 +070027#include "config.h"
28
Tom Tsou34e228a2017-04-29 00:16:43 +070029#include <emmintrin.h>
30#include <tmmintrin.h>
31#include <xmmintrin.h>
32
Vadim Yanitskiy0d49f472017-05-28 18:20:02 +070033#if defined(HAVE_SSE4_1)
34#include <smmintrin.h>
Tom Tsou34e228a2017-04-29 00:16:43 +070035#endif
36
/* Alignment, in bytes, requested from _mm_malloc() for SSE buffers. */
#define SSE_ALIGN	16

/* Broadcast 16-bit integer
 * Replicates the lowest 16-bit lane of a 128-bit SSE register into all
 * eight lanes. Only AVX2 has a dedicated broadcast instruction, so on
 * SSE-only targets the value is spread with three successive unpacks
 * (16-bit, then 32-bit, then 64-bit).
 *
 * The operation is destructive: the argument is both source and
 * destination, and it is evaluated several times, so pass a plain
 * register variable.
 *
 * The expansion is a bare brace block, not a do { } while (0) statement.
 * NOTE(review): users in the included implementation header may depend on
 * that shape (e.g. invoking it without a trailing semicolon) - check them
 * before changing it.
 *
 * Input:
 * M0 - Low 16-bit element is read
 *
 * Output:
 * M0 - Contains broadcasted values
 */
#define SSE_BROADCAST(M0) \
{ \
	(M0) = _mm_unpacklo_epi16((M0), (M0)); \
	(M0) = _mm_unpacklo_epi32((M0), (M0)); \
	(M0) = _mm_unpacklo_epi64((M0), (M0)); \
}
Tom Tsou34e228a2017-04-29 00:16:43 +070057
/**
 * Include common SSE implementation
 */
Vadim Yanitskiye4fe71c2017-06-19 17:59:48 +070061#include <conv_acc_sse_impl.h>
Tom Tsou34e228a2017-04-29 00:16:43 +070062
63/* Aligned Memory Allocator
64 * SSE requires 16-byte memory alignment. We store relevant trellis values
65 * (accumulated sums, outputs, and path decisions) as 16 bit signed integers
66 * so the allocated memory is casted as such.
67 */
68__attribute__ ((visibility("hidden")))
Vadim Yanitskiy0d49f472017-05-28 18:20:02 +070069int16_t *osmo_conv_sse_vdec_malloc(size_t n)
Tom Tsou34e228a2017-04-29 00:16:43 +070070{
71 return (int16_t *) _mm_malloc(sizeof(int16_t) * n, SSE_ALIGN);
72}
73
/* Counterpart to osmo_conv_sse_vdec_malloc(): hands ptr to _mm_free().
 *
 * ptr: buffer to release.
 */
__attribute__ ((visibility("hidden")))
void osmo_conv_sse_vdec_free(int16_t *ptr)
{
	_mm_free(ptr);
}
79
/* Wrapper around _sse_metrics_k5_n2() taking 8-bit input values.
 *
 * Reads val[0] and val[1], widens them to int16_t and stores the pair
 * twice in a four-element array. That array is passed on together with
 * out, sums, paths and norm, which are forwarded unchanged.
 * NOTE(review): "k5"/"n2" presumably name the code's constraint length and
 * number of output bits - confirm against the implementation header.
 */
__attribute__ ((visibility("hidden")))
void osmo_conv_sse_metrics_k5_n2(const int8_t *val, const int16_t *out,
	int16_t *sums, int16_t *paths, int norm)
{
	const int16_t widened[4] = {
		(int16_t) val[0], (int16_t) val[1],
		(int16_t) val[0], (int16_t) val[1],
	};

	_sse_metrics_k5_n2(widened, out, sums, paths, norm);
}
88
/* Wrapper taking three 8-bit input values; it calls _sse_metrics_k5_n4().
 *
 * Reads val[0..2] and widens them to int16_t in a four-element array whose
 * last element is set to 0. No n3-specific routine is called here: the
 * padded array goes to _sse_metrics_k5_n4() together with out, sums, paths
 * and norm, which are forwarded unchanged.
 */
__attribute__ ((visibility("hidden")))
void osmo_conv_sse_metrics_k5_n3(const int8_t *val, const int16_t *out,
	int16_t *sums, int16_t *paths, int norm)
{
	const int16_t widened[4] = {
		(int16_t) val[0], (int16_t) val[1],
		(int16_t) val[2], 0,
	};

	_sse_metrics_k5_n4(widened, out, sums, paths, norm);
}
97
/* Wrapper around _sse_metrics_k5_n4() taking 8-bit input values.
 *
 * Reads val[0..3] and widens each to int16_t in a four-element array. That
 * array is passed on together with out, sums, paths and norm, which are
 * forwarded unchanged.
 */
__attribute__ ((visibility("hidden")))
void osmo_conv_sse_metrics_k5_n4(const int8_t *val, const int16_t *out,
	int16_t *sums, int16_t *paths, int norm)
{
	const int16_t widened[4] = {
		(int16_t) val[0], (int16_t) val[1],
		(int16_t) val[2], (int16_t) val[3],
	};

	_sse_metrics_k5_n4(widened, out, sums, paths, norm);
}
106
/* Wrapper around _sse_metrics_k7_n2() taking 8-bit input values.
 *
 * Reads val[0] and val[1], widens them to int16_t and stores the pair
 * twice in a four-element array. That array is passed on together with
 * out, sums, paths and norm, which are forwarded unchanged.
 * NOTE(review): "k7"/"n2" presumably name the code's constraint length and
 * number of output bits - confirm against the implementation header.
 */
__attribute__ ((visibility("hidden")))
void osmo_conv_sse_metrics_k7_n2(const int8_t *val, const int16_t *out,
	int16_t *sums, int16_t *paths, int norm)
{
	const int16_t widened[4] = {
		(int16_t) val[0], (int16_t) val[1],
		(int16_t) val[0], (int16_t) val[1],
	};

	_sse_metrics_k7_n2(widened, out, sums, paths, norm);
}
115
/* Wrapper taking three 8-bit input values; it calls _sse_metrics_k7_n4().
 *
 * Reads val[0..2] and widens them to int16_t in a four-element array whose
 * last element is set to 0. No n3-specific routine is called here: the
 * padded array goes to _sse_metrics_k7_n4() together with out, sums, paths
 * and norm, which are forwarded unchanged.
 */
__attribute__ ((visibility("hidden")))
void osmo_conv_sse_metrics_k7_n3(const int8_t *val, const int16_t *out,
	int16_t *sums, int16_t *paths, int norm)
{
	const int16_t widened[4] = {
		(int16_t) val[0], (int16_t) val[1],
		(int16_t) val[2], 0,
	};

	_sse_metrics_k7_n4(widened, out, sums, paths, norm);
}
124
/* Wrapper around _sse_metrics_k7_n4() taking 8-bit input values.
 *
 * Reads val[0..3] and widens each to int16_t in a four-element array. That
 * array is passed on together with out, sums, paths and norm, which are
 * forwarded unchanged.
 */
__attribute__ ((visibility("hidden")))
void osmo_conv_sse_metrics_k7_n4(const int8_t *val, const int16_t *out,
	int16_t *sums, int16_t *paths, int norm)
{
	const int16_t widened[4] = {
		(int16_t) val[0], (int16_t) val[1],
		(int16_t) val[2], (int16_t) val[3],
	};

	_sse_metrics_k7_n4(widened, out, sums, paths, norm);
}