Neels Hofmeyr | 17518fe | 2017-06-20 04:35:06 +0200 | [diff] [blame] | 1 | /*! \file conv_acc_sse_avx.c |
Vadim Yanitskiy | 46e533c | 2017-06-19 18:21:02 +0700 | [diff] [blame] | 2 | * Accelerated Viterbi decoder implementation |
Harald Welte | b93f60f | 2017-11-17 11:41:34 +0100 | [diff] [blame] | 3 | * for architectures with both SSSE3 and AVX2 support. */ |
Neels Hofmeyr | 17518fe | 2017-06-20 04:35:06 +0200 | [diff] [blame] | 4 | /* |
Vadim Yanitskiy | 0d49f47 | 2017-05-28 18:20:02 +0700 | [diff] [blame] | 5 | * Copyright (C) 2013, 2014 Thomas Tsou <tom@tsou.cc> |
| 6 | * |
| 7 | * All Rights Reserved |
| 8 | * |
Harald Welte | e08da97 | 2017-11-13 01:00:26 +0900 | [diff] [blame] | 9 | * SPDX-License-Identifier: GPL-2.0+ |
| 10 | * |
Vadim Yanitskiy | 0d49f47 | 2017-05-28 18:20:02 +0700 | [diff] [blame] | 11 | * This program is free software; you can redistribute it and/or modify |
| 12 | * it under the terms of the GNU General Public License as published by |
| 13 | * the Free Software Foundation; either version 2 of the License, or |
| 14 | * (at your option) any later version. |
| 15 | * |
| 16 | * This program is distributed in the hope that it will be useful, |
| 17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 19 | * GNU General Public License for more details. |
| 20 | * |
| 21 | * You should have received a copy of the GNU General Public License along |
| 22 | * with this program; if not, write to the Free Software Foundation, Inc., |
| 23 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| 24 | */ |
| 25 | |
#include <stddef.h>
#include <stdint.h>
#include "config.h"

#include <emmintrin.h>
#include <tmmintrin.h>
#include <xmmintrin.h>
#include <immintrin.h>

#if defined(HAVE_SSE4_1)
#include <smmintrin.h>
#endif
| 37 | |
| 38 | #define SSE_ALIGN 16 |
| 39 | |
| 40 | |
/* Broadcast 16-bit integer
 * Copy the low 16-bit element of a 128-bit register into every 16-bit
 * element of that same register. This translation unit is the AVX2 variant,
 * so the dedicated broadcast intrinsic is used rather than a sequence of
 * unpack operations. The operation is destructive: the argument is both
 * the source and the destination.
 *
 * The expansion is a plain brace block (not do/while), so it forms a
 * complete statement even when no semicolon follows the invocation.
 *
 * Input:
 * M0 - __m128i lvalue; only its low 16-bit element is read
 *
 * Output:
 * M0 - Overwritten with the broadcasted value in all elements
 */
#define SSE_BROADCAST(M0) \
	{ (M0) = _mm_broadcastw_epi16(M0); }
| 57 | |
| 58 | /** |
| 59 | * Include common SSE implementation |
| 60 | */ |
Vadim Yanitskiy | e4fe71c | 2017-06-19 17:59:48 +0700 | [diff] [blame] | 61 | #include <conv_acc_sse_impl.h> |
Vadim Yanitskiy | 0d49f47 | 2017-05-28 18:20:02 +0700 | [diff] [blame] | 62 | |
| 63 | /* Aligned Memory Allocator |
| 64 | * SSE requires 16-byte memory alignment. We store relevant trellis values |
| 65 | * (accumulated sums, outputs, and path decisions) as 16 bit signed integers |
| 66 | * so the allocated memory is casted as such. |
| 67 | */ |
| 68 | __attribute__ ((visibility("hidden"))) |
| 69 | int16_t *osmo_conv_sse_avx_vdec_malloc(size_t n) |
| 70 | { |
| 71 | return (int16_t *) _mm_malloc(sizeof(int16_t) * n, SSE_ALIGN); |
| 72 | } |
| 73 | |
/* Aligned Memory Release
 * Return a buffer to the aligned allocator by passing it to _mm_free().
 *
 * Input:
 * ptr - Buffer to release; presumably one obtained from
 *       osmo_conv_sse_avx_vdec_malloc() (confirm against callers)
 */
__attribute__ ((visibility("hidden")))
void osmo_conv_sse_avx_vdec_free(int16_t *ptr)
{
	_mm_free((void *) ptr);
}
| 79 | |
/* Metrics wrapper, K=5 / N=2 variant
 * Widens the first two 8-bit input values to 16 bit, lays them out twice
 * in a row as { v0, v1, v0, v1 } and hands that vector to
 * _sse_metrics_k5_n2() from the common SSE implementation.
 *
 * Input:
 * val   - Input values; val[0] and val[1] are read
 * out   - Forwarded unchanged (presumably the trellis outputs)
 * sums  - Forwarded unchanged (presumably the accumulated sums)
 * paths - Forwarded unchanged (presumably the path decisions)
 * norm  - Forwarded unchanged (presumably a normalization flag)
 */
__attribute__ ((visibility("hidden")))
void osmo_conv_sse_avx_metrics_k5_n2(const int8_t *val,
	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	const int16_t first = val[0];
	const int16_t second = val[1];
	const int16_t widened[4] = { first, second, first, second };

	_sse_metrics_k5_n2(widened, out, sums, paths, norm);
}
| 88 | |
/* Metrics wrapper, K=5 / N=3 variant
 * Widens the first three 8-bit input values to 16 bit and pads the vector
 * with a zero in the fourth slot, so that the four-value routine
 * _sse_metrics_k5_n4() from the common SSE implementation can be reused.
 *
 * Input:
 * val   - Input values; val[0], val[1] and val[2] are read
 * out   - Forwarded unchanged (presumably the trellis outputs)
 * sums  - Forwarded unchanged (presumably the accumulated sums)
 * paths - Forwarded unchanged (presumably the path decisions)
 * norm  - Forwarded unchanged (presumably a normalization flag)
 */
__attribute__ ((visibility("hidden")))
void osmo_conv_sse_avx_metrics_k5_n3(const int8_t *val,
	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	int16_t widened[4];

	widened[0] = val[0];
	widened[1] = val[1];
	widened[2] = val[2];
	widened[3] = 0;	/* no fourth input value for N=3 */

	_sse_metrics_k5_n4(widened, out, sums, paths, norm);
}
| 97 | |
/* Metrics wrapper, K=5 / N=4 variant
 * Widens the first four 8-bit input values to 16 bit and hands them to
 * _sse_metrics_k5_n4() from the common SSE implementation.
 *
 * Input:
 * val   - Input values; val[0] through val[3] are read
 * out   - Forwarded unchanged (presumably the trellis outputs)
 * sums  - Forwarded unchanged (presumably the accumulated sums)
 * paths - Forwarded unchanged (presumably the path decisions)
 * norm  - Forwarded unchanged (presumably a normalization flag)
 */
__attribute__ ((visibility("hidden")))
void osmo_conv_sse_avx_metrics_k5_n4(const int8_t *val,
	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	int16_t widened[4];
	int i;

	for (i = 0; i < 4; i++)
		widened[i] = val[i];

	_sse_metrics_k5_n4(widened, out, sums, paths, norm);
}
| 106 | |
/* Metrics wrapper, K=7 / N=2 variant
 * Widens the first two 8-bit input values to 16 bit, lays them out twice
 * in a row as { v0, v1, v0, v1 } and hands that vector to
 * _sse_metrics_k7_n2() from the common SSE implementation.
 *
 * Input:
 * val   - Input values; val[0] and val[1] are read
 * out   - Forwarded unchanged (presumably the trellis outputs)
 * sums  - Forwarded unchanged (presumably the accumulated sums)
 * paths - Forwarded unchanged (presumably the path decisions)
 * norm  - Forwarded unchanged (presumably a normalization flag)
 */
__attribute__ ((visibility("hidden")))
void osmo_conv_sse_avx_metrics_k7_n2(const int8_t *val,
	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	const int16_t first = val[0];
	const int16_t second = val[1];
	const int16_t widened[4] = { first, second, first, second };

	_sse_metrics_k7_n2(widened, out, sums, paths, norm);
}
| 115 | |
/* Metrics wrapper, K=7 / N=3 variant
 * Widens the first three 8-bit input values to 16 bit and pads the vector
 * with a zero in the fourth slot, so that the four-value routine
 * _sse_metrics_k7_n4() from the common SSE implementation can be reused.
 *
 * Input:
 * val   - Input values; val[0], val[1] and val[2] are read
 * out   - Forwarded unchanged (presumably the trellis outputs)
 * sums  - Forwarded unchanged (presumably the accumulated sums)
 * paths - Forwarded unchanged (presumably the path decisions)
 * norm  - Forwarded unchanged (presumably a normalization flag)
 */
__attribute__ ((visibility("hidden")))
void osmo_conv_sse_avx_metrics_k7_n3(const int8_t *val,
	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	int16_t widened[4];

	widened[0] = val[0];
	widened[1] = val[1];
	widened[2] = val[2];
	widened[3] = 0;	/* no fourth input value for N=3 */

	_sse_metrics_k7_n4(widened, out, sums, paths, norm);
}
| 124 | |
/* Metrics wrapper, K=7 / N=4 variant
 * Widens the first four 8-bit input values to 16 bit and hands them to
 * _sse_metrics_k7_n4() from the common SSE implementation.
 *
 * Input:
 * val   - Input values; val[0] through val[3] are read
 * out   - Forwarded unchanged (presumably the trellis outputs)
 * sums  - Forwarded unchanged (presumably the accumulated sums)
 * paths - Forwarded unchanged (presumably the path decisions)
 * norm  - Forwarded unchanged (presumably a normalization flag)
 */
__attribute__ ((visibility("hidden")))
void osmo_conv_sse_avx_metrics_k7_n4(const int8_t *val,
	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	int16_t widened[4];
	int i;

	for (i = 0; i < 4; i++)
		widened[i] = val[i];

	_sse_metrics_k7_n4(widened, out, sums, paths, norm);
}