Neels Hofmeyr | 17518fe | 2017-06-20 04:35:06 +0200 | [diff] [blame] | 1 | /*! \file conv_acc_sse.c |
Vadim Yanitskiy | 46e533c | 2017-06-19 18:21:02 +0700 | [diff] [blame] | 2 | * Accelerated Viterbi decoder implementation |
Neels Hofmeyr | 17518fe | 2017-06-20 04:35:06 +0200 | [diff] [blame] | 3 | * for architectures with only SSE3 available. */ |
| 4 | /* |
Tom Tsou | 34e228a | 2017-04-29 00:16:43 +0700 | [diff] [blame] | 5 | * Copyright (C) 2013, 2014 Thomas Tsou <tom@tsou.cc> |
| 6 | * |
| 7 | * All Rights Reserved |
| 8 | * |
| 9 | * This program is free software; you can redistribute it and/or modify |
| 10 | * it under the terms of the GNU General Public License as published by |
| 11 | * the Free Software Foundation; either version 2 of the License, or |
| 12 | * (at your option) any later version. |
| 13 | * |
| 14 | * This program is distributed in the hope that it will be useful, |
| 15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 17 | * GNU General Public License for more details. |
| 18 | * |
| 19 | * You should have received a copy of the GNU General Public License along |
| 20 | * with this program; if not, write to the Free Software Foundation, Inc., |
| 21 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| 22 | */ |
| 23 | |
| 24 | #include <stdint.h> |
Vadim Yanitskiy | 0d49f47 | 2017-05-28 18:20:02 +0700 | [diff] [blame] | 25 | #include "config.h" |
| 26 | |
Tom Tsou | 34e228a | 2017-04-29 00:16:43 +0700 | [diff] [blame] | 27 | #include <emmintrin.h> |
| 28 | #include <tmmintrin.h> |
| 29 | #include <xmmintrin.h> |
| 30 | |
Vadim Yanitskiy | 0d49f47 | 2017-05-28 18:20:02 +0700 | [diff] [blame] | 31 | #if defined(HAVE_SSE4_1) |
| 32 | #include <smmintrin.h> |
Tom Tsou | 34e228a | 2017-04-29 00:16:43 +0700 | [diff] [blame] | 33 | #endif |
| 34 | |
| 35 | #define SSE_ALIGN 16 |
| 36 | |
/* Broadcast 16-bit integer
 * Replicate the lowest 16-bit element of a 128-bit SSE register into all
 * eight 16-bit elements. A dedicated broadcast instruction only exists
 * starting with AVX2, so on SSE-only targets the value is spread by three
 * successive "unpack low" steps, each doubling the replicated width
 * (16 -> 32 -> 64 -> 128 bit).
 *
 * The operation is destructive: the argument must be a modifiable __m128i
 * lvalue and is overwritten with the result.
 *
 * The expansion is a bare brace block, so it is usable as a statement
 * with or without a trailing semicolon.
 *
 * Input:
 * M0 - Low 16-bit element is read
 *
 * Output:
 * M0 - Contains broadcasted values
 */
#define SSE_BROADCAST(M0) \
{ \
	const __m128i sse_bcast_w32_ = _mm_unpacklo_epi16((M0), (M0)); \
	const __m128i sse_bcast_w64_ = \
		_mm_unpacklo_epi32(sse_bcast_w32_, sse_bcast_w32_); \
	(M0) = _mm_unpacklo_epi64(sse_bcast_w64_, sse_bcast_w64_); \
}
Tom Tsou | 34e228a | 2017-04-29 00:16:43 +0700 | [diff] [blame] | 55 | |
Vadim Yanitskiy | 0d49f47 | 2017-05-28 18:20:02 +0700 | [diff] [blame] | 56 | /** |
| 57 | * Include common SSE implementation |
Tom Tsou | 34e228a | 2017-04-29 00:16:43 +0700 | [diff] [blame] | 58 | */ |
Vadim Yanitskiy | e4fe71c | 2017-06-19 17:59:48 +0700 | [diff] [blame] | 59 | #include <conv_acc_sse_impl.h> |
Tom Tsou | 34e228a | 2017-04-29 00:16:43 +0700 | [diff] [blame] | 60 | |
| 61 | /* Aligned Memory Allocator |
| 62 | * SSE requires 16-byte memory alignment. We store relevant trellis values |
| 63 | * (accumulated sums, outputs, and path decisions) as 16 bit signed integers |
| 64 | * so the allocated memory is casted as such. |
| 65 | */ |
| 66 | __attribute__ ((visibility("hidden"))) |
Vadim Yanitskiy | 0d49f47 | 2017-05-28 18:20:02 +0700 | [diff] [blame] | 67 | int16_t *osmo_conv_sse_vdec_malloc(size_t n) |
Tom Tsou | 34e228a | 2017-04-29 00:16:43 +0700 | [diff] [blame] | 68 | { |
| 69 | return (int16_t *) _mm_malloc(sizeof(int16_t) * n, SSE_ALIGN); |
| 70 | } |
| 71 | |
/* Aligned Memory Deallocator
 * Hands the given buffer to _mm_free(). This is the counterpart of
 * osmo_conv_sse_vdec_malloc(), which obtains its memory from _mm_malloc().
 *
 * ptr - buffer to release
 */
__attribute__ ((visibility("hidden")))
void osmo_conv_sse_vdec_free(int16_t *ptr)
{
	void *block = ptr;

	_mm_free(block);
}
| 77 | |
/* Metrics wrapper (by its name: K=5, N=2 - TODO confirm against callers)
 * Widens the two 8-bit input values to 16 bit, repeats the pair so that
 * the resulting vector is { val[0], val[1], val[0], val[1] }, and hands
 * it to _sse_metrics_k5_n2() (presumably provided by conv_acc_sse_impl.h).
 *
 * val   - input values; elements 0 and 1 are read
 * out, sums, paths, norm - passed through unchanged
 */
__attribute__ ((visibility("hidden")))
void osmo_conv_sse_metrics_k5_n2(const int8_t *val, const int16_t *out,
	int16_t *sums, int16_t *paths, int norm)
{
	int16_t widened[4];
	int i;

	/* Even slots take val[0], odd slots take val[1] */
	for (i = 0; i < 4; i++)
		widened[i] = val[i & 1];

	_sse_metrics_k5_n2(widened, out, sums, paths, norm);
}
| 86 | |
/* Metrics wrapper (by its name: K=5, N=3 - TODO confirm against callers)
 * Widens the three 8-bit input values to 16 bit and pads the vector with
 * a zero in the fourth slot, i.e. { val[0], val[1], val[2], 0 }. The padded
 * vector is processed by the four-value routine _sse_metrics_k5_n4()
 * (presumably provided by conv_acc_sse_impl.h).
 *
 * val   - input values; elements 0..2 are read
 * out, sums, paths, norm - passed through unchanged
 */
__attribute__ ((visibility("hidden")))
void osmo_conv_sse_metrics_k5_n3(const int8_t *val, const int16_t *out,
	int16_t *sums, int16_t *paths, int norm)
{
	int16_t widened[4] = { 0 };
	int i;

	/* Only three values are supplied; the fourth slot stays zero */
	for (i = 0; i < 3; i++)
		widened[i] = val[i];

	_sse_metrics_k5_n4(widened, out, sums, paths, norm);
}
| 95 | |
/* Metrics wrapper (by its name: K=5, N=4 - TODO confirm against callers)
 * Widens the four 8-bit input values to 16 bit and hands the vector
 * { val[0], val[1], val[2], val[3] } to _sse_metrics_k5_n4()
 * (presumably provided by conv_acc_sse_impl.h).
 *
 * val   - input values; elements 0..3 are read
 * out, sums, paths, norm - passed through unchanged
 */
__attribute__ ((visibility("hidden")))
void osmo_conv_sse_metrics_k5_n4(const int8_t *val, const int16_t *out,
	int16_t *sums, int16_t *paths, int norm)
{
	int16_t widened[4];
	int i;

	for (i = 0; i < 4; i++)
		widened[i] = val[i];

	_sse_metrics_k5_n4(widened, out, sums, paths, norm);
}
| 104 | |
/* Metrics wrapper (by its name: K=7, N=2 - TODO confirm against callers)
 * Widens the two 8-bit input values to 16 bit, repeats the pair so that
 * the resulting vector is { val[0], val[1], val[0], val[1] }, and hands
 * it to _sse_metrics_k7_n2() (presumably provided by conv_acc_sse_impl.h).
 *
 * val   - input values; elements 0 and 1 are read
 * out, sums, paths, norm - passed through unchanged
 */
__attribute__ ((visibility("hidden")))
void osmo_conv_sse_metrics_k7_n2(const int8_t *val, const int16_t *out,
	int16_t *sums, int16_t *paths, int norm)
{
	int16_t widened[4];
	int i;

	/* Even slots take val[0], odd slots take val[1] */
	for (i = 0; i < 4; i++)
		widened[i] = val[i & 1];

	_sse_metrics_k7_n2(widened, out, sums, paths, norm);
}
| 113 | |
/* Metrics wrapper (by its name: K=7, N=3 - TODO confirm against callers)
 * Widens the three 8-bit input values to 16 bit and pads the vector with
 * a zero in the fourth slot, i.e. { val[0], val[1], val[2], 0 }. The padded
 * vector is processed by the four-value routine _sse_metrics_k7_n4()
 * (presumably provided by conv_acc_sse_impl.h).
 *
 * val   - input values; elements 0..2 are read
 * out, sums, paths, norm - passed through unchanged
 */
__attribute__ ((visibility("hidden")))
void osmo_conv_sse_metrics_k7_n3(const int8_t *val, const int16_t *out,
	int16_t *sums, int16_t *paths, int norm)
{
	int16_t widened[4] = { 0 };
	int i;

	/* Only three values are supplied; the fourth slot stays zero */
	for (i = 0; i < 3; i++)
		widened[i] = val[i];

	_sse_metrics_k7_n4(widened, out, sums, paths, norm);
}
| 122 | |
/* Metrics wrapper (by its name: K=7, N=4 - TODO confirm against callers)
 * Widens the four 8-bit input values to 16 bit and hands the vector
 * { val[0], val[1], val[2], val[3] } to _sse_metrics_k7_n4()
 * (presumably provided by conv_acc_sse_impl.h).
 *
 * val   - input values; elements 0..3 are read
 * out, sums, paths, norm - passed through unchanged
 */
__attribute__ ((visibility("hidden")))
void osmo_conv_sse_metrics_k7_n4(const int8_t *val, const int16_t *out,
	int16_t *sums, int16_t *paths, int norm)
{
	int16_t widened[4];
	int i;

	for (i = 0; i < 4; i++)
		widened[i] = val[i];

	_sse_metrics_k7_n4(widened, out, sums, paths, norm);
}