/*! \file conv_acc_neon_impl.h
 * Accelerated Viterbi decoder implementation:
 * straight port of SSE to NEON based on Tom Tsou's work */
/*
 * (C) 2020 by sysmocom - s.f.m.c. GmbH
 * Author: Eric Wild
 *
 * All Rights Reserved
 *
 * SPDX-License-Identifier: GPL-2.0+
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */

/* Some distributions (notably Alpine Linux) for some strange reason
 * don't have this #define */
#ifndef __always_inline
#define __always_inline inline __attribute__((always_inline))
#endif

#define NEON_BUTTERFLY(M0,M1,M2,M3,M4) \
{ \
	M3 = vqaddq_s16(M0, M2); \
	M4 = vqsubq_s16(M1, M2); \
	M0 = vqsubq_s16(M0, M2); \
	M1 = vqaddq_s16(M1, M2); \
	M2 = vmaxq_s16(M3, M4); \
	M3 = vreinterpretq_s16_u16(vcgtq_s16(M3, M4)); \
	M4 = vmaxq_s16(M0, M1); \
	M1 = vreinterpretq_s16_u16(vcgtq_s16(M0, M1)); \
}
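
/*
 * Illustrative only, not used by the decoder: a rough scalar sketch of what
 * a single lane of NEON_BUTTERFLY computes.  It assumes larger metrics are
 * better (selection is done with vmaxq_s16 above) and replaces the saturating
 * vqaddq_s16/vqsubq_s16 with plain arithmetic; the helper name is made up.
 */
static inline void neon_butterfly_scalar_sketch(int16_t pm0, int16_t pm1, int16_t bm,
						int16_t *new_pm0, int16_t *new_pm1,
						int16_t *dec0, int16_t *dec1)
{
	int16_t a = pm0 + bm;	/* candidates for one output state of the butterfly */
	int16_t b = pm1 - bm;
	int16_t c = pm0 - bm;	/* candidates for the other output state */
	int16_t d = pm1 + bm;

	*new_pm0 = a > b ? a : b;	/* surviving metric (M2 after the macro) */
	*dec0 = a > b ? -1 : 0;		/* vcgtq_s16 mask (M3): all ones or zero */
	*new_pm1 = c > d ? c : d;	/* surviving metric (M4 after the macro) */
	*dec1 = c > d ? -1 : 0;		/* mask from M0 > M1 (M1 after the macro) */
}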

#define NEON_DEINTERLEAVE_K5(M0,M1,M2,M3) \
{ \
	int16x8x2_t tmp; \
	tmp = vuzpq_s16(M0, M1); \
	M2 = tmp.val[0]; \
	M3 = tmp.val[1]; \
}

#define NEON_DEINTERLEAVE_K7(M0,M1,M2,M3,M4,M5,M6,M7,M8,M9,M10,M11,M12,M13,M14,M15) \
{ \
	int16x8x2_t tmp; \
	tmp = vuzpq_s16(M0, M1); \
	M8 = tmp.val[0]; M9 = tmp.val[1]; \
	tmp = vuzpq_s16(M2, M3); \
	M10 = tmp.val[0]; M11 = tmp.val[1]; \
	tmp = vuzpq_s16(M4, M5); \
	M12 = tmp.val[0]; M13 = tmp.val[1]; \
	tmp = vuzpq_s16(M6, M7); \
	M14 = tmp.val[0]; M15 = tmp.val[1]; \
}
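
/*
 * Note: vuzpq_s16(a, b) de-interleaves two registers into their even-indexed
 * and odd-indexed elements, conceptually
 *   val[0] = {a0,a2,a4,a6,b0,b2,b4,b6}, val[1] = {a1,a3,a5,a7,b1,b3,b5,b7},
 * which is how the two macros above split the interleaved accumulated path
 * metrics into even- and odd-state registers ahead of the butterflies.
 */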

#define NEON_BRANCH_METRIC_N2(M0,M1,M2,M3,M4,M6,M7) \
{ \
	M0 = vmulq_s16(M4, M0); \
	M1 = vmulq_s16(M4, M1); \
	M2 = vmulq_s16(M4, M2); \
	M3 = vmulq_s16(M4, M3); \
	M6 = vcombine_s16(vpadd_s16(vget_low_s16(M0), vget_high_s16(M0)), vpadd_s16(vget_low_s16(M1), vget_high_s16(M1))); \
	M7 = vcombine_s16(vpadd_s16(vget_low_s16(M2), vget_high_s16(M2)), vpadd_s16(vget_low_s16(M3), vget_high_s16(M3))); \
}

#define NEON_BRANCH_METRIC_N4(M0,M1,M2,M3,M4,M5) \
{ \
	M0 = vmulq_s16(M4, M0); \
	M1 = vmulq_s16(M4, M1); \
	M2 = vmulq_s16(M4, M2); \
	M3 = vmulq_s16(M4, M3); \
	int16x4_t t1 = vpadd_s16(vpadd_s16(vget_low_s16(M0), vget_high_s16(M0)), vpadd_s16(vget_low_s16(M1), vget_high_s16(M1))); \
	int16x4_t t2 = vpadd_s16(vpadd_s16(vget_low_s16(M2), vget_high_s16(M2)), vpadd_s16(vget_low_s16(M3), vget_high_s16(M3))); \
	M5 = vcombine_s16(t1, t2); \
}
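
/*
 * Illustrative only, not used by the decoder: both branch-metric macros above
 * compute, for each candidate branch, the correlation (dot product) of the
 * received soft symbols in M4 with that branch's expected output symbols,
 * packing the results into M6/M7 (N2) or M5 (N4).  A rough scalar sketch for
 * one branch of a rate 1/n code; the helper name is made up and overflow
 * handling is ignored.
 */
static inline int16_t neon_branch_metric_scalar_sketch(const int16_t *soft_in,
							const int16_t *expected, int n)
{
	int16_t sum = 0;
	int i;

	for (i = 0; i < n; i++)
		sum += soft_in[i] * expected[i];

	return sum;
}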

#define NEON_NORMALIZE_K5(M0,M1,M2,M3) \
{ \
	M2 = vminq_s16(M0, M1); \
	int16x4_t t = vpmin_s16(vget_low_s16(M2), vget_high_s16(M2)); \
	t = vpmin_s16(t, t); \
	t = vpmin_s16(t, t); \
	M2 = vdupq_lane_s16(t, 0); \
	M0 = vqsubq_s16(M0, M2); \
	M1 = vqsubq_s16(M1, M2); \
}

#define NEON_NORMALIZE_K7(M0,M1,M2,M3,M4,M5,M6,M7,M8,M9,M10,M11) \
{ \
	M8 = vminq_s16(M0, M1); \
	M9 = vminq_s16(M2, M3); \
	M10 = vminq_s16(M4, M5); \
	M11 = vminq_s16(M6, M7); \
	M8 = vminq_s16(M8, M9); \
	M10 = vminq_s16(M10, M11); \
	M8 = vminq_s16(M8, M10); \
	int16x4_t t = vpmin_s16(vget_low_s16(M8), vget_high_s16(M8)); \
	t = vpmin_s16(t, t); \
	t = vpmin_s16(t, t); \
	M8 = vdupq_lane_s16(t, 0); \
	M0 = vqsubq_s16(M0, M8); \
	M1 = vqsubq_s16(M1, M8); \
	M2 = vqsubq_s16(M2, M8); \
	M3 = vqsubq_s16(M3, M8); \
	M4 = vqsubq_s16(M4, M8); \
	M5 = vqsubq_s16(M5, M8); \
	M6 = vqsubq_s16(M6, M8); \
	M7 = vqsubq_s16(M7, M8); \
}
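
/*
 * Illustrative only, not used by the decoder: the two NORMALIZE macros above
 * compute the smallest accumulated path metric (chained vminq_s16/vpmin_s16
 * reductions) and subtract it from every metric, which keeps the running sums
 * from drifting towards the int16_t limits.  Rough scalar equivalent with a
 * made-up name; the saturating vqsubq_s16 is replaced by plain subtraction.
 */
static inline void neon_normalize_scalar_sketch(int16_t *sums, int num_states)
{
	int16_t min = sums[0];
	int i;

	for (i = 1; i < num_states; i++)
		if (sums[i] < min)
			min = sums[i];

	for (i = 0; i < num_states; i++)
		sums[i] -= min;
}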

__always_inline void _neon_metrics_k5_n2(const int16_t *val, const int16_t *outa, int16_t *sumsa, int16_t *paths,
					 int norm)
{
	int16_t *__restrict out = __builtin_assume_aligned(outa, 8);
	int16_t *__restrict sums = __builtin_assume_aligned(sumsa, 8);
	int16x8_t m0, m1, m2, m3, m4, m5, m6;
	int16x4_t input;

	/* (BMU) Load the 16-bit input symbols and duplicate them across the register */
	input = vld1_s16(val);
	m2 = vcombine_s16(input, input);

	/* (BMU) Load and compute branch metrics */
	m0 = vld1q_s16(&out[0]);
	m1 = vld1q_s16(&out[8]);

	m0 = vmulq_s16(m2, m0);
	m1 = vmulq_s16(m2, m1);
	m2 = vcombine_s16(vpadd_s16(vget_low_s16(m0), vget_high_s16(m0)),
			  vpadd_s16(vget_low_s16(m1), vget_high_s16(m1)));

	/* (PMU) Load accumulated path metrics */
	m0 = vld1q_s16(&sums[0]);
	m1 = vld1q_s16(&sums[8]);

	NEON_DEINTERLEAVE_K5(m0, m1, m3, m4)

	/* (PMU) Butterflies: 0-7 */
	NEON_BUTTERFLY(m3, m4, m2, m5, m6)

	if (norm)
		NEON_NORMALIZE_K5(m2, m6, m0, m1)

	vst1q_s16(&sums[0], m2);
	vst1q_s16(&sums[8], m6);
	vst1q_s16(&paths[0], m5);
	vst1q_s16(&paths[8], m4);
}

__always_inline void _neon_metrics_k5_n4(const int16_t *val, const int16_t *outa, int16_t *sumsa, int16_t *paths,
					 int norm)
{
	int16_t *__restrict out = __builtin_assume_aligned(outa, 8);
	int16_t *__restrict sums = __builtin_assume_aligned(sumsa, 8);
	int16x8_t m0, m1, m2, m3, m4, m5, m6;
	int16x4_t input;

	/* (BMU) Load the 16-bit input symbols and duplicate them across the register */
	input = vld1_s16(val);
	m4 = vcombine_s16(input, input);

	/* (BMU) Load and compute branch metrics */
	m0 = vld1q_s16(&out[0]);
	m1 = vld1q_s16(&out[8]);
	m2 = vld1q_s16(&out[16]);
	m3 = vld1q_s16(&out[24]);

	NEON_BRANCH_METRIC_N4(m0, m1, m2, m3, m4, m2)

	/* (PMU) Load accumulated path metrics */
	m0 = vld1q_s16(&sums[0]);
	m1 = vld1q_s16(&sums[8]);

	NEON_DEINTERLEAVE_K5(m0, m1, m3, m4)

	/* (PMU) Butterflies: 0-7 */
	NEON_BUTTERFLY(m3, m4, m2, m5, m6)

	if (norm)
		NEON_NORMALIZE_K5(m2, m6, m0, m1)

	vst1q_s16(&sums[0], m2);
	vst1q_s16(&sums[8], m6);
	vst1q_s16(&paths[0], m5);
	vst1q_s16(&paths[8], m4);
}
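
/*
 * Per-call data footprint of the two K=5 routines above, as derived from
 * their loads and stores: 4 soft input values from val, 16 expected-output
 * values from out (32 for the N4 variant), 16 accumulated path metrics in
 * sums (updated in place) and 16 path decisions written to paths.  The exact
 * layout of val and out is defined by the including decoder code.
 */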

__always_inline static void _neon_metrics_k7_n2(const int16_t *val, const int16_t *outa, int16_t *sumsa, int16_t *paths,
						int norm)
{
	int16_t *__restrict out = __builtin_assume_aligned(outa, 8);
	int16_t *__restrict sums = __builtin_assume_aligned(sumsa, 8);
	int16x8_t m0, m1, m2, m3, m4, m5, m6, m7;
	int16x8_t m8, m9, m10, m11, m12, m13, m14, m15;
	int16x4_t input;

	/* (PMU) Load accumulated path metrics */
	m0 = vld1q_s16(&sums[0]);
	m1 = vld1q_s16(&sums[8]);
	m2 = vld1q_s16(&sums[16]);
	m3 = vld1q_s16(&sums[24]);
	m4 = vld1q_s16(&sums[32]);
	m5 = vld1q_s16(&sums[40]);
	m6 = vld1q_s16(&sums[48]);
	m7 = vld1q_s16(&sums[56]);

	/* (PMU) Deinterleave into even and odd packed registers */
	NEON_DEINTERLEAVE_K7(m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15)

	/* (BMU) Load the 16-bit input symbols and duplicate them across the register */
	input = vld1_s16(val);
	m7 = vcombine_s16(input, input);

	/* (BMU) Load and compute branch metrics */
	m0 = vld1q_s16(&out[0]);
	m1 = vld1q_s16(&out[8]);
	m2 = vld1q_s16(&out[16]);
	m3 = vld1q_s16(&out[24]);

	NEON_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m4, m5)

	m0 = vld1q_s16(&out[32]);
	m1 = vld1q_s16(&out[40]);
	m2 = vld1q_s16(&out[48]);
	m3 = vld1q_s16(&out[56]);

	NEON_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m6, m7)

	/* (PMU) Butterflies: 0-15 */
	NEON_BUTTERFLY(m8, m9, m4, m0, m1)
	NEON_BUTTERFLY(m10, m11, m5, m2, m3)

	vst1q_s16(&paths[0], m0);
	vst1q_s16(&paths[8], m2);
	vst1q_s16(&paths[32], m9);
	vst1q_s16(&paths[40], m11);

	/* (PMU) Butterflies: 16-31 */
	NEON_BUTTERFLY(m12, m13, m6, m0, m2)
	NEON_BUTTERFLY(m14, m15, m7, m9, m11)

	vst1q_s16(&paths[16], m0);
	vst1q_s16(&paths[24], m9);
	vst1q_s16(&paths[48], m13);
	vst1q_s16(&paths[56], m15);

	if (norm)
		NEON_NORMALIZE_K7(m4, m1, m5, m3, m6, m2, m7, m11, m0, m8, m9, m10)

	vst1q_s16(&sums[0], m4);
	vst1q_s16(&sums[8], m5);
	vst1q_s16(&sums[16], m6);
	vst1q_s16(&sums[24], m7);
	vst1q_s16(&sums[32], m1);
	vst1q_s16(&sums[40], m3);
	vst1q_s16(&sums[48], m2);
	vst1q_s16(&sums[56], m11);
}

__always_inline static void _neon_metrics_k7_n4(const int16_t *val, const int16_t *outa, int16_t *sumsa, int16_t *paths,
						int norm)
{
	int16_t *__restrict out = __builtin_assume_aligned(outa, 8);
	int16_t *__restrict sums = __builtin_assume_aligned(sumsa, 8);
	int16x8_t m0, m1, m2, m3, m4, m5, m6, m7;
	int16x8_t m8, m9, m10, m11, m12, m13, m14, m15;
	int16x4_t input;

	/* (PMU) Load accumulated path metrics */
	m0 = vld1q_s16(&sums[0]);
	m1 = vld1q_s16(&sums[8]);
	m2 = vld1q_s16(&sums[16]);
	m3 = vld1q_s16(&sums[24]);
	m4 = vld1q_s16(&sums[32]);
	m5 = vld1q_s16(&sums[40]);
	m6 = vld1q_s16(&sums[48]);
	m7 = vld1q_s16(&sums[56]);

	/* (PMU) Deinterleave into even and odd packed registers */
	NEON_DEINTERLEAVE_K7(m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15)

	/* (BMU) Load the 16-bit input symbols and duplicate them across the register */
	input = vld1_s16(val);
	m7 = vcombine_s16(input, input);

	/* (BMU) Load and compute branch metrics */
	m0 = vld1q_s16(&out[0]);
	m1 = vld1q_s16(&out[8]);
	m2 = vld1q_s16(&out[16]);
	m3 = vld1q_s16(&out[24]);

	NEON_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m4)

	m0 = vld1q_s16(&out[32]);
	m1 = vld1q_s16(&out[40]);
	m2 = vld1q_s16(&out[48]);
	m3 = vld1q_s16(&out[56]);

	NEON_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m5)

	m0 = vld1q_s16(&out[64]);
	m1 = vld1q_s16(&out[72]);
	m2 = vld1q_s16(&out[80]);
	m3 = vld1q_s16(&out[88]);

	NEON_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m6)

	m0 = vld1q_s16(&out[96]);
	m1 = vld1q_s16(&out[104]);
	m2 = vld1q_s16(&out[112]);
	m3 = vld1q_s16(&out[120]);

	NEON_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m7)

	/* (PMU) Butterflies: 0-15 */
	NEON_BUTTERFLY(m8, m9, m4, m0, m1)
	NEON_BUTTERFLY(m10, m11, m5, m2, m3)

	vst1q_s16(&paths[0], m0);
	vst1q_s16(&paths[8], m2);
	vst1q_s16(&paths[32], m9);
	vst1q_s16(&paths[40], m11);

	/* (PMU) Butterflies: 16-31 */
	NEON_BUTTERFLY(m12, m13, m6, m0, m2)
	NEON_BUTTERFLY(m14, m15, m7, m9, m11)

	vst1q_s16(&paths[16], m0);
	vst1q_s16(&paths[24], m9);
	vst1q_s16(&paths[48], m13);
	vst1q_s16(&paths[56], m15);

	if (norm)
		NEON_NORMALIZE_K7(m4, m1, m5, m3, m6, m2, m7, m11, m0, m8, m9, m10)

	vst1q_s16(&sums[0], m4);
	vst1q_s16(&sums[8], m5);
	vst1q_s16(&sums[16], m6);
	vst1q_s16(&sums[24], m7);
	vst1q_s16(&sums[32], m1);
	vst1q_s16(&sums[40], m3);
	vst1q_s16(&sums[48], m2);
	vst1q_s16(&sums[56], m11);
}
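
/*
 * Per-call data footprint of the two K=7 routines above, as derived from
 * their loads and stores: 4 soft input values from val, 64 expected-output
 * values from out (128 for the N4 variant), 64 accumulated path metrics in
 * sums (updated in place) and 64 path decisions written to paths.
 */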