libomsocoding: NEON viterbi acceleration
configure flag required to enable this: --enable-neon
Although autodetection according to __ARM_NEON would work because this
is only defined if the fpu is neon neon-fp16 neon-vfpv3 neon-vfpv4
neon-fp-armv8 crypto-neon-fp-armv8 doing that would lead to a unknown
performance impact, so it needs to be enabled manually.
Speedup is about ~1.3-1.5 on a unspecified single core Cortex A9. This
requires handling a special case for RACH with len 14 which is far too
short for neon and would actually incur a performance penalty of 25%.
Related: OS#4585
Change-Id: I58ff2cb4ce3514f43390ff0a2121f81e6a4983b5
diff --git a/src/conv_acc.c b/src/conv_acc.c
index c16e436..0f6f7ca 100644
--- a/src/conv_acc.c
+++ b/src/conv_acc.c
@@ -85,6 +85,11 @@
void osmo_conv_sse_avx_vdec_free(int16_t *ptr);
#endif
+#ifdef HAVE_NEON
+int16_t *osmo_conv_neon_vdec_malloc(size_t n);
+void osmo_conv_neon_vdec_free(int16_t *ptr);
+#endif
+
/* Forward Metric Units */
void osmo_conv_gen_metrics_k5_n2(const int8_t *seq, const int16_t *out,
int16_t *sums, int16_t *paths, int norm);
@@ -129,6 +134,21 @@
int16_t *sums, int16_t *paths, int norm);
#endif
+#if defined(HAVE_NEON)
+void osmo_conv_neon_metrics_k5_n2(const int8_t *seq, const int16_t *out,
+ int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_neon_metrics_k5_n3(const int8_t *seq, const int16_t *out,
+ int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_neon_metrics_k5_n4(const int8_t *seq, const int16_t *out,
+ int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_neon_metrics_k7_n2(const int8_t *seq, const int16_t *out,
+ int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_neon_metrics_k7_n3(const int8_t *seq, const int16_t *out,
+ int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_neon_metrics_k7_n4(const int8_t *seq, const int16_t *out,
+ int16_t *sums, int16_t *paths, int norm);
+#endif
+
/* Trellis State
* state - Internal lshift register value
* prev - Register values of previous 0 and 1 states
@@ -528,6 +548,12 @@
if (dec->k == 5) {
switch (dec->n) {
case 2:
+/* rach len 14 is too short for neon */
+#ifdef HAVE_NEON
+ if (code->len < 100)
+ dec->metric_func = osmo_conv_gen_metrics_k5_n2;
+ else
+#endif
dec->metric_func = osmo_conv_metrics_k5_n2;
break;
case 3:
@@ -681,6 +707,8 @@
} else {
INIT_POINTERS(gen);
}
+#elif defined(HAVE_NEON)
+ INIT_POINTERS(neon);
#else
INIT_POINTERS(gen);
#endif