libosmocoding: NEON Viterbi acceleration

configure flag required to enable this: --enable-neon

Autodetection via __ARM_NEON would work, because that macro is only
defined if the FPU is one of neon, neon-fp16, neon-vfpv3, neon-vfpv4,
neon-fp-armv8 or crypto-neon-fp-armv8. However, doing that would lead
to an unknown performance impact, so it needs to be enabled manually.

Speedup is a factor of about 1.3-1.5 on an unspecified single-core
Cortex-A9. This requires handling a special case for RACH with len 14,
which is far too short for NEON and would actually incur a performance
penalty of 25%.

Related: OS#4585
Change-Id: I58ff2cb4ce3514f43390ff0a2121f81e6a4983b5
diff --git a/src/conv_acc.c b/src/conv_acc.c
index c16e436..0f6f7ca 100644
--- a/src/conv_acc.c
+++ b/src/conv_acc.c
@@ -85,6 +85,11 @@
 void osmo_conv_sse_avx_vdec_free(int16_t *ptr);
 #endif
 
+#ifdef HAVE_NEON
+int16_t *osmo_conv_neon_vdec_malloc(size_t n);
+void osmo_conv_neon_vdec_free(int16_t *ptr);
+#endif
+
 /* Forward Metric Units */
 void osmo_conv_gen_metrics_k5_n2(const int8_t *seq, const int16_t *out,
 	int16_t *sums, int16_t *paths, int norm);
@@ -129,6 +134,21 @@
 	int16_t *sums, int16_t *paths, int norm);
 #endif
 
+#if defined(HAVE_NEON)
+void osmo_conv_neon_metrics_k5_n2(const int8_t *seq, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_neon_metrics_k5_n3(const int8_t *seq, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_neon_metrics_k5_n4(const int8_t *seq, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_neon_metrics_k7_n2(const int8_t *seq, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_neon_metrics_k7_n3(const int8_t *seq, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_neon_metrics_k7_n4(const int8_t *seq, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm);
+#endif
+
 /* Trellis State
  * state - Internal lshift register value
  * prev  - Register values of previous 0 and 1 states
@@ -528,6 +548,12 @@
 	if (dec->k == 5) {
 		switch (dec->n) {
 		case 2:
+/* codes with len < 100 (e.g. RACH, len 14) are too short for NEON: use the generic version */
+#ifdef HAVE_NEON
+			if (code->len < 100)
+				dec->metric_func = osmo_conv_gen_metrics_k5_n2;
+			else
+#endif
 			dec->metric_func = osmo_conv_metrics_k5_n2;
 			break;
 		case 3:
@@ -681,6 +707,8 @@
 	} else {
 		INIT_POINTERS(gen);
 	}
+#elif defined(HAVE_NEON)
+	INIT_POINTERS(neon);
 #else
 	INIT_POINTERS(gen);
 #endif