core/conv: add x86 SSE support for Viterbi decoder Fast convolutional decoding is provided through x86 intrinsic-based SSE operations. SSE3, found on virtually all modern x86 processors, is the minimal requirement. SSE4.1 and AVX2 are used if available. Also, the original code was extended with runtime SIMD detection, so only extensions supported by the target CPU will be used. This makes the library more portable, which is very important for binary package distribution. Runtime SIMD detection is currently implemented through the __builtin_cpu_supports call. Change-Id: I1da6d71ed0564f1d684f3a836e998d09de5f0351

commit: 34e228a9bcf3ac37287bb5e684ace46818740f3b [log] [tgz]
author: Tom Tsou <tom.tsou@ettus.com> Sat Apr 29 00:16:43 2017 +0700
committer: Harald Welte <laforge@gnumonks.org> Wed May 24 22:04:53 2017 +0000
tree: ebb7bcd4dd9c494106096384327db0122a4fde01
parent: b6c8dda5e34df6b74183ad24cf66c98601065e56 [diff] [blame]
diff --git a/src/viterbi.c b/src/viterbi.c
index 21c6a57..2097a02 100644
--- a/src/viterbi.c
+++ b/src/viterbi.c

@@ -24,12 +24,35 @@
 #include <string.h>
 #include <errno.h>
 
-#include <osmocom/core/conv.h>
 #include "config.h"
 
+#include <osmocom/core/conv.h>
+
 #define BIT2NRZ(REG,N)	(((REG >> N) & 0x01) * 2 - 1) * -1
 #define NUM_STATES(K)	(K == 7 ? 64 : 16)
 
+static int init_complete = 0;
+
+__attribute__ ((visibility("hidden"))) int avx2_supported = 0;
+__attribute__ ((visibility("hidden"))) int sse3_supported = 0;
+__attribute__ ((visibility("hidden"))) int sse41_supported = 0;
+
+/**
+ * These pointers will be initialized by osmo_conv_init()
+ * depending on the supported SIMD extensions.
+ */
+static int16_t *(*vdec_malloc)(size_t n);
+static void (*vdec_free)(int16_t *ptr);
+
+/* Forward malloc wrappers */
+int16_t *osmo_conv_vdec_malloc(size_t n);
+void osmo_conv_vdec_free(int16_t *ptr);
+
+#ifdef HAVE_SSE3
+int16_t *osmo_conv_vdec_malloc_sse3(size_t n);
+void osmo_conv_vdec_free_sse3(int16_t *ptr);
+#endif
+
 /* Forward Metric Units */
 void osmo_conv_gen_metrics_k5_n2(const int8_t *seq, const int16_t *out,
 	int16_t *sums, int16_t *paths, int norm);
@@ -44,6 +67,21 @@
 void osmo_conv_gen_metrics_k7_n4(const int8_t *seq, const int16_t *out,
 	int16_t *sums, int16_t *paths, int norm);
 
+#ifdef HAVE_SSE3
+void osmo_conv_gen_metrics_k5_n2_sse(const int8_t *seq, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k5_n3_sse(const int8_t *seq, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k5_n4_sse(const int8_t *seq, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k7_n2_sse(const int8_t *seq, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k7_n3_sse(const int8_t *seq, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm);
+void osmo_conv_gen_metrics_k7_n4_sse(const int8_t *seq, const int16_t *out,
+	int16_t *sums, int16_t *paths, int norm);
+#endif
+
 /* Trellis State
  * state - Internal lshift register value
  * prev  - Register values of previous 0 and 1 states
@@ -90,12 +128,6 @@
 		int16_t *, int16_t *, int);
 };
 
-/* Non-aligned Memory Allocator */
-static int16_t *vdec_malloc(size_t n)
-{
-	return (int16_t *) malloc(sizeof(int16_t) * n);
-}
-
 /* Accessor calls */
 static inline int conv_code_recursive(const struct osmo_conv_code *code)
 {
@@ -294,9 +326,9 @@
 	if (!trellis)
 		return;
 
+	vdec_free(trellis->outputs);
+	vdec_free(trellis->sums);
 	free(trellis->vals);
-	free(trellis->outputs);
-	free(trellis->sums);
 	free(trellis);
 }
 
@@ -430,7 +462,7 @@
 	if (!dec)
 		return;
 
-	free(dec->paths[0]);
+	vdec_free(dec->paths[0]);
 	free(dec->paths);
 	free_trellis(dec->trellis);
 	free(dec);
@@ -456,13 +488,31 @@
 	if (dec->k == 5) {
 		switch (dec->n) {
 		case 2:
+		#ifdef HAVE_SSE3
+			dec->metric_func = !sse3_supported ?
+				osmo_conv_gen_metrics_k5_n2 :
+				osmo_conv_gen_metrics_k5_n2_sse;
+		#else
 			dec->metric_func = osmo_conv_gen_metrics_k5_n2;
+		#endif
 			break;
 		case 3:
+		#ifdef HAVE_SSE3
+			dec->metric_func = !sse3_supported ?
+				osmo_conv_gen_metrics_k5_n3 :
+				osmo_conv_gen_metrics_k5_n3_sse;
+		#else
 			dec->metric_func = osmo_conv_gen_metrics_k5_n3;
+		#endif
 			break;
 		case 4:
+		#ifdef HAVE_SSE3
+			dec->metric_func = !sse3_supported ?
+				osmo_conv_gen_metrics_k5_n4 :
+				osmo_conv_gen_metrics_k5_n4_sse;
+		#else
 			dec->metric_func = osmo_conv_gen_metrics_k5_n4;
+		#endif
 			break;
 		default:
 			goto fail;
@@ -470,13 +520,31 @@
 	} else if (dec->k == 7) {
 		switch (dec->n) {
 		case 2:
+		#ifdef HAVE_SSE3
+			dec->metric_func = !sse3_supported ?
+				osmo_conv_gen_metrics_k7_n2 :
+				osmo_conv_gen_metrics_k7_n2_sse;
+		#else
 			dec->metric_func = osmo_conv_gen_metrics_k7_n2;
+		#endif
 			break;
 		case 3:
+		#ifdef HAVE_SSE3
+			dec->metric_func = !sse3_supported ?
+				osmo_conv_gen_metrics_k7_n3 :
+				osmo_conv_gen_metrics_k7_n3_sse;
+		#else
 			dec->metric_func = osmo_conv_gen_metrics_k7_n3;
+		#endif
 			break;
 		case 4:
+		#ifdef HAVE_SSE3
+			dec->metric_func = !sse3_supported ?
+				osmo_conv_gen_metrics_k7_n4 :
+				osmo_conv_gen_metrics_k7_n4_sse;
+		#else
 			dec->metric_func = osmo_conv_gen_metrics_k7_n4;
+		#endif
 			break;
 		default:
 			goto fail;
@@ -569,6 +637,36 @@
 	return traceback(dec, out, term, len);
 }
 
+static void osmo_conv_init(void)
+{
+	init_complete = 1;
+
+#ifdef HAVE___BUILTIN_CPU_SUPPORTS
+	/* Detect CPU capabilities */
+	#ifdef HAVE_AVX2
+		avx2_supported = __builtin_cpu_supports("avx2");
+	#endif
+
+	#ifdef HAVE_SSE3
+		sse3_supported = __builtin_cpu_supports("sse3");
+	#endif
+
+	#ifdef HAVE_SSE4_1
+		sse41_supported = __builtin_cpu_supports("sse4.1");
+	#endif
+#endif
+
+#ifdef HAVE_SSE3
+	vdec_malloc = !sse3_supported ?
+		&osmo_conv_vdec_malloc : &osmo_conv_vdec_malloc_sse3;
+	vdec_free = !sse3_supported ?
+		&osmo_conv_vdec_free : &osmo_conv_vdec_free_sse3;
+#else
+	vdec_malloc = &osmo_conv_vdec_malloc;
+	vdec_free = &osmo_conv_vdec_free;
+#endif
+}
+
 /* All-in-one Viterbi decoding  */
 int osmo_conv_decode_acc(const struct osmo_conv_code *code,
 	const sbit_t *input, ubit_t *output)
@@ -576,6 +674,9 @@
 	int rc;
 	struct vdecoder *vdec;
 
+	if (!init_complete)
+		osmo_conv_init();
+
 	if ((code->N < 2) || (code->N > 4) || (code->len < 1) ||
 		((code->K != 5) && (code->K != 7)))
 		return -EINVAL;
commit	34e228a9bcf3ac37287bb5e684ace46818740f3b	[log] [tgz]
author	Tom Tsou <tom.tsou@ettus.com>	Sat Apr 29 00:16:43 2017 +0700
committer	Harald Welte <laforge@gnumonks.org>	Wed May 24 22:04:53 2017 +0000
tree	ebb7bcd4dd9c494106096384327db0122a4fde01
parent	b6c8dda5e34df6b74183ad24cf66c98601065e56 [diff] [blame]