core/conv: do not mix up AVX and SSE code According to GCC's wiki: If you specify command-line switches such as -msse, the compiler could use the extended instruction sets even if the built-ins are not used explicitly in the program. For this reason, applications that perform run-time CPU detection must compile separate files for each supported architecture, using the appropriate flags. In particular, the file containing the CPU detection code should be compiled without these options. So, this change introduces a separate Viterbi implementation, which is almost the same as previous one, but is being compiled with -mavx2. This implementation will be only used by CPUs with both SSE and AVX support: SSE3 and AVX2: viterbi_sse_avx.c SSE3 only: viterbi_sse.c Generic: viterbi_generic.c Change-Id: I042cc76258df7e4c6c90a73af3d0a6e75999b2b0

commit: 0d49f47aeb5a38273c4a356dc4780e3819747d60 [log] [tgz]
author: Vadim Yanitskiy <axilirator@gmail.com> Sun May 28 18:20:02 2017 +0700
committer: Max <msuraev@sysmocom.de> Mon May 29 14:07:25 2017 +0000
tree: 70cb2852a4044ef0fcb799e1187411e225045102
parent: b51d51b29c647584d95cc62dbe88f869ba5bf5d5 [diff] [blame]
diff --git a/src/viterbi_sse_avx.c b/src/viterbi_sse_avx.c
new file mode 100644
index 0000000..b4c45a6
--- /dev/null
+++ b/src/viterbi_sse_avx.c

@@ -0,0 +1,129 @@
+/*
+ * Intel SSE + AVX Viterbi decoder
+ *
+ * Copyright (C) 2013, 2014 Thomas Tsou <tom@tsou.cc>
+ *
+ * All Rights Reserved
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <stdint.h>
+#include "config.h"
+
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#include <xmmintrin.h>
+#include <immintrin.h>
+
+#if defined(HAVE_SSE4_1)
+#include <smmintrin.h>
+#endif
+
+#define SSE_ALIGN 16
+
+
+/* Broadcast 16-bit integer
+ * Repeat the low 16-bit integer to all elements of the 128-bit SSE
+ * register. Only AVX2 has a dedicated broadcast instruction; use repeat
+ * unpacks for SSE only architectures. This is a destructive operation and
+ * the source register is overwritten.
+ *
+ * Input:
+ * M0 - Low 16-bit element is read
+ *
+ * Output:
+ * M0 - Contains broadcasted values
+ */
+#define SSE_BROADCAST(M0) \
+{ \
+	M0 = _mm_broadcastw_epi16(M0); \
+}
+
+/**
+ * Include common SSE implementation
+ */
+#include <viterbi_sse_common.h>
+
+/* Aligned Memory Allocator
+ * SSE requires 16-byte memory alignment. We store relevant trellis values
+ * (accumulated sums, outputs, and path decisions) as 16 bit signed integers
+ * so the allocated memory is casted as such.
+ */
+__attribute__ ((visibility("hidden")))
+int16_t *osmo_conv_sse_avx_vdec_malloc(size_t n)
+{
+	return (int16_t *) _mm_malloc(sizeof(int16_t) * n, SSE_ALIGN);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_sse_avx_vdec_free(int16_t *ptr)
+{
+	_mm_free(ptr);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_sse_avx_metrics_k5_n2(const int8_t *val,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
+{
+	const int16_t _val[4] = { val[0], val[1], val[0], val[1] };
+
+	_sse_metrics_k5_n2(_val, out, sums, paths, norm);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_sse_avx_metrics_k5_n3(const int8_t *val,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
+{
+	const int16_t _val[4] = { val[0], val[1], val[2], 0 };
+
+	_sse_metrics_k5_n4(_val, out, sums, paths, norm);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_sse_avx_metrics_k5_n4(const int8_t *val,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
+{
+	const int16_t _val[4] = { val[0], val[1], val[2], val[3] };
+
+	_sse_metrics_k5_n4(_val, out, sums, paths, norm);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_sse_avx_metrics_k7_n2(const int8_t *val,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
+{
+	const int16_t _val[4] = { val[0], val[1], val[0], val[1] };
+
+	_sse_metrics_k7_n2(_val, out, sums, paths, norm);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_sse_avx_metrics_k7_n3(const int8_t *val,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
+{
+	const int16_t _val[4] = { val[0], val[1], val[2], 0 };
+
+	_sse_metrics_k7_n4(_val, out, sums, paths, norm);
+}
+
+__attribute__ ((visibility("hidden")))
+void osmo_conv_sse_avx_metrics_k7_n4(const int8_t *val,
+	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
+{
+	const int16_t _val[4] = { val[0], val[1], val[2], val[3] };
+
+	_sse_metrics_k7_n4(_val, out, sums, paths, norm);
+}
commit	0d49f47aeb5a38273c4a356dc4780e3819747d60	[log] [tgz]
author	Vadim Yanitskiy <axilirator@gmail.com>	Sun May 28 18:20:02 2017 +0700
committer	Max <msuraev@sysmocom.de>	Mon May 29 14:07:25 2017 +0000
tree	70cb2852a4044ef0fcb799e1187411e225045102
parent	b51d51b29c647584d95cc62dbe88f869ba5bf5d5 [diff] [blame]