Transceiver52M: Add NEON complex-complex multiply

Complex-complex block multiplies are used for phase rotation of
bursts. Optimization targeted from perf profiling.

Signed-off-by: Thomas Tsou <tom@tsou.cc>
diff --git a/Transceiver52M/Makefile.am b/Transceiver52M/Makefile.am
index 981447f..735265f 100644
--- a/Transceiver52M/Makefile.am
+++ b/Transceiver52M/Makefile.am
@@ -77,7 +77,8 @@
 	Resampler.h \
 	common/convolve.h \
 	common/convert.h \
-	common/scale.h
+	common/scale.h \
+	common/mult.h
 
 transceiver_SOURCES = runTransceiver.cpp
 transceiver_LDADD = \
diff --git a/Transceiver52M/arm/Makefile.am b/Transceiver52M/arm/Makefile.am
index 6d34daa..0b959be 100644
--- a/Transceiver52M/arm/Makefile.am
+++ b/Transceiver52M/arm/Makefile.am
@@ -17,5 +17,7 @@
 	convolve.c \
 	convolve_neon.S \
 	scale.c \
-	scale_neon.S
+	scale_neon.S \
+	mult.c \
+	mult_neon.S
 endif
diff --git a/Transceiver52M/arm/mult.c b/Transceiver52M/arm/mult.c
new file mode 100644
index 0000000..245be50
--- /dev/null
+++ b/Transceiver52M/arm/mult.c
@@ -0,0 +1,56 @@
+/*
+ * NEON complex multiplication
+ * Copyright (C) 2012,2013 Thomas Tsou <tom@tsou.cc>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <malloc.h>
+#include <string.h>
+#include <mult.h>
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+void neon_cmplx_mul_4n(float *, float *, float *, int);
+
+/* Scalar complex multiply: out[k] = a[k] * b[k] for k in [0, len).
+ * Buffers hold interleaved (re, im) floats. Both operands are read
+ * before either result is stored, so out may be the same buffer as a. */
+static void cmplx_mul_ps(float *out, float *a, float *b, int len)
+{
+	for (int n = 0; n < len; n++, a += 2, b += 2, out += 2) {
+		const float a_re = a[0];
+		const float a_im = a[1];
+		const float b_re = b[0];
+		const float b_im = b[1];
+
+		out[0] = a_re * b_re - a_im * b_im;
+		out[1] = a_re * b_im + a_im * b_re;
+	}
+}
+
+/* Multiply len complex float pairs: out[k] = a[k] * b[k]; NEON when possible */
+void mul_complex(float *out, float *a, float *b, int len)
+{
+#ifdef HAVE_NEON
+	if (len > 0 && len % 4 == 0) {	/* NEON loop needs >= 1 group of 4 */
+		neon_cmplx_mul_4n(out, a, b, len >> 2);
+		return;
+	}
+#endif
+	cmplx_mul_ps(out, a, b, len);	/* scalar path; no-op if len <= 0 */
+}
diff --git a/Transceiver52M/arm/mult_neon.S b/Transceiver52M/arm/mult_neon.S
new file mode 100644
index 0000000..162846e
--- /dev/null
+++ b/Transceiver52M/arm/mult_neon.S
@@ -0,0 +1,42 @@
+/*
+ * NEON complex multiplication
+ * Copyright (C) 2012,2013 Thomas Tsou <tom@tsou.cc>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+	.syntax unified
+	.text
+	.align 2
+	.global neon_cmplx_mul_4n
+	.type neon_cmplx_mul_4n, %function
+neon_cmplx_mul_4n:	/* r0 = out, r1 = a, r2 = b, r3 = number of 4-sample groups */
+	vpush       {q4-q7}	/* save q4-q7, used as scratch below */
+.loop_mul:	/* one pass multiplies 4 complex samples */
+	vld2.32     {q0-q1}, [r1]!	/* de-interleave a: q0 = re, q1 = im; advance r1 */
+	vld2.32     {q2-q3}, [r2]!	/* de-interleave b: q2 = re, q3 = im; advance r2 */
+	vmul.f32         q4, q0, q2	/* q4 = a.re * b.re */
+	vmul.f32         q5, q1, q3	/* q5 = a.im * b.im */
+	vmul.f32         q6, q0, q3	/* q6 = a.re * b.im */
+	vmul.f32         q7, q2, q1	/* q7 = b.re * a.im */
+	vsub.f32         q8, q4, q5	/* real part of the product */
+	vadd.f32         q9, q6, q7	/* imaginary part of the product */
+	vst2.32     {q8-q9}, [r0]!	/* re-interleave and store; advance r0 */
+	subs             r3, #1	/* count down the remaining groups, setting flags */
+	bne       .loop_mul	/* tested only after a pass, so r3 must be nonzero on entry */
+	vpop       {q4-q7}	/* restore the saved registers */
+	bx               lr	/* return to caller */
+	.size neon_cmplx_mul_4n, .-neon_cmplx_mul_4n
+	.section .note.GNU-stack,"",%progbits
diff --git a/Transceiver52M/common/mult.h b/Transceiver52M/common/mult.h
new file mode 100644
index 0000000..4d96efb
--- /dev/null
+++ b/Transceiver52M/common/mult.h
@@ -0,0 +1,6 @@
+#ifndef _MULT_H_
+#define _MULT_H_
+/* out[k] = a[k] * b[k] for len complex values stored as (re, im) floats */
+void mul_complex(float *out, float *a, float *b, int len);
+
+#endif /* _MULT_H_ */
diff --git a/Transceiver52M/sigProcLib.cpp b/Transceiver52M/sigProcLib.cpp
index 5a1ab77..ab421b6 100644
--- a/Transceiver52M/sigProcLib.cpp
+++ b/Transceiver52M/sigProcLib.cpp
@@ -32,6 +32,7 @@
 extern "C" {
 #include "convolve.h"
 #include "scale.h"
+#include "mult.h"
 }
 
 using namespace GSM;
@@ -287,6 +288,26 @@
 
 static void GMSKRotate(signalVector &x, int sps)
 {
+#if HAVE_NEON
+  size_t len;
+  signalVector *a, *b, *out;
+
+  a = &x;
+  out = &x;
+  len = out->size();
+
+  if (len == 157)
+    len--;
+
+  if (sps == 1)
+    b = GMSKRotation1;
+  else
+    b = GMSKRotationN;
+
+  mul_complex((float *) out->begin(),
+              (float *) a->begin(),
+              (float *) b->begin(), len);
+#else
   signalVector::iterator rotPtr, xPtr = x.begin();
 
   if (sps == 1)
@@ -306,6 +327,7 @@
       xPtr++;
     }
   }
+#endif
 }
 
 static void GMSKReverseRotate(signalVector &x, int sps)