Transceiver52M: Add SSE floating point / integer conversion

Convertions are performed in multiples of 4 or 8. All loads are
considered unaligned.

Signed-off-by: Thomas Tsou <tom@tsou.cc>
diff --git a/Transceiver52M/Makefile.am b/Transceiver52M/Makefile.am
index c2f8ece..d002b04 100644
--- a/Transceiver52M/Makefile.am
+++ b/Transceiver52M/Makefile.am
@@ -53,7 +53,8 @@
 	sigProcLib.cpp \
 	Transceiver.cpp \
 	DummyLoad.cpp \
-	convolve.c
+	convolve.c \
+	convert.c
 
 libtransceiver_la_SOURCES = \
 	$(COMMON_SOURCES) \
@@ -78,7 +79,8 @@
 	rcvLPF_651.h \
 	sendLPF_961.h \
 	Resampler.h \
-	convolve.h
+	convolve.h \
+	convert.h
 
 USRPping_SOURCES = USRPping.cpp
 USRPping_LDADD = \
diff --git a/Transceiver52M/convert.c b/Transceiver52M/convert.c
new file mode 100644
index 0000000..dc5e748
--- /dev/null
+++ b/Transceiver52M/convert.c
@@ -0,0 +1,199 @@
+/*
+ * SSE type conversions
+ * Copyright (C) 2013 Thomas Tsou <tom@tsou.cc>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <malloc.h>
+#include <string.h>
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#ifdef HAVE_SSE3
+#include <xmmintrin.h>
+#include <emmintrin.h>
+
+#ifdef HAVE_SSE4_1
+#include <smmintrin.h>
+
+/* 16*N 16-bit signed integer converted to single precision floats */
+static void _sse_convert_si16_ps_16n(float *restrict out,
+				     short *restrict in,
+				     int len)
+{
+	__m128i m0, m1, m2, m3, m4, m5;
+	__m128 m6, m7, m8, m9;
+
+	for (int i = 0; i < len / 16; i++) {
+		/* Load (unaligned) packed floats */
+		m0 = _mm_loadu_si128((__m128i *) &in[16 * i + 0]);
+		m1 = _mm_loadu_si128((__m128i *) &in[16 * i + 8]);
+
+		/* Unpack */
+		m2 = _mm_cvtepi16_epi32(m0);
+		m4 = _mm_cvtepi16_epi32(m1);
+		m0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2));
+		m1 = _mm_shuffle_epi32(m1, _MM_SHUFFLE(1, 0, 3, 2));
+		m3 = _mm_cvtepi16_epi32(m0);
+		m5 = _mm_cvtepi16_epi32(m1);
+
+		/* Convert */
+		m6 = _mm_cvtepi32_ps(m2);
+		m7 = _mm_cvtepi32_ps(m3);
+		m8 = _mm_cvtepi32_ps(m4);
+		m9 = _mm_cvtepi32_ps(m5);
+
+		/* Store */
+		_mm_storeu_ps(&out[16 * i + 0], m6);
+		_mm_storeu_ps(&out[16 * i + 4], m7);
+		_mm_storeu_ps(&out[16 * i + 8], m8);
+		_mm_storeu_ps(&out[16 * i + 12], m9);
+	}
+}
+
+/* 16*N 16-bit signed integer conversion with remainder */
+static void _sse_convert_si16_ps(float *restrict out,
+				 short *restrict in,
+				 int len)
+{
+	int start = len / 16 * 16;
+
+	_sse_convert_si16_ps_16n(out, in, len);
+
+	for (int i = 0; i < len % 16; i++)
+		out[start + i] = in[start + i];
+}
+#endif /* HAVE_SSE4_1 */
+
+/* 8*N single precision floats scaled and converted to 16-bit signed integer */
+static void _sse_convert_scale_ps_si16_8n(short *restrict out,
+					  float *restrict in,
+					  float scale, int len)
+{
+	__m128 m0, m1, m2;
+	__m128i m4, m5;
+
+	for (int i = 0; i < len / 8; i++) {
+		/* Load (unaligned) packed floats */
+		m0 = _mm_loadu_ps(&in[8 * i + 0]);
+		m1 = _mm_loadu_ps(&in[8 * i + 4]);
+		m2 = _mm_load1_ps(&scale);
+
+		/* Scale */
+		m0 = _mm_mul_ps(m0, m2);
+		m1 = _mm_mul_ps(m1, m2);
+
+		/* Convert */
+		m4 = _mm_cvtps_epi32(m0);
+		m5 = _mm_cvtps_epi32(m1);
+
+		/* Pack and store */
+		m5 = _mm_packs_epi32(m4, m5);
+		_mm_storeu_si128((__m128i *) &out[8 * i], m5);
+	}
+}
+
+/* 8*N single precision floats scaled and converted with remainder */
+static void _sse_convert_scale_ps_si16(short *restrict out,
+				       float *restrict in,
+				       float scale, int len)
+{
+	int start = len / 8 * 8;
+
+	_sse_convert_scale_ps_si16_8n(out, in, scale, len);
+
+	for (int i = 0; i < len % 8; i++)
+		out[start + i] = in[start + i] * scale;
+}
+
+/* 16*N single precision floats scaled and converted to 16-bit signed integer */
+static void _sse_convert_scale_ps_si16_16n(short *restrict out,
+					   float *restrict in,
+					   float scale, int len)
+{
+	__m128 m0, m1, m2, m3, m4;
+	__m128i m5, m6, m7, m8;
+
+	for (int i = 0; i < len / 16; i++) {
+		/* Load (unaligned) packed floats */
+		m0 = _mm_loadu_ps(&in[16 * i + 0]);
+		m1 = _mm_loadu_ps(&in[16 * i + 4]);
+		m2 = _mm_loadu_ps(&in[16 * i + 8]);
+		m3 = _mm_loadu_ps(&in[16 * i + 12]);
+		m4 = _mm_load1_ps(&scale);
+
+		/* Scale */
+		m0 = _mm_mul_ps(m0, m4);
+		m1 = _mm_mul_ps(m1, m4);
+		m2 = _mm_mul_ps(m2, m4);
+		m3 = _mm_mul_ps(m3, m4);
+
+		/* Convert */
+		m5 = _mm_cvtps_epi32(m0);
+		m6 = _mm_cvtps_epi32(m1);
+		m7 = _mm_cvtps_epi32(m2);
+		m8 = _mm_cvtps_epi32(m3);
+
+		/* Pack and store */
+		m5 = _mm_packs_epi32(m5, m6);
+		m7 = _mm_packs_epi32(m7, m8);
+		_mm_storeu_si128((__m128i *) &out[16 * i + 0], m5);
+		_mm_storeu_si128((__m128i *) &out[16 * i + 8], m7);
+	}
+}
+#else /* HAVE_SSE3 */
+static void convert_scale_ps_si16(short *out, float *in, float scale, int len)
+{
+	for (int i = 0; i < len; i++)
+		out[i] = in[i] * scale;
+}
+#endif
+
+#ifndef HAVE_SSE_4_1
+static void convert_si16_ps(float *out, short *in, int len)
+{
+	for (int i = 0; i < len; i++)
+		out[i] = in[i];
+}
+#endif
+
+void convert_float_short(short *out, float *in, float scale, int len)
+{
+#ifdef HAVE_SSE3
+	if (!(len % 16))
+		_sse_convert_scale_ps_si16_16n(out, in, scale, len);
+	else if (!(len % 8))
+		_sse_convert_scale_ps_si16_8n(out, in, scale, len);
+	else
+		_sse_convert_scale_ps_si16(out, in, scale, len);
+#else
+	convert_scale_ps_si16(out, in, scale, len);
+#endif
+}
+
+void convert_short_float(float *out, short *in, int len)
+{
+#ifdef HAVE_SSE4_1
+	if (!(len % 16))
+		_sse_convert_si16_ps_16n(out, in, len);
+	else
+		_sse_convert_si16_ps(out, in, len);
+#else
+	convert_si16_ps(out, in, len);
+#endif
+}
diff --git a/Transceiver52M/convert.h b/Transceiver52M/convert.h
new file mode 100644
index 0000000..5b557bf
--- /dev/null
+++ b/Transceiver52M/convert.h
@@ -0,0 +1,7 @@
+#ifndef _CONVERT_H_
+#define _CONVERT_H_
+
+void convert_float_short(short *out, float *in, float scale, int len);
+void convert_short_float(float *out, short *in, int len);
+
+#endif /* _CONVERT_H_ */
diff --git a/Transceiver52M/radioInterface.cpp b/Transceiver52M/radioInterface.cpp
index 1e25b5d..f39a470 100644
--- a/Transceiver52M/radioInterface.cpp
+++ b/Transceiver52M/radioInterface.cpp
@@ -25,30 +25,12 @@
 #include "radioInterface.h"
 #include <Logger.h>
 
+extern "C" {
+#include "convert.h"
+}
+
 bool started = false;
 
-/* Device side buffers */
-static short rx_buf[OUTCHUNK * 2 * 2];
-static short tx_buf[INCHUNK * 2 * 2];
-
-/* Complex float to short conversion */
-static void floatToShort(short *out, float *in, int num)
-{
-  for (int i = 0; i < num; i++) {
-    out[2 * i + 0] = (short) in[2 * i + 0];
-    out[2 * i + 1] = (short) in[2 * i + 1];
-  }
-}
-
-/* Complex short to float conversion */
-static void shortToFloat(float *out, short *in, int num)
-{
-  for (int i = 0; i < num; i++) {
-    out[2 * i + 0] = (float) in[2 * i + 0];
-    out[2 * i + 1] = (float) in[2 * i + 1];
-  }
-}
-
 RadioInterface::RadioInterface(RadioDevice *wRadio,
 			       int wReceiveOffset,
 			       int wSPS,
@@ -96,6 +78,7 @@
   convertSendBuffer = NULL;
 }
 
+
 double RadioInterface::fullScaleInputValue(void) {
   return mRadio->fullScaleInputValue();
 }
@@ -120,22 +103,14 @@
 
 int RadioInterface::radioifyVector(signalVector &wVector,
 				   float *retVector,
-				   float scale,
 				   bool zero)
 {
-  int i;
-  signalVector::iterator itr = wVector.begin();
-
   if (zero) {
     memset(retVector, 0, wVector.size() * 2 * sizeof(float));
     return wVector.size();
   }
 
-  for (i = 0; i < wVector.size(); i++) {
-    retVector[2 * i + 0] = itr->real() * scale;
-    retVector[2 * i + 1] = itr->imag() * scale;
-    itr++;
-  }
+  memcpy(retVector, wVector.begin(), wVector.size() * 2 * sizeof(float));
 
   return wVector.size();
 }
@@ -143,10 +118,14 @@
 int RadioInterface::unRadioifyVector(float *floatVector,
 				     signalVector& newVector)
 {
-  int i;
   signalVector::iterator itr = newVector.begin();
 
-  for (i = 0; i < newVector.size(); i++) {
+  if (newVector.size() > recvCursor) {
+    LOG(ALERT) << "Insufficient number of samples in receive buffer";
+    return -1;
+  }
+
+  for (int i = 0; i < newVector.size(); i++) {
     *itr++ = Complex<float>(floatVector[2 * i + 0],
 			    floatVector[2 * i + 1]);
   }
@@ -205,8 +184,7 @@
     return;
 
   radioifyVector(radioBurst,
-                 (float *) (sendBuffer->begin() + sendCursor),
-                 powerScaling, zeroBurst);
+                 (float *) (sendBuffer->begin() + sendCursor), zeroBurst);
 
   sendCursor += radioBurst.size();
 
@@ -293,36 +271,49 @@
 void RadioInterface::pullBuffer()
 {
   bool local_underrun;
+  int num_recv;
 
-  /* Read samples. Fail if we don't get what we want. */
-  int num_rd = mRadio->readSamples(rx_buf, OUTCHUNK, &overrun,
-                                   readTimestamp, &local_underrun);
+  /* Outer buffer access size is fixed */ 
+  num_recv = mRadio->readSamples(convertRecvBuffer,
+                                 OUTCHUNK,
+                                 &overrun,
+                                 readTimestamp,
+                                 &local_underrun);
+  if (num_recv != OUTCHUNK) {
+          LOG(ALERT) << "Receive error " << num_recv;
+          return;
+  }
 
-  LOG(DEBUG) << "Rx read " << num_rd << " samples from device";
-  assert(num_rd == OUTCHUNK);
+  convert_short_float((float *) (recvBuffer->begin() + recvCursor),
+                      convertRecvBuffer, 2 * OUTCHUNK);
 
   underrun |= local_underrun;
-  readTimestamp += (TIMESTAMP) num_rd;
+  readTimestamp += num_recv;
 
-  shortToFloat((float *) recvBuffer->begin() + recvCursor, rx_buf, num_rd);
-  recvCursor += num_rd;
+  recvCursor += num_recv;
 }
 
 /* Send timestamped chunk to the device with arbitrary size */ 
 void RadioInterface::pushBuffer()
 {
+  int num_sent;
+
   if (sendCursor < INCHUNK)
     return;
 
-  floatToShort(tx_buf, (float *) sendBuffer->begin(), sendCursor);
+  convert_float_short(convertSendBuffer,
+                      (float *) sendBuffer->begin(),
+                      powerScaling, 2 * sendCursor);
 
-  /* Write samples. Fail if we don't get what we want. */
-  int num_smpls = mRadio->writeSamples(tx_buf,
-                                       sendCursor,
-                                       &underrun,
-                                       writeTimestamp);
-  assert(num_smpls == sendCursor);
+  /* Send the all samples in the send buffer */ 
+  num_sent = mRadio->writeSamples(convertSendBuffer,
+                                  sendCursor,
+                                  &underrun,
+                                  writeTimestamp);
+  if (num_sent != sendCursor) {
+          LOG(ALERT) << "Transmit error " << num_sent;
+  }
 
-  writeTimestamp += (TIMESTAMP) num_smpls;
+  writeTimestamp += num_sent;
   sendCursor = 0;
 }
diff --git a/Transceiver52M/radioInterface.h b/Transceiver52M/radioInterface.h
index 690d572..a590c32 100644
--- a/Transceiver52M/radioInterface.h
+++ b/Transceiver52M/radioInterface.h
@@ -70,7 +70,6 @@
   /** format samples to USRP */ 
   int radioifyVector(signalVector &wVector,
                      float *floatVector,
-                     float scale,
                      bool zero);
 
   /** format samples from USRP */
diff --git a/Transceiver52M/radioInterfaceResamp.cpp b/Transceiver52M/radioInterfaceResamp.cpp
index d3dc82e..2b59203 100644
--- a/Transceiver52M/radioInterfaceResamp.cpp
+++ b/Transceiver52M/radioInterfaceResamp.cpp
@@ -49,20 +49,6 @@
 short *convertRecvBuffer = NULL;
 short *convertSendBuffer = NULL;
 
-/* Complex float to short conversion */
-static void floatToShort(short *out, float *in, int num)
-{
-  for (int i = 0; i < num; i++)
-    out[i] = (short) in[i];
-}
-
-/* Complex short to float conversion */
-static void shortToFloat(float *out, short *in, int num)
-{
-  for (int i = 0; i < num; i++)
-    out[i] = (float) in[i];
-}
-
 RadioInterfaceResamp::RadioInterfaceResamp(RadioDevice *wRadio,
 					   int wReceiveOffset,
 					   int wSPS,
@@ -166,8 +152,8 @@
 		return;
 	}
 
-	shortToFloat((float *) outerRecvBuffer->begin(),
-		     convertRecvBuffer, 2 * outer_len);
+	convert_short_float((float *) outerRecvBuffer->begin(),
+			    convertRecvBuffer, 2 * outer_len);
 
 	underrun |= local_underrun;
 	readTimestamp += (TIMESTAMP) num_recv;
@@ -206,9 +192,9 @@
 		LOG(ALERT) << "Sample rate downsampling error";
 	}
 
-	floatToShort(convertSendBuffer,
-		     (float *) outerSendBuffer->begin(),
-		     2 * outer_len);
+	convert_float_short(convertSendBuffer,
+			    (float *) outerSendBuffer->begin(),
+			    powerScaling, 2 * outer_len);
 
 	num_sent = mRadio->writeSamples(convertSendBuffer,
 					outer_len,