WIP: add laforge-mmx.diff to illustrate the MMX-related changes I tried
diff --git a/libgsmhr/laforge-mmx.diff b/libgsmhr/laforge-mmx.diff
new file mode 100644
index 0000000..a846c25
--- /dev/null
+++ b/libgsmhr/laforge-mmx.diff
@@ -0,0 +1,167 @@
+Only in refsrc: .downloaded
+Only in refsrc: .sp_frm.c.swp
+diff -u refsrc.orig/sp_frm.c refsrc/sp_frm.c
+--- refsrc.orig/sp_frm.c	2015-12-27 19:22:13.966296058 +0100
++++ refsrc/sp_frm.c	2014-05-13 22:43:56.786205819 +0200
+@@ -60,6 +60,13 @@
+  *
+  **************************************************************************/
+ 
++#include <stdio.h>
++
++#include <stdint.h>
++//#define HAVE_MMX
++extern int32_t mmx_mac_unsat(int16_t *_x, int16_t *_y);
++extern int32_t mmx_mac_sat(int16_t *_x, int16_t *_y);
++
+ /*_________________________________________________________________________
+  |                                                                         |
+  |                            Include Files                                |
+@@ -384,6 +391,7 @@
+         /* get a vector */
+         /*--------------*/
+ 
++	      // 16, 32 or 64 iterations
+         getNextVec(pswRc);
+ 
+         /* clear the limiter flag */
+@@ -432,6 +440,8 @@
+         for (iCnt = 0; iCnt < quantList.iNum; iCnt++)
+         {
+ 
++	      // 4 * 16, 32 or 64 iterations
++
+           /* get a vector */
+           /*--------------*/
+ 
+@@ -931,6 +941,7 @@
+ 
+     for (i = 0; i <= bound; i++)
+     {
++	    // 3-4 iterations
+       L_sum = L_mac(L_ROUND, pswVOld[i], pswQntRc[j]);
+       L_sum = L_mac(L_sum, pswVOld[-i], pswQntRc[j]);
+       L_sum = L_mac(L_sum, pswPOld[i], pswQntRcSqd[j]);
+@@ -949,6 +960,7 @@
+ 
+     for (i = -bound; i < 0; i++)
+     {
++	    // 3-4 iterations
+       L_sum = L_msu(L_ROUND, pswVOld[i + 1], SW_MIN);
+       L_sum = L_mac(L_sum, pswQntRcSqd[j], pswVOld[-i - 1]);
+       L_sum = L_mac(L_sum, pswQntRc[j], pswPOld[-i - 1]);
+@@ -958,6 +970,7 @@
+ 
+     for (i = 0; i <= bound; i++)
+     {
++	    // 3-4 iterations
+       L_sum = L_msu(L_ROUND, pswVOld[i + 1], SW_MIN);
+       L_sum = L_mac(L_sum, pswQntRcSqd[j], pswVOld[-i - 1]);
+       L_sum = L_mac(L_sum, pswQntRc[j], pswPOld[i + 1]);
+@@ -4536,6 +4549,11 @@
+ 
+   pswScaledWSpeech = pswScaledWSpeechBuffer + LSMAX;
+ 
++#if 0
++  printf("G_FRAME_LEN=%d\n", G_FRAME_LEN);
++  printf("LSMIN=%d\n", LSMIN);
++  printf("LSMAX=%d\n", LSMAX);
++#endif
+ /*_________________________________________________________________________
+  |                                                                         |
+  |                            Executable Code                              |
+@@ -4633,11 +4651,20 @@
+   /*---------------------------------------------------------------------*/
+ 
+   L_G = 0;
++#ifndef HAVE_MMX
++  // 40 iterations (MMX: 5)
+   for (i = -LSMAX; i < -LSMAX + S_LEN; i++)
+     L_G = L_mac(L_G, pswScaledWSpeech[i], pswScaledWSpeech[i]);
++#else
++  for (i = -LSMAX; i < -LSMAX + S_LEN; i+=8)
++    L_G += mmx_mac_unsat(&pswScaledWSpeech[i], &pswScaledWSpeech[i]);
++#endif
+ 
+   pswGFrame[G_FRAME_LEN - 1] = extract_h(L_G);
+ 
++//#ifndef HAVE_MMX
++#if 1
++  // 248 iterations (MMX: 31); MMX variant in the #else below is disabled by '#if 1' and never stores pswGFrame[]
+   for (i = -LSMAX; i < G_FRAME_LEN - LSMAX - 1; i++)
+   {
+ 
+@@ -4646,6 +4673,13 @@
+                 pswScaledWSpeech[i + S_LEN]);
+     pswGFrame[G_FRAME_LEN - LSMAX - 2 - i] = extract_h(L_G);
+   }
++#else
++  for (i = -LSMAX; i < G_FRAME_LEN - LSMAX - 1; i+= 8) {
++	  L_G -= mmx_mac_sat(&pswScaledWSpeech[i], &pswScaledWSpeech[i]);
++	  L_G += mmx_mac_sat(&pswScaledWSpeech[i + S_LEN],
++			     &pswScaledWSpeech[i + S_LEN]);
++  } 
++#endif
+ 
+   ppswGSfrm[0] = pswGFrame + 3 * S_LEN;
+   ppswGSfrm[1] = pswGFrame + 2 * S_LEN;
+@@ -4661,8 +4695,14 @@
+   pswSfrmEng[2] = pswGFrame[G_FRAME_LEN - 1 - LSMAX - 2 * S_LEN];
+ 
+   L_WSfrmEng = 0;
++#ifndef HAVE_MMX
++  // 40 iterations (MMX: 5)
+   for (i = F_LEN - S_LEN; i < F_LEN; i++)
+     L_WSfrmEng = L_mac(L_WSfrmEng, pswScaledWSpeech[i], pswScaledWSpeech[i]);
++#else
++  for (i = F_LEN - S_LEN; i < F_LEN; i+= 8)
++    L_WSfrmEng += mmx_mac_unsat(&pswScaledWSpeech[i], &pswScaledWSpeech[i]);
++#endif
+ 
+   pswSfrmEng[3] = extract_h(L_WSfrmEng);
+ 
+@@ -4671,19 +4711,26 @@
+   /* as in the G buffer.)                                       */
+   /*------------------------------------------------------------*/
+ 
++  // 4 iterations
+   for (i = 0; i < N_SUB; i++)
+   {
+ 
++    // 127 iterations
+     for (j = LSMIN; j <= LSMAX; j++)
+     {
+-
+       L_C = 0;
++#ifndef HAVE_MMX
++      // 4*127*40 iterations (MMX: 4*127*5)
+       for (k = 0; k < S_LEN; k++)
+       {
+-
+         L_C = L_mac(L_C, pswScaledWSpeech[i * S_LEN + k],
+                     pswScaledWSpeech[i * S_LEN - j + k]);
+       }
++#else
++      for (k = 0; k < S_LEN; k+= 8)
++	      L_C += mmx_mac_unsat(&pswScaledWSpeech[i*S_LEN + k],
++			      	   &pswScaledWSpeech[i*S_LEN - j + k]);
++#endif
+ 
+       pswCFrame[i * CG_TERMS + j - LSMIN] = extract_h(L_C);
+     }
+@@ -4750,6 +4797,7 @@
+ 
+   L_Voicing = 0;
+   for (i = 0; i < N_SUB; i++)
++	  // 4 Iterations
+     L_Voicing = L_mac(L_Voicing, pswSfrmEng[i], UV_SCALE0);
+ 
+   L_Voicing = L_add(L_Voicing, L_deposit_h(swBestPG));
+@@ -5085,6 +5133,7 @@
+       siLowestSoFar = 2;
+       for (i = 0; i < N_SUB; i++)
+       {
++	      // 4 iterations
+ 
+         /* Check this subframe against highest voicing threshold */
+         /*-------------------------------------------------------*/