mme/snow-3g.c - osmo-ttcn3-hacks - Gitiles

 /*------------------------------------------------------------------------
 * SNOW_3G.c
 *------------------------------------------------------------------------*/

 #include "snow-3g.h"

 /* LFSR state: the sixteen 32-bit stages S0..S15.
  * File-scope and shared by every function below, so only one cipher
  * state exists at a time. Loaded by snow_3g_initialize() and advanced
  * by the two ClockLFSR* functions. */

 static u32 LFSR_S0 = 0x00;
 static u32 LFSR_S1 = 0x00;
 static u32 LFSR_S2 = 0x00;
 static u32 LFSR_S3 = 0x00;
 static u32 LFSR_S4 = 0x00;
 static u32 LFSR_S5 = 0x00;
 static u32 LFSR_S6 = 0x00;
 static u32 LFSR_S7 = 0x00;
 static u32 LFSR_S8 = 0x00;
 static u32 LFSR_S9 = 0x00;
 static u32 LFSR_S10 = 0x00;
 static u32 LFSR_S11 = 0x00;
 static u32 LFSR_S12 = 0x00;
 static u32 LFSR_S13 = 0x00;
 static u32 LFSR_S14 = 0x00;
 static u32 LFSR_S15 = 0x00;

 /* FSM state: the three 32-bit registers R1, R2, R3.
  * Cleared by snow_3g_initialize() and updated by ClockFSM(). */

 static u32 FSM_R1 = 0x00;
 static u32 FSM_R2 = 0x00;
 static u32 FSM_R3 = 0x00;

 /* Rijndael S-box SR: 256-entry byte substitution table.
  * S1() looks up each byte of its 32-bit input here. */

 static const u8 SR[256] = {
 0x63,0x7C,0x77,0x7B,0xF2,0x6B,0x6F,0xC5,0x30,0x01,0x67,0x2B,0xFE,0xD7,0xAB,0x76,
 0xCA,0x82,0xC9,0x7D,0xFA,0x59,0x47,0xF0,0xAD,0xD4,0xA2,0xAF,0x9C,0xA4,0x72,0xC0,
 0xB7,0xFD,0x93,0x26,0x36,0x3F,0xF7,0xCC,0x34,0xA5,0xE5,0xF1,0x71,0xD8,0x31,0x15,
 0x04,0xC7,0x23,0xC3,0x18,0x96,0x05,0x9A,0x07,0x12,0x80,0xE2,0xEB,0x27,0xB2,0x75,
 0x09,0x83,0x2C,0x1A,0x1B,0x6E,0x5A,0xA0,0x52,0x3B,0xD6,0xB3,0x29,0xE3,0x2F,0x84,
 0x53,0xD1,0x00,0xED,0x20,0xFC,0xB1,0x5B,0x6A,0xCB,0xBE,0x39,0x4A,0x4C,0x58,0xCF,
 0xD0,0xEF,0xAA,0xFB,0x43,0x4D,0x33,0x85,0x45,0xF9,0x02,0x7F,0x50,0x3C,0x9F,0xA8,
 0x51,0xA3,0x40,0x8F,0x92,0x9D,0x38,0xF5,0xBC,0xB6,0xDA,0x21,0x10,0xFF,0xF3,0xD2,
 0xCD,0x0C,0x13,0xEC,0x5F,0x97,0x44,0x17,0xC4,0xA7,0x7E,0x3D,0x64,0x5D,0x19,0x73,
 0x60,0x81,0x4F,0xDC,0x22,0x2A,0x90,0x88,0x46,0xEE,0xB8,0x14,0xDE,0x5E,0x0B,0xDB,
 0xE0,0x32,0x3A,0x0A,0x49,0x06,0x24,0x5C,0xC2,0xD3,0xAC,0x62,0x91,0x95,0xE4,0x79,
 0xE7,0xC8,0x37,0x6D,0x8D,0xD5,0x4E,0xA9,0x6C,0x56,0xF4,0xEA,0x65,0x7A,0xAE,0x08,
 0xBA,0x78,0x25,0x2E,0x1C,0xA6,0xB4,0xC6,0xE8,0xDD,0x74,0x1F,0x4B,0xBD,0x8B,0x8A,
 0x70,0x3E,0xB5,0x66,0x48,0x03,0xF6,0x0E,0x61,0x35,0x57,0xB9,0x86,0xC1,0x1D,0x9E,
 0xE1,0xF8,0x98,0x11,0x69,0xD9,0x8E,0x94,0x9B,0x1E,0x87,0xE9,0xCE,0x55,0x28,0xDF,
 0x8C,0xA1,0x89,0x0D,0xBF,0xE6,0x42,0x68,0x41,0x99,0x2D,0x0F,0xB0,0x54,0xBB,0x16
 };

 /* S-box SQ: 256-entry byte substitution table.
  * S2() looks up each byte of its 32-bit input here. */

 static const u8 SQ[256] = {
 0x25,0x24,0x73,0x67,0xD7,0xAE,0x5C,0x30,0xA4,0xEE,0x6E,0xCB,0x7D,0xB5,0x82,0xDB,
 0xE4,0x8E,0x48,0x49,0x4F,0x5D,0x6A,0x78,0x70,0x88,0xE8,0x5F,0x5E,0x84,0x65,0xE2,
 0xD8,0xE9,0xCC,0xED,0x40,0x2F,0x11,0x28,0x57,0xD2,0xAC,0xE3,0x4A,0x15,0x1B,0xB9,
 0xB2,0x80,0x85,0xA6,0x2E,0x02,0x47,0x29,0x07,0x4B,0x0E,0xC1,0x51,0xAA,0x89,0xD4,
 0xCA,0x01,0x46,0xB3,0xEF,0xDD,0x44,0x7B,0xC2,0x7F,0xBE,0xC3,0x9F,0x20,0x4C,0x64,
 0x83,0xA2,0x68,0x42,0x13,0xB4,0x41,0xCD,0xBA,0xC6,0xBB,0x6D,0x4D,0x71,0x21,0xF4,
 0x8D,0xB0,0xE5,0x93,0xFE,0x8F,0xE6,0xCF,0x43,0x45,0x31,0x22,0x37,0x36,0x96,0xFA,
 0xBC,0x0F,0x08,0x52,0x1D,0x55,0x1A,0xC5,0x4E,0x23,0x69,0x7A,0x92,0xFF,0x5B,0x5A,
 0xEB,0x9A,0x1C,0xA9,0xD1,0x7E,0x0D,0xFC,0x50,0x8A,0xB6,0x62,0xF5,0x0A,0xF8,0xDC,
 0x03,0x3C,0x0C,0x39,0xF1,0xB8,0xF3,0x3D,0xF2,0xD5,0x97,0x66,0x81,0x32,0xA0,0x00,
 0x06,0xCE,0xF6,0xEA,0xB7,0x17,0xF7,0x8C,0x79,0xD6,0xA7,0xBF,0x8B,0x3F,0x1F,0x53,
 0x63,0x75,0x35,0x2C,0x60,0xFD,0x27,0xD3,0x94,0xA5,0x7C,0xA1,0x05,0x58,0x2D,0xBD,
 0xD9,0xC7,0xAF,0x6B,0x54,0x0B,0xE0,0x38,0x04,0xC8,0x9D,0xE7,0x14,0xB1,0x87,0x9C,
 0xDF,0x6F,0xF9,0xDA,0x2A,0xC4,0x59,0x16,0x74,0x91,0xAB,0x26,0x61,0x76,0x34,0x2B,
 0xAD,0x99,0xFB,0x72,0xEC,0x33,0x12,0xDE,0x98,0x3B,0xC0,0x9B,0x3E,0x18,0x10,0x3A,
 0x56,0xE1,0x77,0xC9,0x1E,0x9E,0x95,0xA3,0x90,0x19,0xA8,0x6C,0x09,0xD0,0xF0,0x86
 };

 /* MULx.
  * Shifts V left by one bit; when bit 7 of V was set, the constant c is
  * XORed into the shifted value.
  * Input V: an 8-bit input.
  * Input c: an 8-bit input.
  * Output : an 8-bit output.
  * See section 3.1.1 for details.
  */
 static u8 MULx(u8 V, u8 c)
 {
 	u8 shifted = (u8)(V << 1);

 	return (V & 0x80) ? (u8)(shifted ^ c) : shifted;
 }

 /* MULxPOW.
  * Applies MULx to V exactly i times with the constant c (i == 0 returns V
  * unchanged).
  * Input V: an 8-bit input.
  * Input i: a non-negative integer.
  * Input c: an 8-bit input.
  * Output : an 8-bit output.
  * See section 3.1.2 for details.
  */
 static u8 MULxPOW(u8 V, u8 i, u8 c)
 {
 	u8 acc = V;
 	u8 step;

 	for (step = 0; step < i; step++)
 		acc = MULx(acc, c);
 	return acc;
 }

 /* The function MUL alpha.
  * Builds a 32-bit word from four MULxPOW results of c (constant 0xa9),
  * using the exponents 23, 245, 48 and 239 from most to least significant
  * byte.
  * Input c: 8-bit input.
  * Output : 32-bit output.
  * See section 3.4.2 for details.
  */
 static u32 MULalpha(u8 c)
 {
 	u32 byte3 = MULxPOW(c, 23, 0xa9);
 	u32 byte2 = MULxPOW(c, 245, 0xa9);
 	u32 byte1 = MULxPOW(c, 48, 0xa9);
 	u32 byte0 = MULxPOW(c, 239, 0xa9);

 	return (byte3 << 24) | (byte2 << 16) | (byte1 << 8) | byte0;
 }

 /* The function DIV alpha.
  * Builds a 32-bit word from four MULxPOW results of c (constant 0xa9),
  * using the exponents 16, 39, 6 and 64 from most to least significant
  * byte.
  * Input c: 8-bit input.
  * Output : 32-bit output.
  * See section 3.4.3 for details.
  */
 static u32 DIValpha(u8 c)
 {
 	u32 byte3 = MULxPOW(c, 16, 0xa9);
 	u32 byte2 = MULxPOW(c, 39, 0xa9);
 	u32 byte1 = MULxPOW(c, 6, 0xa9);
 	u32 byte0 = MULxPOW(c, 64, 0xa9);

 	return (byte3 << 24) | (byte2 << 16) | (byte1 << 8) | byte0;
 }

 /* The 32x32-bit S-Box S1
  * Each byte of w is substituted through SR, then the four substituted
  * bytes are mixed with XORs and MULx (constant 0x1b) into four output
  * bytes.
  * Input: a 32-bit input.
  * Output: a 32-bit output of S1 box.
  * See section 3.3.1.
  */
 static u32 S1(u32 w)
 {
 	/* Substituted bytes, most significant byte of w first. */
 	const u8 a = SR[(u8)(w >> 24)];
 	const u8 b = SR[(u8)(w >> 16)];
 	const u8 c = SR[(u8)(w >> 8)];
 	const u8 d = SR[(u8)w];

 	/* MULx of each substituted byte, computed once and reused. */
 	const u8 a2 = MULx(a, 0x1b);
 	const u8 b2 = MULx(b, 0x1b);
 	const u8 c2 = MULx(c, 0x1b);
 	const u8 d2 = MULx(d, 0x1b);

 	const u8 out0 = a2 ^ b ^ c ^ (d2 ^ d);
 	const u8 out1 = (a2 ^ a) ^ b2 ^ c ^ d;
 	const u8 out2 = a ^ (b2 ^ b) ^ c2 ^ d;
 	const u8 out3 = a ^ b ^ (c2 ^ c) ^ d2;

 	return ((u32)out0 << 24) | ((u32)out1 << 16) |
 	       ((u32)out2 << 8) | (u32)out3;
 }

 /* The 32x32-bit S-Box S2
  * Each byte of w is substituted through SQ, then the four substituted
  * bytes are mixed with XORs and MULx (constant 0x69) into four output
  * bytes.
  * Input: a 32-bit input.
  * Output: a 32-bit output of S2 box.
  * See section 3.3.2.
  */
 static u32 S2(u32 w)
 {
 	/* Substituted bytes, most significant byte of w first. */
 	const u8 a = SQ[(u8)(w >> 24)];
 	const u8 b = SQ[(u8)(w >> 16)];
 	const u8 c = SQ[(u8)(w >> 8)];
 	const u8 d = SQ[(u8)w];

 	/* MULx of each substituted byte, computed once and reused. */
 	const u8 a2 = MULx(a, 0x69);
 	const u8 b2 = MULx(b, 0x69);
 	const u8 c2 = MULx(c, 0x69);
 	const u8 d2 = MULx(d, 0x69);

 	const u8 out0 = a2 ^ b ^ c ^ (d2 ^ d);
 	const u8 out1 = (a2 ^ a) ^ b2 ^ c ^ d;
 	const u8 out2 = a ^ (b2 ^ b) ^ c2 ^ d;
 	const u8 out3 = a ^ b ^ (c2 ^ c) ^ d2;

 	return ((u32)out0 << 24) | ((u32)out1 << 16) |
 	       ((u32)out2 << 8) | (u32)out3;
 }

 /* Clocking LFSR in initialization mode.
  * Computes the feedback word from S0, S2, S11 and the FSM output F, moves
  * every stage down by one (S0 <- S1, ..., S14 <- S15) and stores the
  * feedback word in S15.
  * Input F: a 32-bit word comes from output of FSM.
  * See section 3.4.4.
  */
 static void ClockLFSRInitializationMode(u32 F)
 {
 	u32 s0_shifted = (LFSR_S0 << 8) & 0xffffff00;
 	u32 s0_alpha = MULalpha((u8)((LFSR_S0 >> 24) & 0xff));
 	u32 s11_shifted = (LFSR_S11 >> 8) & 0x00ffffff;
 	u32 s11_alpha = DIValpha((u8)(LFSR_S11 & 0xff));
 	u32 feedback = s0_shifted ^ s0_alpha ^ LFSR_S2 ^
 		       s11_shifted ^ s11_alpha ^ F;

 	/* Shift order matters: each stage takes the old value of the next. */
 	LFSR_S0 = LFSR_S1;
 	LFSR_S1 = LFSR_S2;
 	LFSR_S2 = LFSR_S3;
 	LFSR_S3 = LFSR_S4;
 	LFSR_S4 = LFSR_S5;
 	LFSR_S5 = LFSR_S6;
 	LFSR_S6 = LFSR_S7;
 	LFSR_S7 = LFSR_S8;
 	LFSR_S8 = LFSR_S9;
 	LFSR_S9 = LFSR_S10;
 	LFSR_S10 = LFSR_S11;
 	LFSR_S11 = LFSR_S12;
 	LFSR_S12 = LFSR_S13;
 	LFSR_S13 = LFSR_S14;
 	LFSR_S14 = LFSR_S15;
 	LFSR_S15 = feedback;
 }

 /* Clocking LFSR in keystream mode.
  * Computes the feedback word from S0, S2 and S11 (no FSM input), moves
  * every stage down by one (S0 <- S1, ..., S14 <- S15) and stores the
  * feedback word in S15.
  * See section 3.4.5.
  */
 static void ClockLFSRKeyStreamMode()
 {
 	u32 s0_shifted = (LFSR_S0 << 8) & 0xffffff00;
 	u32 s0_alpha = MULalpha((u8)((LFSR_S0 >> 24) & 0xff));
 	u32 s11_shifted = (LFSR_S11 >> 8) & 0x00ffffff;
 	u32 s11_alpha = DIValpha((u8)(LFSR_S11 & 0xff));
 	u32 feedback = s0_shifted ^ s0_alpha ^ LFSR_S2 ^
 		       s11_shifted ^ s11_alpha;

 	/* Shift order matters: each stage takes the old value of the next. */
 	LFSR_S0 = LFSR_S1;
 	LFSR_S1 = LFSR_S2;
 	LFSR_S2 = LFSR_S3;
 	LFSR_S3 = LFSR_S4;
 	LFSR_S4 = LFSR_S5;
 	LFSR_S5 = LFSR_S6;
 	LFSR_S6 = LFSR_S7;
 	LFSR_S7 = LFSR_S8;
 	LFSR_S8 = LFSR_S9;
 	LFSR_S9 = LFSR_S10;
 	LFSR_S10 = LFSR_S11;
 	LFSR_S11 = LFSR_S12;
 	LFSR_S12 = LFSR_S13;
 	LFSR_S13 = LFSR_S14;
 	LFSR_S14 = LFSR_S15;
 	LFSR_S15 = feedback;
 }

 /* Clocking FSM.
  * Produces the 32-bit word F = (S15 + R1) ^ R2 from the current state,
  * then updates R1, R2, R3; all three new values are derived from the
  * register contents as they were before the call.
  * See Section 3.4.6.
  */
 static u32 ClockFSM()
 {
 	u32 out_word = ((LFSR_S15 + FSM_R1) & 0xffffffff) ^ FSM_R2;
 	u32 next_r1 = (FSM_R2 + (FSM_R3 ^ LFSR_S5)) & 0xffffffff;
 	u32 next_r2 = S1(FSM_R1);
 	u32 next_r3 = S2(FSM_R2);

 	FSM_R1 = next_r1;
 	FSM_R2 = next_r2;
 	FSM_R3 = next_r3;
 	return out_word;
 }

 /* Initialization.
  * Loads the LFSR stages from the key and IV words, clears the FSM
  * registers, then runs 32 rounds in which the FSM output is fed back
  * into the LFSR.
  * Input k[4]: Four 32-bit words making up 128-bit key.
  * Input IV[4]: Four 32-bit words making 128-bit initialization variable.
  * Output: All the LFSRs and FSM are initialized for key generation.
  * See Section 4.1.
  */
 void snow_3g_initialize(u32 k[4], u32 IV[4])
 {
 	const u32 ones = 0xffffffff;
 	int round;

 	/* S0..S3: inverted key words. */
 	LFSR_S0 = k[0] ^ ones;
 	LFSR_S1 = k[1] ^ ones;
 	LFSR_S2 = k[2] ^ ones;
 	LFSR_S3 = k[3] ^ ones;

 	/* S4..S7: plain key words. */
 	LFSR_S4 = k[0];
 	LFSR_S5 = k[1];
 	LFSR_S6 = k[2];
 	LFSR_S7 = k[3];

 	/* S8..S11: inverted key words, with IV[3] and IV[2] mixed in. */
 	LFSR_S8 = k[0] ^ ones;
 	LFSR_S9 = k[1] ^ ones ^ IV[3];
 	LFSR_S10 = k[2] ^ ones ^ IV[2];
 	LFSR_S11 = k[3] ^ ones;

 	/* S12..S15: plain key words, with IV[1] and IV[0] mixed in. */
 	LFSR_S12 = k[0] ^ IV[1];
 	LFSR_S13 = k[1];
 	LFSR_S14 = k[2];
 	LFSR_S15 = k[3] ^ IV[0];

 	FSM_R1 = 0x0;
 	FSM_R2 = 0x0;
 	FSM_R3 = 0x0;

 	for (round = 0; round < 32; round++)
 		ClockLFSRInitializationMode(ClockFSM());
 }

 /* Generation of Keystream.
  * Clocks the FSM and LFSR once with the output discarded, then produces
  * n keystream words. snow_3g_initialize() must have set up the state.
  * input n: number of 32-bit words of keystream.
  * input ks: space for the generated keystream, assumes
  * memory is allocated already.
  * output: generated keystream which is filled in ks
  * See section 4.2.
  */
 void snow_3g_generate_key_stream(u32 n, u32 *ks)
 {
 	u32 word;

 	/* One discarded clock of FSM and LFSR before any output. */
 	(void)ClockFSM();
 	ClockLFSRKeyStreamMode();

 	/* ks[word] corresponds to z_{word+1} in section 4.2. */
 	for (word = 0; word != n; ++word) {
 		u32 fsm_out = ClockFSM();

 		ks[word] = fsm_out ^ LFSR_S0;
 		ClockLFSRKeyStreamMode();
 	}
 }

 /*-----------------------------------------------------------------------
 * end of SNOW_3G.c
 *-----------------------------------------------------------------------*/

 /*---------------------------------------------------------
 * f8.c
 *---------------------------------------------------------*/

 /*
 #include "f8.h"
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 */

 /* f8.
  * Encrypts or decrypts data in place by XORing it with SNOW 3G keystream.
  * Input key: 128 bit Confidentiality Key (16 bytes).
  * Input count:32-bit Count, Frame dependent input.
  * Input bearer: 5-bit Bearer identity (in the LSB side).
  * Input dir:1 bit, direction of transmission.
  * Input data: length number of bits, input bit stream.
  * Input length: 32 bit Length, i.e., the number of bits to be encrypted or
  * decrypted.
  * Output data: Output bit stream, written over the input.
  * Only the first ceil(length/8) bytes of data are read or written, and
  * the unused low-order bits of the last byte are cleared. A length of 0
  * returns immediately without touching data or the cipher state.
  * The keystream is held in a stack array of ceil(length/32) words.
  */
 void snow_3g_f8(u8 *key, u32 count, u32 bearer, u32 dir, u8 *data, u32 length)
 {
 	u32 K[4], IV[4];
 	u32 i;
 	/* Number of unused low-order bits in the final byte (0 if aligned). */
 	int lastbits = (8 - (length % 8)) % 8;
 	/* Rounded-up sizes, written so that no addition on length can wrap. */
 	u32 nwords = length / 32 + ((length % 32) ? 1 : 0);
 	u32 nbytes = length / 8 + ((length % 8) ? 1 : 0);

 	/* Nothing to do; this also keeps the array below from having size 0. */
 	if (length == 0)
 		return;

 	u32 KS[nwords];

 	/* Load the confidentiality key for SNOW 3G initialization as in section
 	3.4. Bytes are widened before shifting so bit 31 is never shifted
 	into a signed int. */
 	for (i = 0; i < 4; i++)
 		K[3 - i] = ((u32)key[4 * i] << 24) ^ ((u32)key[4 * i + 1] << 16)
 			 ^ ((u32)key[4 * i + 2] << 8) ^ ((u32)key[4 * i + 3]);

 	/* Prepare the initialization vector (IV) for SNOW 3G initialization as in
 	section 3.4. */
 	IV[3] = count;
 	IV[2] = (bearer << 27) | ((dir & 0x1) << 26);
 	IV[1] = IV[3];
 	IV[0] = IV[2];

 	/* Run SNOW 3G algorithm to generate sequence of key stream bits KS*/
 	snow_3g_initialize(K, IV);
 	snow_3g_generate_key_stream(nwords, KS);

 	/* Exclusive-OR the input data with keystream, byte by byte, taking
 	each keystream word most significant byte first. Stopping at nbytes
 	keeps the loop inside the message when length is not a multiple of
 	32 bits. */
 	for (i = 0; i < nbytes; i++)
 		data[i] ^= (u8)(KS[i / 4] >> (24 - 8 * (i % 4)));

 	/* zero last bits of data in case its length is not byte-aligned
 	   this is an addition to the C reference code, which did not handle it */
 	if (lastbits)
 		data[length / 8] &= 256 - (1 << lastbits);
 }
 /* End of f8.c */

 /*---------------------------------------------------------
  *					f9.c
  *---------------------------------------------------------*/

 /* MUL64x.
  * Shifts V left by one bit; when bit 63 of V was set, the constant c is
  * XORed into the shifted value. The result is returned by value.
  * Input V: a 64-bit input.
  * Input c: a 64-bit input.
  * Output : a 64-bit output.
  * See section 4.3.2 for details.
  */
 static u64 MUL64x(u64 V, u64 c)
 {
 	u64 shifted = V << 1;

 	return (V & ((u64)1 << 63)) ? (shifted ^ c) : shifted;
 }

 /* MUL64xPOW.
  * Applies MUL64x to V exactly i times with the constant c (i == 0 returns
  * V unchanged). The result is returned by value.
  * Input V: a 64-bit input.
  * Input i: a non-negative integer.
  * Input c: a 64-bit input.
  * Output : a 64-bit output.
  * See section 4.3.3 for details.
  */
 static u64 MUL64xPOW(u64 V, u8 i, u64 c)
 {
 	u64 acc = V;
 	u8 step;

 	for (step = 0; step < i; step++)
 		acc = MUL64x(acc, c);
 	return acc;
 }

 /* MUL64.
  * XORs together MUL64xPOW(V, i, c) for every bit position i (0..63) that
  * is set in P. The result is returned by value.
  * Input V: a 64-bit input.
  * Input P: a 64-bit input.
  * Input c: a 64-bit input.
  * Output : a 64-bit output.
  * See section 4.3.4 for details.
  */
 static u64 MUL64(u64 V, u64 P, u64 c)
 {
 	u64 result = 0;
 	u64 power = V;		/* equals MUL64xPOW(V, reached, c) */
 	int reached = 0;
 	int i;

 	for (i = 0; i < 64; i++) {
 		if (!((P >> i) & 0x1))
 			continue;
 		/* Advance the running value from the previous set bit to bit i
 		 * instead of recomputing all i steps from V each time. */
 		power = MUL64xPOW(power, (u8)(i - reached), c);
 		reached = i;
 		result ^= power;
 	}
 	return result;
 }

 /* mask8bit.
  * Input n: an integer in 1-7.
  * Output : an 8 bit mask.
  * Prepares an 8 bit mask with required number of 1 bits on the MSB side.
  */
 static u8 mask8bit(int n)
 {
 	return 0xFF ^ ((1<<(8-n)) - 1);
 }

 /* f9.
  * Input key: 128 bit Integrity Key (16 bytes).
  * Input count:32-bit Count, Frame dependent input.
  * Input fresh: 32-bit Random number.
  * Input dir:1 bit, direction of transmission (in the LSB).
  * Input data: length number of bits, input bit stream.
  * Input length: 64 bit Length, i.e., the number of bits to be MAC'd.
  * Output  : 32 bit block used as MAC, written to out[0..3] most
  *           significant byte first.
  * Generates 32-bit MAC using UIA2 algorithm as defined in Section 4.
  * Only the first ceil(length/8) bytes of data are read; with length == 0
  * no data byte is read at all.
  */
 void snow_3g_f9(u8* key, u32 count, u32 fresh, u32 dir, u8 *data, u64 length,
         u8 *out)
 {
 	u32 K[4], IV[4], z[5];
 	u32 i;
 	u64 EVAL = 0;
 	u64 P;
 	u64 Q;
 	const u64 c = 0x1b;
 	/* Complete 64-bit message blocks, and bits left over after them. */
 	u64 full_blocks = length >> 6;
 	int rem_bits = (int)(length & 63);
 	u64 blk;

 	/* Load the Integrity Key for SNOW3G initialization as in section 4.4.
 	   Bytes are widened before shifting so bit 31 is never shifted into
 	   a signed int. */
 	for (i = 0; i < 4; i++)
 	{
 		K[3 - i] = ((u32)key[4 * i] << 24) ^ ((u32)key[4 * i + 1] << 16) ^
 			   ((u32)key[4 * i + 2] << 8) ^ ((u32)key[4 * i + 3]);
 	}

 	/* Prepare the Initialization Vector (IV) for SNOW3G initialization as
 	   in section 4.4. */
 	IV[3] = count;
 	IV[2] = fresh;
 	IV[1] = count ^ (dir << 31);
 	IV[0] = fresh ^ (dir << 15);

 	z[0] = z[1] = z[2] = z[3] = z[4] = 0;

 	/* Run SNOW 3G to produce 5 keystream words z_1, z_2, z_3, z_4 and z_5. */
 	snow_3g_initialize(K, IV);
 	snow_3g_generate_key_stream(5, z);

 	P = (u64)z[0] << 32 | (u64)z[1];
 	Q = (u64)z[2] << 32 | (u64)z[3];

 	/* Every complete 64-bit block, loaded big-endian. */
 	for (blk = 0; blk < full_blocks; blk++)
 	{
 		const u8 *m = data + 8 * blk;
 		u64 M = (u64)m[0] << 56 | (u64)m[1] << 48 |
 			(u64)m[2] << 40 | (u64)m[3] << 32 |
 			(u64)m[4] << 24 | (u64)m[5] << 16 |
 			(u64)m[6] <<  8 | (u64)m[7];

 		EVAL = MUL64(EVAL ^ M, P, c);
 	}

 	/* Trailing partial block, if any: whole bytes first, then the
 	   remaining 1-7 bits taken from the MSB side of the next byte.
 	   The rest of the block is zero. */
 	if (rem_bits > 0)
 	{
 		const u8 *tail = data + 8 * full_blocks;
 		u64 M = 0;

 		i = 0;
 		while (rem_bits > 7)
 		{
 			M |= (u64)tail[i] << (8 * (7 - i));
 			rem_bits -= 8;
 			i++;
 		}
 		if (rem_bits > 0)
 			M |= (u64)(tail[i] & mask8bit(rem_bits)) << (8 * (7 - i));

 		EVAL = MUL64(EVAL ^ M, P, c);
 	}

 	/* Final block: the bit length itself. */
 	EVAL ^= length;

 	/* Multiply by Q */
 	EVAL = MUL64(EVAL, Q, c);

 	/* XOR with z_5: this is a modification to the reference C code,
 	   which forgot to XOR z[5]. The MAC is the top 32 bits of EVAL
 	   XORed with z[4]. */
 	for (i = 0; i < 4; i++)
 		out[i] = ((EVAL >> (56 - (i * 8))) ^ (z[4] >> (24 - (i * 8)))) & 0xff;
 }

 /* End of f9.c */
 /*------------------------------------------------------------------------*/
	/*------------------------------------------------------------------------
	* SNOW_3G.c
	*------------------------------------------------------------------------*/

	#include "snow-3g.h"

	/* LFSR state: the sixteen 32-bit stages S0..S15.
	 * File-scope and shared by every function below, so only one cipher
	 * state exists at a time. Loaded by snow_3g_initialize() and advanced
	 * by the two ClockLFSR* functions. */

	static u32 LFSR_S0 = 0x00;
	static u32 LFSR_S1 = 0x00;
	static u32 LFSR_S2 = 0x00;
	static u32 LFSR_S3 = 0x00;
	static u32 LFSR_S4 = 0x00;
	static u32 LFSR_S5 = 0x00;
	static u32 LFSR_S6 = 0x00;
	static u32 LFSR_S7 = 0x00;
	static u32 LFSR_S8 = 0x00;
	static u32 LFSR_S9 = 0x00;
	static u32 LFSR_S10 = 0x00;
	static u32 LFSR_S11 = 0x00;
	static u32 LFSR_S12 = 0x00;
	static u32 LFSR_S13 = 0x00;
	static u32 LFSR_S14 = 0x00;
	static u32 LFSR_S15 = 0x00;

	/* FSM state: the three 32-bit registers R1, R2, R3.
	 * Cleared by snow_3g_initialize() and updated by ClockFSM(). */

	static u32 FSM_R1 = 0x00;
	static u32 FSM_R2 = 0x00;
	static u32 FSM_R3 = 0x00;

	/* Rijndael S-box SR: 256-entry byte substitution table.
	 * S1() looks up each byte of its 32-bit input here. */

	static const u8 SR[256] = {
	0x63,0x7C,0x77,0x7B,0xF2,0x6B,0x6F,0xC5,0x30,0x01,0x67,0x2B,0xFE,0xD7,0xAB,0x76,
	0xCA,0x82,0xC9,0x7D,0xFA,0x59,0x47,0xF0,0xAD,0xD4,0xA2,0xAF,0x9C,0xA4,0x72,0xC0,
	0xB7,0xFD,0x93,0x26,0x36,0x3F,0xF7,0xCC,0x34,0xA5,0xE5,0xF1,0x71,0xD8,0x31,0x15,
	0x04,0xC7,0x23,0xC3,0x18,0x96,0x05,0x9A,0x07,0x12,0x80,0xE2,0xEB,0x27,0xB2,0x75,
	0x09,0x83,0x2C,0x1A,0x1B,0x6E,0x5A,0xA0,0x52,0x3B,0xD6,0xB3,0x29,0xE3,0x2F,0x84,
	0x53,0xD1,0x00,0xED,0x20,0xFC,0xB1,0x5B,0x6A,0xCB,0xBE,0x39,0x4A,0x4C,0x58,0xCF,
	0xD0,0xEF,0xAA,0xFB,0x43,0x4D,0x33,0x85,0x45,0xF9,0x02,0x7F,0x50,0x3C,0x9F,0xA8,
	0x51,0xA3,0x40,0x8F,0x92,0x9D,0x38,0xF5,0xBC,0xB6,0xDA,0x21,0x10,0xFF,0xF3,0xD2,
	0xCD,0x0C,0x13,0xEC,0x5F,0x97,0x44,0x17,0xC4,0xA7,0x7E,0x3D,0x64,0x5D,0x19,0x73,
	0x60,0x81,0x4F,0xDC,0x22,0x2A,0x90,0x88,0x46,0xEE,0xB8,0x14,0xDE,0x5E,0x0B,0xDB,
	0xE0,0x32,0x3A,0x0A,0x49,0x06,0x24,0x5C,0xC2,0xD3,0xAC,0x62,0x91,0x95,0xE4,0x79,
	0xE7,0xC8,0x37,0x6D,0x8D,0xD5,0x4E,0xA9,0x6C,0x56,0xF4,0xEA,0x65,0x7A,0xAE,0x08,
	0xBA,0x78,0x25,0x2E,0x1C,0xA6,0xB4,0xC6,0xE8,0xDD,0x74,0x1F,0x4B,0xBD,0x8B,0x8A,
	0x70,0x3E,0xB5,0x66,0x48,0x03,0xF6,0x0E,0x61,0x35,0x57,0xB9,0x86,0xC1,0x1D,0x9E,
	0xE1,0xF8,0x98,0x11,0x69,0xD9,0x8E,0x94,0x9B,0x1E,0x87,0xE9,0xCE,0x55,0x28,0xDF,
	0x8C,0xA1,0x89,0x0D,0xBF,0xE6,0x42,0x68,0x41,0x99,0x2D,0x0F,0xB0,0x54,0xBB,0x16
	};

	/* S-box SQ: 256-entry byte substitution table.
	 * S2() looks up each byte of its 32-bit input here. */

	static const u8 SQ[256] = {
	0x25,0x24,0x73,0x67,0xD7,0xAE,0x5C,0x30,0xA4,0xEE,0x6E,0xCB,0x7D,0xB5,0x82,0xDB,
	0xE4,0x8E,0x48,0x49,0x4F,0x5D,0x6A,0x78,0x70,0x88,0xE8,0x5F,0x5E,0x84,0x65,0xE2,
	0xD8,0xE9,0xCC,0xED,0x40,0x2F,0x11,0x28,0x57,0xD2,0xAC,0xE3,0x4A,0x15,0x1B,0xB9,
	0xB2,0x80,0x85,0xA6,0x2E,0x02,0x47,0x29,0x07,0x4B,0x0E,0xC1,0x51,0xAA,0x89,0xD4,
	0xCA,0x01,0x46,0xB3,0xEF,0xDD,0x44,0x7B,0xC2,0x7F,0xBE,0xC3,0x9F,0x20,0x4C,0x64,
	0x83,0xA2,0x68,0x42,0x13,0xB4,0x41,0xCD,0xBA,0xC6,0xBB,0x6D,0x4D,0x71,0x21,0xF4,
	0x8D,0xB0,0xE5,0x93,0xFE,0x8F,0xE6,0xCF,0x43,0x45,0x31,0x22,0x37,0x36,0x96,0xFA,
	0xBC,0x0F,0x08,0x52,0x1D,0x55,0x1A,0xC5,0x4E,0x23,0x69,0x7A,0x92,0xFF,0x5B,0x5A,
	0xEB,0x9A,0x1C,0xA9,0xD1,0x7E,0x0D,0xFC,0x50,0x8A,0xB6,0x62,0xF5,0x0A,0xF8,0xDC,
	0x03,0x3C,0x0C,0x39,0xF1,0xB8,0xF3,0x3D,0xF2,0xD5,0x97,0x66,0x81,0x32,0xA0,0x00,
	0x06,0xCE,0xF6,0xEA,0xB7,0x17,0xF7,0x8C,0x79,0xD6,0xA7,0xBF,0x8B,0x3F,0x1F,0x53,
	0x63,0x75,0x35,0x2C,0x60,0xFD,0x27,0xD3,0x94,0xA5,0x7C,0xA1,0x05,0x58,0x2D,0xBD,
	0xD9,0xC7,0xAF,0x6B,0x54,0x0B,0xE0,0x38,0x04,0xC8,0x9D,0xE7,0x14,0xB1,0x87,0x9C,
	0xDF,0x6F,0xF9,0xDA,0x2A,0xC4,0x59,0x16,0x74,0x91,0xAB,0x26,0x61,0x76,0x34,0x2B,
	0xAD,0x99,0xFB,0x72,0xEC,0x33,0x12,0xDE,0x98,0x3B,0xC0,0x9B,0x3E,0x18,0x10,0x3A,
	0x56,0xE1,0x77,0xC9,0x1E,0x9E,0x95,0xA3,0x90,0x19,0xA8,0x6C,0x09,0xD0,0xF0,0x86
	};

	/* MULx.
	 * Shifts V left by one bit; when bit 7 of V was set, the constant c is
	 * XORed into the shifted value.
	 * Input V: an 8-bit input.
	 * Input c: an 8-bit input.
	 * Output : an 8-bit output.
	 * See section 3.1.1 for details.
	 */
	static u8 MULx(u8 V, u8 c)
	{
		u8 shifted = (u8)(V << 1);

		return (V & 0x80) ? (u8)(shifted ^ c) : shifted;
	}

	/* MULxPOW.
	 * Applies MULx to V exactly i times with the constant c (i == 0 returns V
	 * unchanged).
	 * Input V: an 8-bit input.
	 * Input i: a non-negative integer.
	 * Input c: an 8-bit input.
	 * Output : an 8-bit output.
	 * See section 3.1.2 for details.
	 */
	static u8 MULxPOW(u8 V, u8 i, u8 c)
	{
		u8 acc = V;
		u8 step;

		for (step = 0; step < i; step++)
			acc = MULx(acc, c);
		return acc;
	}

	/* The function MUL alpha.
	 * Builds a 32-bit word from four MULxPOW results of c (constant 0xa9),
	 * using the exponents 23, 245, 48 and 239 from most to least significant
	 * byte.
	 * Input c: 8-bit input.
	 * Output : 32-bit output.
	 * See section 3.4.2 for details.
	 */
	static u32 MULalpha(u8 c)
	{
		u32 byte3 = MULxPOW(c, 23, 0xa9);
		u32 byte2 = MULxPOW(c, 245, 0xa9);
		u32 byte1 = MULxPOW(c, 48, 0xa9);
		u32 byte0 = MULxPOW(c, 239, 0xa9);

		return (byte3 << 24) | (byte2 << 16) | (byte1 << 8) | byte0;
	}

	/* The function DIV alpha.
	 * Builds a 32-bit word from four MULxPOW results of c (constant 0xa9),
	 * using the exponents 16, 39, 6 and 64 from most to least significant
	 * byte.
	 * Input c: 8-bit input.
	 * Output : 32-bit output.
	 * See section 3.4.3 for details.
	 */
	static u32 DIValpha(u8 c)
	{
		u32 byte3 = MULxPOW(c, 16, 0xa9);
		u32 byte2 = MULxPOW(c, 39, 0xa9);
		u32 byte1 = MULxPOW(c, 6, 0xa9);
		u32 byte0 = MULxPOW(c, 64, 0xa9);

		return (byte3 << 24) | (byte2 << 16) | (byte1 << 8) | byte0;
	}

	/* The 32x32-bit S-Box S1
	 * Each byte of w is substituted through SR, then the four substituted
	 * bytes are mixed with XORs and MULx (constant 0x1b) into four output
	 * bytes.
	 * Input: a 32-bit input.
	 * Output: a 32-bit output of S1 box.
	 * See section 3.3.1.
	 */
	static u32 S1(u32 w)
	{
		/* Substituted bytes, most significant byte of w first. */
		const u8 a = SR[(u8)(w >> 24)];
		const u8 b = SR[(u8)(w >> 16)];
		const u8 c = SR[(u8)(w >> 8)];
		const u8 d = SR[(u8)w];

		/* MULx of each substituted byte, computed once and reused. */
		const u8 a2 = MULx(a, 0x1b);
		const u8 b2 = MULx(b, 0x1b);
		const u8 c2 = MULx(c, 0x1b);
		const u8 d2 = MULx(d, 0x1b);

		const u8 out0 = a2 ^ b ^ c ^ (d2 ^ d);
		const u8 out1 = (a2 ^ a) ^ b2 ^ c ^ d;
		const u8 out2 = a ^ (b2 ^ b) ^ c2 ^ d;
		const u8 out3 = a ^ b ^ (c2 ^ c) ^ d2;

		return ((u32)out0 << 24) | ((u32)out1 << 16) |
		       ((u32)out2 << 8) | (u32)out3;
	}

	/* The 32x32-bit S-Box S2
	 * Each byte of w is substituted through SQ, then the four substituted
	 * bytes are mixed with XORs and MULx (constant 0x69) into four output
	 * bytes.
	 * Input: a 32-bit input.
	 * Output: a 32-bit output of S2 box.
	 * See section 3.3.2.
	 */
	static u32 S2(u32 w)
	{
		/* Substituted bytes, most significant byte of w first. */
		const u8 a = SQ[(u8)(w >> 24)];
		const u8 b = SQ[(u8)(w >> 16)];
		const u8 c = SQ[(u8)(w >> 8)];
		const u8 d = SQ[(u8)w];

		/* MULx of each substituted byte, computed once and reused. */
		const u8 a2 = MULx(a, 0x69);
		const u8 b2 = MULx(b, 0x69);
		const u8 c2 = MULx(c, 0x69);
		const u8 d2 = MULx(d, 0x69);

		const u8 out0 = a2 ^ b ^ c ^ (d2 ^ d);
		const u8 out1 = (a2 ^ a) ^ b2 ^ c ^ d;
		const u8 out2 = a ^ (b2 ^ b) ^ c2 ^ d;
		const u8 out3 = a ^ b ^ (c2 ^ c) ^ d2;

		return ((u32)out0 << 24) | ((u32)out1 << 16) |
		       ((u32)out2 << 8) | (u32)out3;
	}

	/* Clocking LFSR in initialization mode.
	 * Computes the feedback word from S0, S2, S11 and the FSM output F, moves
	 * every stage down by one (S0 <- S1, ..., S14 <- S15) and stores the
	 * feedback word in S15.
	 * Input F: a 32-bit word comes from output of FSM.
	 * See section 3.4.4.
	 */
	static void ClockLFSRInitializationMode(u32 F)
	{
		u32 s0_shifted = (LFSR_S0 << 8) & 0xffffff00;
		u32 s0_alpha = MULalpha((u8)((LFSR_S0 >> 24) & 0xff));
		u32 s11_shifted = (LFSR_S11 >> 8) & 0x00ffffff;
		u32 s11_alpha = DIValpha((u8)(LFSR_S11 & 0xff));
		u32 feedback = s0_shifted ^ s0_alpha ^ LFSR_S2 ^
			       s11_shifted ^ s11_alpha ^ F;

		/* Shift order matters: each stage takes the old value of the next. */
		LFSR_S0 = LFSR_S1;
		LFSR_S1 = LFSR_S2;
		LFSR_S2 = LFSR_S3;
		LFSR_S3 = LFSR_S4;
		LFSR_S4 = LFSR_S5;
		LFSR_S5 = LFSR_S6;
		LFSR_S6 = LFSR_S7;
		LFSR_S7 = LFSR_S8;
		LFSR_S8 = LFSR_S9;
		LFSR_S9 = LFSR_S10;
		LFSR_S10 = LFSR_S11;
		LFSR_S11 = LFSR_S12;
		LFSR_S12 = LFSR_S13;
		LFSR_S13 = LFSR_S14;
		LFSR_S14 = LFSR_S15;
		LFSR_S15 = feedback;
	}

	/* Clocking LFSR in keystream mode.
	 * Computes the feedback word from S0, S2 and S11 (no FSM input), moves
	 * every stage down by one (S0 <- S1, ..., S14 <- S15) and stores the
	 * feedback word in S15.
	 * See section 3.4.5.
	 */
	static void ClockLFSRKeyStreamMode()
	{
		u32 s0_shifted = (LFSR_S0 << 8) & 0xffffff00;
		u32 s0_alpha = MULalpha((u8)((LFSR_S0 >> 24) & 0xff));
		u32 s11_shifted = (LFSR_S11 >> 8) & 0x00ffffff;
		u32 s11_alpha = DIValpha((u8)(LFSR_S11 & 0xff));
		u32 feedback = s0_shifted ^ s0_alpha ^ LFSR_S2 ^
			       s11_shifted ^ s11_alpha;

		/* Shift order matters: each stage takes the old value of the next. */
		LFSR_S0 = LFSR_S1;
		LFSR_S1 = LFSR_S2;
		LFSR_S2 = LFSR_S3;
		LFSR_S3 = LFSR_S4;
		LFSR_S4 = LFSR_S5;
		LFSR_S5 = LFSR_S6;
		LFSR_S6 = LFSR_S7;
		LFSR_S7 = LFSR_S8;
		LFSR_S8 = LFSR_S9;
		LFSR_S9 = LFSR_S10;
		LFSR_S10 = LFSR_S11;
		LFSR_S11 = LFSR_S12;
		LFSR_S12 = LFSR_S13;
		LFSR_S13 = LFSR_S14;
		LFSR_S14 = LFSR_S15;
		LFSR_S15 = feedback;
	}

	/* Clocking FSM.
	 * Produces the 32-bit word F = (S15 + R1) ^ R2 from the current state,
	 * then updates R1, R2, R3; all three new values are derived from the
	 * register contents as they were before the call.
	 * See Section 3.4.6.
	 */
	static u32 ClockFSM()
	{
		u32 out_word = ((LFSR_S15 + FSM_R1) & 0xffffffff) ^ FSM_R2;
		u32 next_r1 = (FSM_R2 + (FSM_R3 ^ LFSR_S5)) & 0xffffffff;
		u32 next_r2 = S1(FSM_R1);
		u32 next_r3 = S2(FSM_R2);

		FSM_R1 = next_r1;
		FSM_R2 = next_r2;
		FSM_R3 = next_r3;
		return out_word;
	}

	/* Initialization.
	 * Loads the LFSR stages from the key and IV words, clears the FSM
	 * registers, then runs 32 rounds in which the FSM output is fed back
	 * into the LFSR.
	 * Input k[4]: Four 32-bit words making up 128-bit key.
	 * Input IV[4]: Four 32-bit words making 128-bit initialization variable.
	 * Output: All the LFSRs and FSM are initialized for key generation.
	 * See Section 4.1.
	 */
	void snow_3g_initialize(u32 k[4], u32 IV[4])
	{
		const u32 ones = 0xffffffff;
		int round;

		/* S0..S3: inverted key words. */
		LFSR_S0 = k[0] ^ ones;
		LFSR_S1 = k[1] ^ ones;
		LFSR_S2 = k[2] ^ ones;
		LFSR_S3 = k[3] ^ ones;

		/* S4..S7: plain key words. */
		LFSR_S4 = k[0];
		LFSR_S5 = k[1];
		LFSR_S6 = k[2];
		LFSR_S7 = k[3];

		/* S8..S11: inverted key words, with IV[3] and IV[2] mixed in. */
		LFSR_S8 = k[0] ^ ones;
		LFSR_S9 = k[1] ^ ones ^ IV[3];
		LFSR_S10 = k[2] ^ ones ^ IV[2];
		LFSR_S11 = k[3] ^ ones;

		/* S12..S15: plain key words, with IV[1] and IV[0] mixed in. */
		LFSR_S12 = k[0] ^ IV[1];
		LFSR_S13 = k[1];
		LFSR_S14 = k[2];
		LFSR_S15 = k[3] ^ IV[0];

		FSM_R1 = 0x0;
		FSM_R2 = 0x0;
		FSM_R3 = 0x0;

		for (round = 0; round < 32; round++)
			ClockLFSRInitializationMode(ClockFSM());
	}

	/* Generation of Keystream.
	 * Clocks the FSM and LFSR once with the output discarded, then produces
	 * n keystream words. snow_3g_initialize() must have set up the state.
	 * input n: number of 32-bit words of keystream.
	 * input ks: space for the generated keystream, assumes
	 * memory is allocated already.
	 * output: generated keystream which is filled in ks
	 * See section 4.2.
	 */
	void snow_3g_generate_key_stream(u32 n, u32 *ks)
	{
		u32 word;

		/* One discarded clock of FSM and LFSR before any output. */
		(void)ClockFSM();
		ClockLFSRKeyStreamMode();

		/* ks[word] corresponds to z_{word+1} in section 4.2. */
		for (word = 0; word != n; ++word) {
			u32 fsm_out = ClockFSM();

			ks[word] = fsm_out ^ LFSR_S0;
			ClockLFSRKeyStreamMode();
		}
	}

	/*-----------------------------------------------------------------------
	* end of SNOW_3G.c
	*-----------------------------------------------------------------------*/

	/*---------------------------------------------------------
	* f8.c
	*---------------------------------------------------------*/

	/*
	#include "f8.h"
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	*/

	/* f8.
	 * Encrypts or decrypts data in place by XORing it with SNOW 3G keystream.
	 * Input key: 128 bit Confidentiality Key (16 bytes).
	 * Input count:32-bit Count, Frame dependent input.
	 * Input bearer: 5-bit Bearer identity (in the LSB side).
	 * Input dir:1 bit, direction of transmission.
	 * Input data: length number of bits, input bit stream.
	 * Input length: 32 bit Length, i.e., the number of bits to be encrypted or
	 * decrypted.
	 * Output data: Output bit stream, written over the input.
	 * Only the first ceil(length/8) bytes of data are read or written, and
	 * the unused low-order bits of the last byte are cleared. A length of 0
	 * returns immediately without touching data or the cipher state.
	 * The keystream is held in a stack array of ceil(length/32) words.
	 */
	void snow_3g_f8(u8 *key, u32 count, u32 bearer, u32 dir, u8 *data, u32 length)
	{
		u32 K[4], IV[4];
		u32 i;
		/* Number of unused low-order bits in the final byte (0 if aligned). */
		int lastbits = (8 - (length % 8)) % 8;
		/* Rounded-up sizes, written so that no addition on length can wrap. */
		u32 nwords = length / 32 + ((length % 32) ? 1 : 0);
		u32 nbytes = length / 8 + ((length % 8) ? 1 : 0);

		/* Nothing to do; this also keeps the array below from having size 0. */
		if (length == 0)
			return;

		u32 KS[nwords];

		/* Load the confidentiality key for SNOW 3G initialization as in section
		3.4. Bytes are widened before shifting so bit 31 is never shifted
		into a signed int. */
		for (i = 0; i < 4; i++)
			K[3 - i] = ((u32)key[4 * i] << 24) ^ ((u32)key[4 * i + 1] << 16)
				 ^ ((u32)key[4 * i + 2] << 8) ^ ((u32)key[4 * i + 3]);

		/* Prepare the initialization vector (IV) for SNOW 3G initialization as in
		section 3.4. */
		IV[3] = count;
		IV[2] = (bearer << 27) | ((dir & 0x1) << 26);
		IV[1] = IV[3];
		IV[0] = IV[2];

		/* Run SNOW 3G algorithm to generate sequence of key stream bits KS*/
		snow_3g_initialize(K, IV);
		snow_3g_generate_key_stream(nwords, KS);

		/* Exclusive-OR the input data with keystream, byte by byte, taking
		each keystream word most significant byte first. Stopping at nbytes
		keeps the loop inside the message when length is not a multiple of
		32 bits. */
		for (i = 0; i < nbytes; i++)
			data[i] ^= (u8)(KS[i / 4] >> (24 - 8 * (i % 4)));

		/* zero last bits of data in case its length is not byte-aligned
		this is an addition to the C reference code, which did not handle it */
		if (lastbits)
			data[length / 8] &= 256 - (1 << lastbits);
	}
	/* End of f8.c */

	/*---------------------------------------------------------
	* f9.c
	*---------------------------------------------------------*/

	/* MUL64x.
	 * Shifts V left by one bit; when bit 63 of V was set, the constant c is
	 * XORed into the shifted value. The result is returned by value.
	 * Input V: a 64-bit input.
	 * Input c: a 64-bit input.
	 * Output : a 64-bit output.
	 * See section 4.3.2 for details.
	 */
	static u64 MUL64x(u64 V, u64 c)
	{
		u64 shifted = V << 1;

		return (V & ((u64)1 << 63)) ? (shifted ^ c) : shifted;
	}

	/* MUL64xPOW.
	 * Applies MUL64x to V exactly i times with the constant c (i == 0 returns
	 * V unchanged). The result is returned by value.
	 * Input V: a 64-bit input.
	 * Input i: a non-negative integer.
	 * Input c: a 64-bit input.
	 * Output : a 64-bit output.
	 * See section 4.3.3 for details.
	 */
	static u64 MUL64xPOW(u64 V, u8 i, u64 c)
	{
		u64 acc = V;
		u8 step;

		for (step = 0; step < i; step++)
			acc = MUL64x(acc, c);
		return acc;
	}

	/* MUL64.
	 * XORs together MUL64xPOW(V, i, c) for every bit position i (0..63) that
	 * is set in P. The result is returned by value.
	 * Input V: a 64-bit input.
	 * Input P: a 64-bit input.
	 * Input c: a 64-bit input.
	 * Output : a 64-bit output.
	 * See section 4.3.4 for details.
	 */
	static u64 MUL64(u64 V, u64 P, u64 c)
	{
		u64 result = 0;
		u64 power = V;		/* equals MUL64xPOW(V, reached, c) */
		int reached = 0;
		int i;

		for (i = 0; i < 64; i++) {
			if (!((P >> i) & 0x1))
				continue;
			/* Advance the running value from the previous set bit to bit i
			 * instead of recomputing all i steps from V each time. */
			power = MUL64xPOW(power, (u8)(i - reached), c);
			reached = i;
			result ^= power;
		}
		return result;
	}

	/* mask8bit.
	* Input n: an integer in 1-7.
	* Output : an 8 bit mask.
	* Prepares an 8 bit mask with required number of 1 bits on the MSB side.
	*/
	static u8 mask8bit(int n)
	{
	return 0xFF ^ ((1<<(8-n)) - 1);
	}

	/* f9.
	 * Input key: 128 bit Integrity Key (16 bytes).
	 * Input count:32-bit Count, Frame dependent input.
	 * Input fresh: 32-bit Random number.
	 * Input dir:1 bit, direction of transmission (in the LSB).
	 * Input data: length number of bits, input bit stream.
	 * Input length: 64 bit Length, i.e., the number of bits to be MAC'd.
	 * Output : 32 bit block used as MAC, written to out[0..3] most
	 *          significant byte first.
	 * Generates 32-bit MAC using UIA2 algorithm as defined in Section 4.
	 * Only the first ceil(length/8) bytes of data are read; with length == 0
	 * no data byte is read at all.
	 */
	void snow_3g_f9(u8* key, u32 count, u32 fresh, u32 dir, u8 *data, u64 length,
	u8 *out)
	{
		u32 K[4], IV[4], z[5];
		u32 i;
		u64 EVAL = 0;
		u64 P;
		u64 Q;
		const u64 c = 0x1b;
		/* Complete 64-bit message blocks, and bits left over after them. */
		u64 full_blocks = length >> 6;
		int rem_bits = (int)(length & 63);
		u64 blk;

		/* Load the Integrity Key for SNOW3G initialization as in section 4.4.
		Bytes are widened before shifting so bit 31 is never shifted into
		a signed int. */
		for (i = 0; i < 4; i++)
		{
			K[3 - i] = ((u32)key[4 * i] << 24) ^ ((u32)key[4 * i + 1] << 16) ^
				   ((u32)key[4 * i + 2] << 8) ^ ((u32)key[4 * i + 3]);
		}

		/* Prepare the Initialization Vector (IV) for SNOW3G initialization as
		in section 4.4. */
		IV[3] = count;
		IV[2] = fresh;
		IV[1] = count ^ (dir << 31);
		IV[0] = fresh ^ (dir << 15);

		z[0] = z[1] = z[2] = z[3] = z[4] = 0;

		/* Run SNOW 3G to produce 5 keystream words z_1, z_2, z_3, z_4 and z_5. */
		snow_3g_initialize(K, IV);
		snow_3g_generate_key_stream(5, z);

		P = (u64)z[0] << 32 | (u64)z[1];
		Q = (u64)z[2] << 32 | (u64)z[3];

		/* Every complete 64-bit block, loaded big-endian. */
		for (blk = 0; blk < full_blocks; blk++)
		{
			const u8 *m = data + 8 * blk;
			u64 M = (u64)m[0] << 56 | (u64)m[1] << 48 |
				(u64)m[2] << 40 | (u64)m[3] << 32 |
				(u64)m[4] << 24 | (u64)m[5] << 16 |
				(u64)m[6] <<  8 | (u64)m[7];

			EVAL = MUL64(EVAL ^ M, P, c);
		}

		/* Trailing partial block, if any: whole bytes first, then the
		remaining 1-7 bits taken from the MSB side of the next byte.
		The rest of the block is zero. */
		if (rem_bits > 0)
		{
			const u8 *tail = data + 8 * full_blocks;
			u64 M = 0;

			i = 0;
			while (rem_bits > 7)
			{
				M |= (u64)tail[i] << (8 * (7 - i));
				rem_bits -= 8;
				i++;
			}
			if (rem_bits > 0)
				M |= (u64)(tail[i] & mask8bit(rem_bits)) << (8 * (7 - i));

			EVAL = MUL64(EVAL ^ M, P, c);
		}

		/* Final block: the bit length itself. */
		EVAL ^= length;

		/* Multiply by Q */
		EVAL = MUL64(EVAL, Q, c);

		/* XOR with z_5: this is a modification to the reference C code,
		which forgot to XOR z[5]. The MAC is the top 32 bits of EVAL
		XORed with z[4]. */
		for (i = 0; i < 4; i++)
			out[i] = ((EVAL >> (56 - (i * 8))) ^ (z[4] >> (24 - (i * 8)))) & 0xff;
	}

	/* End of f9.c */
	/*------------------------------------------------------------------------*/