diff -u linux-2.6.8/arch/i386/crypto/aes.c linux-2.6.8.orig/arch/i386/crypto/aes.c
--- linux-2.6.8/arch/i386/crypto/aes.c	2004-09-24 09:42:34.642803416 +0200
+++ linux-2.6.8.orig/arch/i386/crypto/aes.c	2004-08-14 07:36:59.000000000 +0200
@@ -2,92 +2,507 @@
  * 
  * Glue Code for optimized 586 assembler version of AES
  *
- * Copyright (c) 2001, Dr Brian Gladman <brg@gladman.uk.net>, Worcester, UK.
+ * Copyright (c) 2002, Dr Brian Gladman <>, Worcester, UK.
+ * All rights reserved.
+ *
+ * LICENSE TERMS
+ *
+ * The free distribution and use of this software in both source and binary
+ * form is allowed (with or without changes) provided that:
+ *
+ *   1. distributions of this source code include the above copyright
+ *      notice, this list of conditions and the following disclaimer;
+ *
+ *   2. distributions in binary form include the above copyright
+ *      notice, this list of conditions and the following disclaimer
+ *      in the documentation and/or other associated materials;
+ *
+ *   3. the copyright holder's name is not used to endorse products
+ *      built using this software without specific written permission.
+ *
+ * ALTERNATIVELY, provided that this notice is retained in full, this product
+ * may be distributed under the terms of the GNU General Public License (GPL),
+ * in which case the provisions of the GPL apply INSTEAD OF those given above.
+ *
+ * DISCLAIMER
+ *
+ * This software is provided 'as is' with no explicit or implied warranties
+ * in respect of its properties, including, but not limited to, correctness
+ * and/or fitness for purpose.
+ *
  * Copyright (c) 2003, Adam J. Richter <adam@yggdrasil.com> (conversion to
  * 2.5 API).
  * Copyright (c) 2003, 2004 Fruhwirth Clemens <clemens@endorphin.org>
-*/
-
+ * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
+ *
+ */
+#include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/types.h>
 #include <linux/crypto.h>
 #include <linux/linkage.h>
 
+asmlinkage void aes_enc_blk(const u8 *src, u8 *dst, void *ctx);
+asmlinkage void aes_dec_blk(const u8 *src, u8 *dst, void *ctx);
+
 #define AES_MIN_KEY_SIZE	16
 #define AES_MAX_KEY_SIZE	32
 #define AES_BLOCK_SIZE		16
-#define AES_KS_LENGTH   4 * AES_BLOCK_SIZE
-#define AES_RC_LENGTH   (9 * AES_BLOCK_SIZE) / 8 - 8
+#define AES_KS_LENGTH		(4 * AES_BLOCK_SIZE)	/* u32 words per key schedule; parenthesized so it expands safely in any expression */
+#define RC_LENGTH		29
+
+struct aes_ctx {	/* layout is hard-coded as byte offsets in aes-i586-asm.S */
+	u32 ekey[AES_KS_LENGTH];	/* encryption key schedule (asm offset ekey = 0) */
+	u32 rounds;			/* number of rounds (asm offset nrnd = 256) */
+	u32 dkey[AES_KS_LENGTH];	/* decryption key schedule (asm offset dkey = 260) */
+};
+
+#define WPOLY 0x011b
+#define u32_in(x) le32_to_cpu(*(const u32 *)(x))
+#define bytes2word(b0, b1, b2, b3)  \
+	(((u32)(b3) << 24) | ((u32)(b2) << 16) | ((u32)(b1) << 8) | (b0))
+
+/* define the finite field multiplies required for Rijndael */
+#define f2(x) ((x) ? pow[log[x] + 0x19] : 0)
+#define f3(x) ((x) ? pow[log[x] + 0x01] : 0)
+#define f9(x) ((x) ? pow[log[x] + 0xc7] : 0)
+#define fb(x) ((x) ? pow[log[x] + 0x68] : 0)
+#define fd(x) ((x) ? pow[log[x] + 0xee] : 0)
+#define fe(x) ((x) ? pow[log[x] + 0xdf] : 0)
+#define fi(x) ((x) ?   pow[255 - log[x]]: 0)
 
-typedef struct
+static inline u32 upr(u32 x, int n)
 {
-    u_int32_t	 aes_Nkey;	// the number of words in the key input block
-    u_int32_t	 aes_Nrnd;	// the number of cipher rounds
-    u_int32_t	 aes_e_key[AES_KS_LENGTH];   // the encryption key schedule
-    u_int32_t	 aes_d_key[AES_KS_LENGTH];   // the decryption key schedule
-    u_int32_t	 aes_Ncol;	// the number of columns in the cipher state
-} aes_context;
+	return (x << 8 * n) | (x >> (32 - 8 * n));
+}
 
-/*
- * The Cipher Interface
- */
- 
-asmlinkage void aes_set_key(void *, const unsigned char [], const int, const int);
+static inline u8 bval(u32 x, int n)	/* returns byte n of x; n = 0 is the low byte */
+{
+	return x >> 8 * n;	/* shift right by 8*n bits; the u8 return type keeps the low 8 bits */
+}
 
+/* The forward and inverse affine transformations used in the S-box */
+#define fwd_affine(x) \
+	(w = (u32)x, w ^= (w<<1)^(w<<2)^(w<<3)^(w<<4), 0x63^(u8)(w^(w>>8)))
+
+#define inv_affine(x) \
+	(w = (u32)x, w = (w<<1)^(w<<3)^(w<<6), 0x05^(u8)(w^(w>>8)))
+
+static u32 rcon_tab[RC_LENGTH];	/* round constants, filled in by gen_tabs() */
+
+u32 ft_tab[4][256];	/* normal encryption round (referenced from aes-i586-asm.S) */
+u32 fl_tab[4][256];	/* last encryption round; also used by ls_box() */
+u32 ls_tab[4][256];	/* key schedule table; gen_tabs() fills it with the same values as fl_tab */
+u32 im_tab[4][256];	/* inverse mix column operation, used by inv_mcol() */
+u32 il_tab[4][256];	/* last decryption round */
+u32 it_tab[4][256];	/* normal decryption round */
 
+void gen_tabs(void)
+{
+	u32 i, w;
+	u8 pow[512], log[256];
+
+	/*
+	 * log and power tables for GF(2^8) finite field with
+	 * WPOLY as modular polynomial - the simplest primitive
+	 * root is 0x03, used here to generate the tables.
+	 */
+	i = 0; w = 1; 
+	
+	do {
+		pow[i] = (u8)w;
+		pow[i + 255] = (u8)w;
+		log[w] = (u8)i++;
+		w ^=  (w << 1) ^ (w & 0x80 ? WPOLY : 0);
+	} while (w != 1);
+	
+	for(i = 0, w = 1; i < RC_LENGTH; ++i) {
+		rcon_tab[i] = bytes2word(w, 0, 0, 0);
+		w = f2(w);
+	}
 
-/* Actually:
- * extern void aes_encrypt(const aes_context *, unsigned char [], const unsigned char []);
- * extern void aes_decrypt(const aes_context *, unsigned char [], const unsigned char []);
-*/
- 
-asmlinkage void aes_encrypt(void*, unsigned char [], const unsigned char []);
-asmlinkage void aes_decrypt(void*, unsigned char [], const unsigned char []);
+	for(i = 0; i < 256; ++i) {
+		u8 b;
+		
+		b = fwd_affine(fi((u8)i));
+		w = bytes2word(f2(b), b, b, f3(b));
+
+		/* tables for a normal encryption round */
+		ft_tab[0][i] = w;
+		ft_tab[1][i] = upr(w, 1);
+		ft_tab[2][i] = upr(w, 2);
+		ft_tab[3][i] = upr(w, 3);
+		w = bytes2word(b, 0, 0, 0);
+		
+		/*
+		 * tables for last encryption round
+		 * (may also be used in the key schedule)
+		 */
+		fl_tab[0][i] = w;
+		fl_tab[1][i] = upr(w, 1);
+		fl_tab[2][i] = upr(w, 2);
+		fl_tab[3][i] = upr(w, 3);
+		
+		/*
+		 * table for key schedule if fl_tab above is
+		 * not of the required form
+		 */
+		ls_tab[0][i] = w;
+		ls_tab[1][i] = upr(w, 1);
+		ls_tab[2][i] = upr(w, 2);
+		ls_tab[3][i] = upr(w, 3);
+		
+		b = fi(inv_affine((u8)i));
+		w = bytes2word(fe(b), f9(b), fd(b), fb(b));
+
+		/* tables for the inverse mix column operation  */
+		im_tab[0][b] = w;
+		im_tab[1][b] = upr(w, 1);
+		im_tab[2][b] = upr(w, 2);
+		im_tab[3][b] = upr(w, 3);
+
+		/* tables for a normal decryption round */
+		it_tab[0][i] = w;
+		it_tab[1][i] = upr(w,1);
+		it_tab[2][i] = upr(w,2);
+		it_tab[3][i] = upr(w,3);
+
+		w = bytes2word(b, 0, 0, 0);
+		
+		/* tables for last decryption round */
+		il_tab[0][i] = w;
+		il_tab[1][i] = upr(w,1);
+		il_tab[2][i] = upr(w,2);
+		il_tab[3][i] = upr(w,3);
+    }
+}
 
-static int aes_set_key_glue(void *cx, const u8 *key,unsigned int key_length, u32 *flags)
+#define four_tables(x,tab,vf,rf,c)		\
+(	tab[0][bval(vf(x,0,c),rf(0,c))]	^	\
+	tab[1][bval(vf(x,1,c),rf(1,c))] ^	\
+	tab[2][bval(vf(x,2,c),rf(2,c))] ^	\
+	tab[3][bval(vf(x,3,c),rf(3,c))]		\
+)
+
+#define vf1(x,r,c)  (x)
+#define rf1(r,c)    (r)
+#define rf2(r,c)    ((r-c)&3)
+
+#define inv_mcol(x) four_tables(x,im_tab,vf1,rf1,0)
+#define ls_box(x,c) four_tables(x,fl_tab,vf1,rf2,c)
+
+#define ff(x) inv_mcol(x)
+
+#define ke4(k,i)							\
+{									\
+	k[4*(i)+4] = ss[0] ^= ls_box(ss[3],3) ^ rcon_tab[i];		\
+	k[4*(i)+5] = ss[1] ^= ss[0];					\
+	k[4*(i)+6] = ss[2] ^= ss[1];					\
+	k[4*(i)+7] = ss[3] ^= ss[2];					\
+}
+
+#define kel4(k,i)							\
+{									\
+	k[4*(i)+4] = ss[0] ^= ls_box(ss[3],3) ^ rcon_tab[i];		\
+	k[4*(i)+5] = ss[1] ^= ss[0];					\
+	k[4*(i)+6] = ss[2] ^= ss[1]; k[4*(i)+7] = ss[3] ^= ss[2];	\
+}
+
+#define ke6(k,i)							\
+{									\
+	k[6*(i)+ 6] = ss[0] ^= ls_box(ss[5],3) ^ rcon_tab[i];		\
+	k[6*(i)+ 7] = ss[1] ^= ss[0];					\
+	k[6*(i)+ 8] = ss[2] ^= ss[1];					\
+	k[6*(i)+ 9] = ss[3] ^= ss[2];					\
+	k[6*(i)+10] = ss[4] ^= ss[3];					\
+	k[6*(i)+11] = ss[5] ^= ss[4];					\
+}
+
+#define kel6(k,i)							\
+{									\
+	k[6*(i)+ 6] = ss[0] ^= ls_box(ss[5],3) ^ rcon_tab[i];		\
+	k[6*(i)+ 7] = ss[1] ^= ss[0];					\
+	k[6*(i)+ 8] = ss[2] ^= ss[1];					\
+	k[6*(i)+ 9] = ss[3] ^= ss[2];					\
+}
+
+#define ke8(k,i)							\
+{									\
+	k[8*(i)+ 8] = ss[0] ^= ls_box(ss[7],3) ^ rcon_tab[i];		\
+	k[8*(i)+ 9] = ss[1] ^= ss[0];					\
+	k[8*(i)+10] = ss[2] ^= ss[1];					\
+	k[8*(i)+11] = ss[3] ^= ss[2];					\
+	k[8*(i)+12] = ss[4] ^= ls_box(ss[3],0);				\
+	k[8*(i)+13] = ss[5] ^= ss[4];					\
+	k[8*(i)+14] = ss[6] ^= ss[5];					\
+	k[8*(i)+15] = ss[7] ^= ss[6];					\
+}
+
+#define kel8(k,i)							\
+{									\
+	k[8*(i)+ 8] = ss[0] ^= ls_box(ss[7],3) ^ rcon_tab[i];		\
+	k[8*(i)+ 9] = ss[1] ^= ss[0];					\
+	k[8*(i)+10] = ss[2] ^= ss[1];					\
+	k[8*(i)+11] = ss[3] ^= ss[2];					\
+}
+
+#define kdf4(k,i)							\
+{									\
+	ss[0] = ss[0] ^ ss[2] ^ ss[1] ^ ss[3];				\
+	ss[1] = ss[1] ^ ss[3];						\
+	ss[2] = ss[2] ^ ss[3];						\
+	ss[3] = ss[3];							\
+	ss[4] = ls_box(ss[(i+3) % 4], 3) ^ rcon_tab[i];			\
+	ss[i % 4] ^= ss[4];						\
+	ss[4] ^= k[4*(i)];						\
+	k[4*(i)+4] = ff(ss[4]);						\
+	ss[4] ^= k[4*(i)+1];						\
+	k[4*(i)+5] = ff(ss[4]);						\
+	ss[4] ^= k[4*(i)+2];						\
+	k[4*(i)+6] = ff(ss[4]);						\
+	ss[4] ^= k[4*(i)+3];						\
+	k[4*(i)+7] = ff(ss[4]);						\
+}
+
+#define kd4(k,i)							\
+{									\
+	ss[4] = ls_box(ss[(i+3) % 4], 3) ^ rcon_tab[i];			\
+	ss[i % 4] ^= ss[4];						\
+	ss[4] = ff(ss[4]);						\
+	k[4*(i)+4] = ss[4] ^= k[4*(i)];					\
+	k[4*(i)+5] = ss[4] ^= k[4*(i)+1];				\
+	k[4*(i)+6] = ss[4] ^= k[4*(i)+2];				\
+	k[4*(i)+7] = ss[4] ^= k[4*(i)+3];				\
+}
+
+#define kdl4(k,i)							\
+{									\
+	ss[4] = ls_box(ss[(i+3) % 4], 3) ^ rcon_tab[i];			\
+	ss[i % 4] ^= ss[4];						\
+	k[4*(i)+4] = (ss[0] ^= ss[1]) ^ ss[2] ^ ss[3];			\
+	k[4*(i)+5] = ss[1] ^ ss[3];					\
+	k[4*(i)+6] = ss[0];						\
+	k[4*(i)+7] = ss[1];						\
+}
+
+#define kdf6(k,i)							\
+{									\
+	ss[0] ^= ls_box(ss[5],3) ^ rcon_tab[i];				\
+	k[6*(i)+ 6] = ff(ss[0]);					\
+	ss[1] ^= ss[0];							\
+	k[6*(i)+ 7] = ff(ss[1]);					\
+	ss[2] ^= ss[1];							\
+	k[6*(i)+ 8] = ff(ss[2]);					\
+	ss[3] ^= ss[2];							\
+	k[6*(i)+ 9] = ff(ss[3]);					\
+	ss[4] ^= ss[3];							\
+	k[6*(i)+10] = ff(ss[4]);					\
+	ss[5] ^= ss[4];							\
+	k[6*(i)+11] = ff(ss[5]);					\
+}
+
+#define kd6(k,i)							\
+{									\
+	ss[6] = ls_box(ss[5],3) ^ rcon_tab[i];				\
+	ss[0] ^= ss[6]; ss[6] = ff(ss[6]);				\
+	k[6*(i)+ 6] = ss[6] ^= k[6*(i)];				\
+	ss[1] ^= ss[0];							\
+	k[6*(i)+ 7] = ss[6] ^= k[6*(i)+ 1];				\
+	ss[2] ^= ss[1];							\
+	k[6*(i)+ 8] = ss[6] ^= k[6*(i)+ 2];				\
+	ss[3] ^= ss[2];							\
+	k[6*(i)+ 9] = ss[6] ^= k[6*(i)+ 3];				\
+	ss[4] ^= ss[3];							\
+	k[6*(i)+10] = ss[6] ^= k[6*(i)+ 4];				\
+	ss[5] ^= ss[4];							\
+	k[6*(i)+11] = ss[6] ^= k[6*(i)+ 5];				\
+}
+
+#define kdl6(k,i)							\
+{									\
+	ss[0] ^= ls_box(ss[5],3) ^ rcon_tab[i];				\
+	k[6*(i)+ 6] = ss[0];						\
+	ss[1] ^= ss[0];							\
+	k[6*(i)+ 7] = ss[1];						\
+	ss[2] ^= ss[1];							\
+	k[6*(i)+ 8] = ss[2];						\
+	ss[3] ^= ss[2];							\
+	k[6*(i)+ 9] = ss[3];						\
+}
+
+#define kdf8(k,i)							\
+{									\
+	ss[0] ^= ls_box(ss[7],3) ^ rcon_tab[i];				\
+	k[8*(i)+ 8] = ff(ss[0]);					\
+	ss[1] ^= ss[0];							\
+	k[8*(i)+ 9] = ff(ss[1]);					\
+	ss[2] ^= ss[1];							\
+	k[8*(i)+10] = ff(ss[2]);					\
+	ss[3] ^= ss[2];							\
+	k[8*(i)+11] = ff(ss[3]);					\
+	ss[4] ^= ls_box(ss[3],0);					\
+	k[8*(i)+12] = ff(ss[4]);					\
+	ss[5] ^= ss[4];							\
+	k[8*(i)+13] = ff(ss[5]);					\
+	ss[6] ^= ss[5];							\
+	k[8*(i)+14] = ff(ss[6]);					\
+	ss[7] ^= ss[6];							\
+	k[8*(i)+15] = ff(ss[7]);					\
+}
+
+#define kd8(k,i)							\
+{									\
+	u32 __g = ls_box(ss[7],3) ^ rcon_tab[i];			\
+	ss[0] ^= __g;							\
+	__g = ff(__g);							\
+	k[8*(i)+ 8] = __g ^= k[8*(i)];					\
+	ss[1] ^= ss[0];							\
+	k[8*(i)+ 9] = __g ^= k[8*(i)+ 1];				\
+	ss[2] ^= ss[1];							\
+	k[8*(i)+10] = __g ^= k[8*(i)+ 2];				\
+	ss[3] ^= ss[2];							\
+	k[8*(i)+11] = __g ^= k[8*(i)+ 3];				\
+	__g = ls_box(ss[3],0);						\
+	ss[4] ^= __g;							\
+	__g = ff(__g);							\
+	k[8*(i)+12] = __g ^= k[8*(i)+ 4];				\
+	ss[5] ^= ss[4];							\
+	k[8*(i)+13] = __g ^= k[8*(i)+ 5];				\
+	ss[6] ^= ss[5];							\
+	k[8*(i)+14] = __g ^= k[8*(i)+ 6];				\
+	ss[7] ^= ss[6];							\
+	k[8*(i)+15] = __g ^= k[8*(i)+ 7];				\
+}
+
+#define kdl8(k,i)							\
+{									\
+	ss[0] ^= ls_box(ss[7],3) ^ rcon_tab[i];				\
+	k[8*(i)+ 8] = ss[0];						\
+	ss[1] ^= ss[0];							\
+	k[8*(i)+ 9] = ss[1];						\
+	ss[2] ^= ss[1];							\
+	k[8*(i)+10] = ss[2];						\
+	ss[3] ^= ss[2];							\
+	k[8*(i)+11] = ss[3];						\
+}
+
+static int
+aes_set_key(void *ctx_arg, const u8 *in_key, unsigned int key_len, u32 *flags)
 {
-	if(key_length != 16 && key_length != 24 && key_length != 32)
-	{
- 		*flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+	int i;
+	u32 ss[8];
+	struct aes_ctx *ctx = ctx_arg;
+
+	/* encryption schedule */
+	
+	ctx->ekey[0] = ss[0] = u32_in(in_key);
+	ctx->ekey[1] = ss[1] = u32_in(in_key + 4);
+	ctx->ekey[2] = ss[2] = u32_in(in_key + 8);
+	ctx->ekey[3] = ss[3] = u32_in(in_key + 12);
+
+	switch(key_len) {
+	case 16:
+		for (i = 0; i < 9; i++)
+			ke4(ctx->ekey, i);
+		kel4(ctx->ekey, 9);
+		ctx->rounds = 10;
+		break;
+		
+	case 24:
+		ctx->ekey[4] = ss[4] = u32_in(in_key + 16);
+		ctx->ekey[5] = ss[5] = u32_in(in_key + 20);
+		for (i = 0; i < 7; i++)
+			ke6(ctx->ekey, i);
+		kel6(ctx->ekey, 7); 
+		ctx->rounds = 12;
+		break;
+
+	case 32:
+		ctx->ekey[4] = ss[4] = u32_in(in_key + 16);
+		ctx->ekey[5] = ss[5] = u32_in(in_key + 20);
+		ctx->ekey[6] = ss[6] = u32_in(in_key + 24);
+		ctx->ekey[7] = ss[7] = u32_in(in_key + 28);
+		for (i = 0; i < 6; i++)
+			ke8(ctx->ekey, i);
+		kel8(ctx->ekey, 6);
+		ctx->rounds = 14;
+		break;
+
+	default:
+		*flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
 		return -EINVAL;
 	}
-	aes_set_key(cx, key,key_length,0);
+	
+	/* decryption schedule */
+	
+	ctx->dkey[0] = ss[0] = u32_in(in_key);
+	ctx->dkey[1] = ss[1] = u32_in(in_key + 4);
+	ctx->dkey[2] = ss[2] = u32_in(in_key + 8);
+	ctx->dkey[3] = ss[3] = u32_in(in_key + 12);
+
+	switch (key_len) {
+	case 16:
+		kdf4(ctx->dkey, 0);
+		for (i = 1; i < 9; i++)
+			kd4(ctx->dkey, i);
+		kdl4(ctx->dkey, 9);
+		break;
+		
+	case 24:
+		ctx->dkey[4] = ff(ss[4] = u32_in(in_key + 16));
+		ctx->dkey[5] = ff(ss[5] = u32_in(in_key + 20));
+		kdf6(ctx->dkey, 0);
+		for (i = 1; i < 7; i++)
+			kd6(ctx->dkey, i);
+		kdl6(ctx->dkey, 7);
+		break;
+
+	case 32:
+		ctx->dkey[4] = ff(ss[4] = u32_in(in_key + 16));
+		ctx->dkey[5] = ff(ss[5] = u32_in(in_key + 20));
+		ctx->dkey[6] = ff(ss[6] = u32_in(in_key + 24));
+		ctx->dkey[7] = ff(ss[7] = u32_in(in_key + 28));
+		kdf8(ctx->dkey, 0);
+		for (i = 1; i < 6; i++)
+			kd8(ctx->dkey, i);
+		kdl8(ctx->dkey, 6);
+		break;
+	}
 	return 0;
 }
 
-#ifdef CONFIG_REGPARM
-static void aes_encrypt_glue(void* a, unsigned char b[], const unsigned char c[]) {
-	aes_encrypt(a,b,c);
-}
-static void aes_decrypt_glue(void* a, unsigned char b[], const unsigned char c[]) {
-	aes_decrypt(a,b,c);
-}
-#else
-#define aes_encrypt_glue aes_encrypt
-#define aes_decrypt_glue aes_decrypt
-#endif /* CONFIG_REGPARM */
+static inline void aes_encrypt(void *ctx, u8 *dst, const u8 *src)	/* installed as .cia_encrypt */
+{
+	aes_enc_blk(src, dst, ctx);	/* reorder: the assembler routine takes (src, dst, ctx) */
+}
+static inline void aes_decrypt(void *ctx, u8 *dst, const u8 *src)	/* installed as .cia_decrypt */
+{
+	aes_dec_blk(src, dst, ctx);	/* reorder: aes_dec_blk is declared as (src, dst, ctx) */
+}
+
 
 static struct crypto_alg aes_alg = {
 	.cra_name		=	"aes",
 	.cra_flags		=	CRYPTO_ALG_TYPE_CIPHER,
 	.cra_blocksize		=	AES_BLOCK_SIZE,
-	.cra_ctxsize		=	sizeof(aes_context),
+	.cra_ctxsize		=	sizeof(struct aes_ctx),
 	.cra_module		=	THIS_MODULE,
 	.cra_list		=	LIST_HEAD_INIT(aes_alg.cra_list),
 	.cra_u			=	{
 		.cipher = {
 			.cia_min_keysize	=	AES_MIN_KEY_SIZE,
 			.cia_max_keysize	=	AES_MAX_KEY_SIZE,
-			.cia_setkey	   	= 	aes_set_key_glue,
-			.cia_encrypt	 	=	aes_encrypt_glue,
-			.cia_decrypt	  	=	aes_decrypt_glue
+			.cia_setkey	   	= 	aes_set_key,
+			.cia_encrypt	 	=	aes_encrypt,
+			.cia_decrypt	  	=	aes_decrypt
 		}
 	}
 };
 
 static int __init aes_init(void)
 {
+	gen_tabs();
 	return crypto_register_alg(&aes_alg);
 }
 
@@ -101,5 +516,5 @@
 
 MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm, i586 asm optimized");
 MODULE_LICENSE("Dual BSD/GPL");
-MODULE_AUTHOR("Fruhwirth Clemens");
+MODULE_AUTHOR("Fruhwirth Clemens, James Morris, Brian Gladman, Adam Richter");
 MODULE_ALIAS("aes");
Only in linux-2.6.8/arch/i386/crypto/: aes-i586-asm.o
Only in linux-2.6.8/arch/i386/crypto/: .aes-i586-asm.o.cmd
diff -u linux-2.6.8/arch/i386/crypto/aes-i586-asm.S linux-2.6.8.orig/arch/i386/crypto/aes-i586-asm.S
--- linux-2.6.8/arch/i386/crypto/aes-i586-asm.S	2004-09-24 09:42:34.641803568 +0200
+++ linux-2.6.8.orig/arch/i386/crypto/aes-i586-asm.S	2004-08-14 07:37:15.000000000 +0200
@@ -1,918 +1,341 @@
-//
-// Copyright (c) 2001, Dr Brian Gladman <brg@gladman.uk.net>, Worcester, UK.
+// -------------------------------------------------------------------------
+// Copyright (c) 2001, Dr Brian Gladman <                 >, Worcester, UK.
 // All rights reserved.
 //
-// TERMS
+// LICENSE TERMS
 //
-//  Redistribution and use in source and binary forms, with or without
-//  modification, are permitted subject to the following conditions:
+// The free distribution and use of this software in both source and binary 
+// form is allowed (with or without changes) provided that:
 //
-//  1. Redistributions of source code must retain the above copyright
-//     notice, this list of conditions and the following disclaimer.
+//   1. distributions of this source code include the above copyright 
+//      notice, this list of conditions and the following disclaimer;
 //
-//  2. Redistributions in binary form must reproduce the above copyright
-//     notice, this list of conditions and the following disclaimer in the
-//     documentation and/or other materials provided with the distribution.
+//   2. distributions in binary form include the above copyright
+//      notice, this list of conditions and the following disclaimer
+//      in the documentation and/or other associated materials;
 //
-//  3. The copyright holder's name must not be used to endorse or promote
-//     any products derived from this software without his specific prior
-//     written permission.
+//   3. the copyright holder's name is not used to endorse products 
+//      built using this software without specific written permission.
 //
-//  This software is provided 'as is' with no express or implied warranties
-//  of correctness or fitness for purpose.
-
-// Modified by Jari Ruusu,  December 24 2001
-//  - Converted syntax to GNU CPP/assembler syntax
-//  - C programming interface converted back to "old" API
-//  - Minor portability cleanups and speed optimizations
-
-// Modified by Jari Ruusu,  April 11 2002
-//  - Added above copyright and terms to resulting object code so that
-//    binary distributions can avoid legal trouble
-
-// Modified by Clemens Fruhwirth,  Feb 04 2003
-//  - Switched in/out to fit CryptoAPI calls.
-
-// An AES (Rijndael) implementation for the Pentium. This version only
-// implements the standard AES block length (128 bits, 16 bytes). This code
-// does not preserve the eax, ecx or edx registers or the artihmetic status
-// flags. However, the ebx, esi, edi, and ebp registers are preserved across
-// calls.
-
-// void aes_set_key(aes_context *cx, const unsigned char key[], const int key_len, const int f)
-// void aes_encrypt(const aes_context *cx, unsigned char out_blk[], const unsigned char in_blk[])
-// void aes_decrypt(const aes_context *cx, unsigned char out_blk[], const unsigned char in_blk[])
-
-# define ALIGN32BYTES 32
-
-	.file	"aes-i586.S"
-	.globl	aes_set_key
-	.globl	aes_encrypt
-	.globl	aes_decrypt
-
-	.text
-copyright:
-	.ascii "    \000"
-	.ascii "Copyright (c) 2001, Dr Brian Gladman <brg@gladman.uk.net>, Worcester, UK.\000"
-	.ascii "All rights reserved.\000"
-	.ascii "    \000"
-	.ascii "TERMS\000"
-	.ascii "    \000"
-	.ascii " Redistribution and use in source and binary forms, with or without\000"
-	.ascii " modification, are permitted subject to the following conditions:\000"
-	.ascii "    \000"
-	.ascii " 1. Redistributions of source code must retain the above copyright\000"
-	.ascii "    notice, this list of conditions and the following disclaimer.\000"
-	.ascii "    \000"
-	.ascii " 2. Redistributions in binary form must reproduce the above copyright\000"
-	.ascii "    notice, this list of conditions and the following disclaimer in the\000"
-	.ascii "    documentation and/or other materials provided with the distribution.\000"
-	.ascii "    \000"
-	.ascii " 3. The copyright holder's name must not be used to endorse or promote\000"
-	.ascii "    any products derived from this software without his specific prior\000"
-	.ascii "    written permission.\000"
-	.ascii "    \000"
-	.ascii " This software is provided 'as is' with no express or implied warranties\000"
-	.ascii " of correctness or fitness for purpose.\000"
-	.ascii "    \000"
+//
+// ALTERNATIVELY, provided that this notice is retained in full, this product
+// may be distributed under the terms of the GNU General Public License (GPL),
+// in which case the provisions of the GPL apply INSTEAD OF those given above.
+//
+// Copyright (c) 2004 Linus Torvalds <torvalds@osdl.org>
+// Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
 
-#define tlen	1024	// length of each of 4 'xor' arrays (256 32-bit words)
+// DISCLAIMER
+//
+// This software is provided 'as is' with no explicit or implied warranties
+// in respect of its properties including, but not limited to, correctness 
+// and fitness for purpose.
+// -------------------------------------------------------------------------
+// Issue Date: 29/07/2002
+
+.file "aes-i586-asm.S"
+.text
+
+// aes_rval aes_enc_blk(const unsigned char in_blk[], unsigned char out_blk[], const aes_ctx cx[1]);
+// aes_rval aes_dec_blk(const unsigned char in_blk[], unsigned char out_blk[], const aes_ctx cx[1]);
+	
+#define tlen 1024   // length of each of 4 'xor' arrays (256 32-bit words)
 
 // offsets to parameters with one register pushed onto stack
 
-#define ctx	8	// AES context structure
-#define out_blk	12	// output byte array address parameter
-#define in_blk	16	// input byte array address parameter
+#define in_blk    8  // input byte array address parameter
+#define out_blk  12  // output byte array address parameter
+#define ctx      16  // AES context structure
 
 // offsets in context structure
 
-#define nkey	0	// key length, size 4
-#define nrnd	4	// number of rounds, size 4
-#define ekey	8	// encryption key schedule base address, size 256
-#define dkey	264	// decryption key schedule base address, size 256
+#define ekey     0   // encryption key schedule base address
+#define nrnd   256   // number of rounds
+#define dkey   260   // decryption key schedule base address
+
+// register mapping for encrypt and decrypt subroutines
+
+#define r0  eax
+#define r1  ebx
+#define r2  ecx
+#define r3  edx
+#define r4  esi
+#define r5  edi
+#define r6  ebp
+
+#define eaxl  al
+#define eaxh  ah
+#define ebxl  bl
+#define ebxh  bh
+#define ecxl  cl
+#define ecxh  ch
+#define edxl  dl
+#define edxh  dh
+
+#define _h(reg) reg##h
+#define h(reg) _h(reg)
+
+#define _l(reg) reg##l
+#define l(reg) _l(reg)
+
+// This macro takes a 32-bit word representing a column and uses
+// each of its four bytes to index into four tables of 256 32-bit
+// words to obtain values that are then xored into the appropriate
+// output registers r0, r1, r4 or r5.  
+
+// Parameters:
+//   a1  out_state[0]
+//   a2  out_state[1]
+//   a3  out_state[2]
+//   a4  out_state[3]
+//   a5  table base address
+//   a6  input register for the round (destroyed)
+//   a7  scratch register for the round
+
+#define do_col(a1, a2, a3, a4, a5, a6, a7)	\
+	movzx   %l(a6),%a7;			\
+	xor     a5(,%a7,4),%a1;			\
+	movzx   %h(a6),%a7;			\
+	shr     $16,%a6;			\
+	xor     a5+tlen(,%a7,4),%a2;		\
+	movzx   %l(a6),%a7;			\
+	movzx   %h(a6),%a6;			\
+	xor     a5+2*tlen(,%a7,4),%a3;		\
+	xor     a5+3*tlen(,%a6,4),%a4;
+
+// initialise output registers from the key schedule
+
+#define do_fcol(a1, a2, a3, a4, a5, a6, a7, a8)	\
+	mov     0 a8,%a1;			\
+	movzx   %l(a6),%a7;			\
+	mov     12 a8,%a2;			\
+	xor     a5(,%a7,4),%a1;			\
+	mov     4 a8,%a4;			\
+	movzx   %h(a6),%a7;			\
+	shr     $16,%a6;			\
+	xor     a5+tlen(,%a7,4),%a2;		\
+	movzx   %l(a6),%a7;			\
+	movzx   %h(a6),%a6;			\
+	xor     a5+3*tlen(,%a6,4),%a4;		\
+	mov     %a3,%a6;			\
+	mov     8 a8,%a3;			\
+	xor     a5+2*tlen(,%a7,4),%a3;
+
+// initialise output registers from the key schedule
+
+#define do_icol(a1, a2, a3, a4, a5, a6, a7, a8)	\
+	mov     0 a8,%a1;			\
+	movzx   %l(a6),%a7;			\
+	mov     4 a8,%a2;			\
+	xor     a5(,%a7,4),%a1;			\
+	mov     12 a8,%a4;			\
+	movzx   %h(a6),%a7;			\
+	shr     $16,%a6;			\
+	xor     a5+tlen(,%a7,4),%a2;		\
+	movzx   %l(a6),%a7;			\
+	movzx   %h(a6),%a6;			\
+	xor     a5+3*tlen(,%a6,4),%a4;		\
+	mov     %a3,%a6;			\
+	mov     8 a8,%a3;			\
+	xor     a5+2*tlen(,%a7,4),%a3;
+
+
+// original Gladman had conditional saves to MMX regs.
+#define save(a1, a2)		\
+	mov     %a2,4*a1(%esp)
 
-// This macro performs a forward encryption cycle. It is entered with
-// the first previous round column values in %eax, %ebx, %esi and %edi and
-// exits with the final values in the same registers.
+#define restore(a1, a2)		\
+	mov     4*a2(%esp),%a1
 
-#define fwd_rnd(p1,p2)			 \
-	mov	%ebx,(%esp)		;\
-	movzbl	%al,%edx		;\
-	mov	%eax,%ecx		;\
-	mov	p2(%ebp),%eax		;\
-	mov	%edi,4(%esp)		;\
-	mov	p2+12(%ebp),%edi	;\
-	xor	p1(,%edx,4),%eax	;\
-	movzbl	%ch,%edx		;\
-	shr	$16,%ecx		;\
-	mov	p2+4(%ebp),%ebx		;\
-	xor	p1+tlen(,%edx,4),%edi	;\
-	movzbl	%cl,%edx		;\
-	movzbl	%ch,%ecx		;\
-	xor	p1+3*tlen(,%ecx,4),%ebx	;\
-	mov	%esi,%ecx		;\
-	mov	p1+2*tlen(,%edx,4),%esi	;\
-	movzbl	%cl,%edx		;\
-	xor	p1(,%edx,4),%esi	;\
-	movzbl	%ch,%edx		;\
-	shr	$16,%ecx		;\
-	xor	p1+tlen(,%edx,4),%ebx	;\
-	movzbl	%cl,%edx		;\
-	movzbl	%ch,%ecx		;\
-	xor	p1+2*tlen(,%edx,4),%eax	;\
-	mov	(%esp),%edx		;\
-	xor	p1+3*tlen(,%ecx,4),%edi ;\
-	movzbl	%dl,%ecx		;\
-	xor	p2+8(%ebp),%esi		;\
-	xor	p1(,%ecx,4),%ebx	;\
-	movzbl	%dh,%ecx		;\
-	shr	$16,%edx		;\
-	xor	p1+tlen(,%ecx,4),%eax	;\
-	movzbl	%dl,%ecx		;\
-	movzbl	%dh,%edx		;\
-	xor	p1+2*tlen(,%ecx,4),%edi	;\
-	mov	4(%esp),%ecx		;\
-	xor	p1+3*tlen(,%edx,4),%esi ;\
-	movzbl	%cl,%edx		;\
-	xor	p1(,%edx,4),%edi	;\
-	movzbl	%ch,%edx		;\
-	shr	$16,%ecx		;\
-	xor	p1+tlen(,%edx,4),%esi	;\
-	movzbl	%cl,%edx		;\
-	movzbl	%ch,%ecx		;\
-	xor	p1+2*tlen(,%edx,4),%ebx	;\
-	xor	p1+3*tlen(,%ecx,4),%eax
+// This macro performs a forward encryption cycle. It is entered with
+// the previous round's column values in r0, r1, r4 and r5 and exits
+// with the new values in the same registers, using two stack slots
+// (see the save/restore macros) for temporary storage.
+
+// The current column values are saved on the stack, not in MMX registers.
+#define fwd_rnd(arg, table)					\
+	/* copy r0 to r2; park r1 and r5 in stack slots 0 and 1 */	\
+	mov     %r0,%r2;					\
+	save   (0,r1);						\
+	save   (1,r5);						\
+								\
+	/* compute new column values */				\
+	do_fcol(r0,r5,r4,r1,table, r2,r3, arg);			\
+	do_col (r4,r1,r0,r5,table, r2,r3);			\
+	restore(r2,0);						\
+	do_col (r1,r0,r5,r4,table, r2,r3);			\
+	restore(r2,1);						\
+	do_col (r5,r4,r1,r0,table, r2,r3);
 
 // This macro performs an inverse encryption cycle. It is entered with
-// the first previous round column values in %eax, %ebx, %esi and %edi and
-// exits with the final values in the same registers.
-
-#define inv_rnd(p1,p2)			 \
-	movzbl	%al,%edx		;\
-	mov	%ebx,(%esp)		;\
-	mov	%eax,%ecx		;\
-	mov	p2(%ebp),%eax		;\
-	mov	%edi,4(%esp)		;\
-	mov	p2+4(%ebp),%ebx		;\
-	xor	p1(,%edx,4),%eax	;\
-	movzbl	%ch,%edx		;\
-	shr	$16,%ecx		;\
-	mov	p2+12(%ebp),%edi	;\
-	xor	p1+tlen(,%edx,4),%ebx	;\
-	movzbl	%cl,%edx		;\
-	movzbl	%ch,%ecx		;\
-	xor	p1+3*tlen(,%ecx,4),%edi	;\
-	mov	%esi,%ecx		;\
-	mov	p1+2*tlen(,%edx,4),%esi	;\
-	movzbl	%cl,%edx		;\
-	xor	p1(,%edx,4),%esi	;\
-	movzbl	%ch,%edx		;\
-	shr	$16,%ecx		;\
-	xor	p1+tlen(,%edx,4),%edi	;\
-	movzbl	%cl,%edx		;\
-	movzbl	%ch,%ecx		;\
-	xor	p1+2*tlen(,%edx,4),%eax	;\
-	mov	(%esp),%edx		;\
-	xor	p1+3*tlen(,%ecx,4),%ebx ;\
-	movzbl	%dl,%ecx		;\
-	xor	p2+8(%ebp),%esi		;\
-	xor	p1(,%ecx,4),%ebx	;\
-	movzbl	%dh,%ecx		;\
-	shr	$16,%edx		;\
-	xor	p1+tlen(,%ecx,4),%esi	;\
-	movzbl	%dl,%ecx		;\
-	movzbl	%dh,%edx		;\
-	xor	p1+2*tlen(,%ecx,4),%edi	;\
-	mov	4(%esp),%ecx		;\
-	xor	p1+3*tlen(,%edx,4),%eax ;\
-	movzbl	%cl,%edx		;\
-	xor	p1(,%edx,4),%edi	;\
-	movzbl	%ch,%edx		;\
-	shr	$16,%ecx		;\
-	xor	p1+tlen(,%edx,4),%eax	;\
-	movzbl	%cl,%edx		;\
-	movzbl	%ch,%ecx		;\
-	xor	p1+2*tlen(,%edx,4),%ebx	;\
-	xor	p1+3*tlen(,%ecx,4),%esi
+// the previous round's column values in r0, r1, r4 and r5 and exits
+// with the new values in the same registers, using two stack slots
+// (see the save/restore macros) for temporary storage.
+
+#define inv_rnd(arg, table)					\
+	/* copy r0 to r2; park r1 and r5 in stack slots 0 and 1 */	\
+	mov     %r0,%r2;					\
+	save    (0,r1);						\
+	save    (1,r5);						\
+								\
+	/* compute new column values */				\
+	do_icol(r0,r1,r4,r5, table, r2,r3, arg);		\
+	do_col (r4,r5,r0,r1, table, r2,r3);			\
+	restore(r2,0);						\
+	do_col (r1,r4,r5,r0, table, r2,r3);			\
+	restore(r2,1);						\
+	do_col (r5,r0,r1,r4, table, r2,r3);
 
 // AES (Rijndael) Encryption Subroutine
 
-	.text
-	.align	ALIGN32BYTES
-aes_encrypt:
-	push	%ebp
-	mov	ctx(%esp),%ebp		// pointer to context
-	mov	in_blk(%esp),%ecx
-	push	%ebx
-	push	%esi
-	push	%edi
-	mov	nrnd(%ebp),%edx		// number of rounds
-	lea	ekey+16(%ebp),%ebp	// key pointer
-
-// input four columns and xor in first round key
-
-	mov	(%ecx),%eax
-	mov	4(%ecx),%ebx
-	mov	8(%ecx),%esi
-	mov	12(%ecx),%edi
-	xor	-16(%ebp),%eax
-	xor	-12(%ebp),%ebx
-	xor	-8(%ebp),%esi
-	xor	-4(%ebp),%edi
-
-	sub	$8,%esp			// space for register saves on stack
-
-	sub	$10,%edx
-	je	aes_15
-	add	$32,%ebp
-	sub	$2,%edx
-	je	aes_13
-	add	$32,%ebp
-
-	fwd_rnd(aes_ft_tab,-64)		// 14 rounds for 256-bit key
-	fwd_rnd(aes_ft_tab,-48)
-aes_13:	fwd_rnd(aes_ft_tab,-32)		// 12 rounds for 192-bit key
-	fwd_rnd(aes_ft_tab,-16)
-aes_15:	fwd_rnd(aes_ft_tab,0)		// 10 rounds for 128-bit key
-	fwd_rnd(aes_ft_tab,16)
-	fwd_rnd(aes_ft_tab,32)
-	fwd_rnd(aes_ft_tab,48)
-	fwd_rnd(aes_ft_tab,64)
-	fwd_rnd(aes_ft_tab,80)
-	fwd_rnd(aes_ft_tab,96)
-	fwd_rnd(aes_ft_tab,112)
-	fwd_rnd(aes_ft_tab,128)
-	fwd_rnd(aes_fl_tab,144)		// last round uses a different table
-
-// move final values to the output array.
-
-	mov	out_blk+20(%esp),%ebp
-	add	$8,%esp
-	mov	%eax,(%ebp)
-	mov	%ebx,4(%ebp)
-	mov	%esi,8(%ebp)
-	mov	%edi,12(%ebp)
-	pop	%edi
-	pop	%esi
-	pop	%ebx
-	pop	%ebp
-	ret
+.global  aes_enc_blk
 
+.extern  ft_tab
+.extern  fl_tab
 
-// AES (Rijndael) Decryption Subroutine
+.align 4
 
-	.align	ALIGN32BYTES
-aes_decrypt:
-	push	%ebp
-	mov	ctx(%esp),%ebp		// pointer to context
-	mov	in_blk(%esp),%ecx
-	push	%ebx
-	push	%esi
-	push	%edi
-	mov	nrnd(%ebp),%edx		// number of rounds
-	lea	dkey+16(%ebp),%ebp	// key pointer
+aes_enc_blk:	// encrypt one 16-byte block from in_blk to out_blk using the ekey schedule in ctx
+	push    %ebp                // restored just before ret
+	mov     ctx(%esp),%ebp      // pointer to context
+	xor     %eax,%eax
+
+// CAUTION: the order of, and the stack offsets used in, the loads
+// below rely on the register mappings and the interleaved pushes
+
+1:	push    %ebx
+	mov     in_blk+4(%esp),%r2   // input block pointer; +4 compensates for the %ebx push
+	push    %esi
+	mov     nrnd(%ebp),%r3   // number of rounds
+	push    %edi
+	lea     ekey(%ebp),%r6   // key pointer
 
 // input four columns and xor in first round key
 
-	mov	(%ecx),%eax
-	mov	4(%ecx),%ebx
-	mov	8(%ecx),%esi
-	mov	12(%ecx),%edi
-	xor	-16(%ebp),%eax
-	xor	-12(%ebp),%ebx
-	xor	-8(%ebp),%esi
-	xor	-4(%ebp),%edi
-
-	sub	$8,%esp			// space for register saves on stack
-
-	sub	$10,%edx
-	je	aes_25
-	add	$32,%ebp
-	sub	$2,%edx
-	je	aes_23
-	add	$32,%ebp
-
-	inv_rnd(aes_it_tab,-64)		// 14 rounds for 256-bit key
-	inv_rnd(aes_it_tab,-48)
-aes_23:	inv_rnd(aes_it_tab,-32)		// 12 rounds for 192-bit key
-	inv_rnd(aes_it_tab,-16)
-aes_25:	inv_rnd(aes_it_tab,0)		// 10 rounds for 128-bit key
-	inv_rnd(aes_it_tab,16)
-	inv_rnd(aes_it_tab,32)
-	inv_rnd(aes_it_tab,48)
-	inv_rnd(aes_it_tab,64)
-	inv_rnd(aes_it_tab,80)
-	inv_rnd(aes_it_tab,96)
-	inv_rnd(aes_it_tab,112)
-	inv_rnd(aes_it_tab,128)
-	inv_rnd(aes_il_tab,144)		// last round uses a different table
-
-// move final values to the output array.
-
-	mov	out_blk+20(%esp),%ebp
-	add	$8,%esp
-	mov	%eax,(%ebp)
-	mov	%ebx,4(%ebp)
-	mov	%esi,8(%ebp)
-	mov	%edi,12(%ebp)
-	pop	%edi
-	pop	%esi
-	pop	%ebx
-	pop	%ebp
+	mov     (%r2),%r0
+	mov     4(%r2),%r1
+	mov     8(%r2),%r4
+	mov     12(%r2),%r5
+	xor     (%r6),%r0
+	xor     4(%r6),%r1
+	xor     8(%r6),%r4
+	xor     12(%r6),%r5
+
+	sub     $8,%esp           // space for register saves on stack
+	add     $16,%r6           // increment to next round key
+	sub     $10,%r3          // %r3 = nrnd - 10
+	je      4f              // 10 rounds for 128-bit key
+	add     $32,%r6         // bias key pointer: two extra rounds run at -32/-16
+	sub     $2,%r3
+	je      3f              // 12 rounds for 192-bit key
+	add     $32,%r6         // bias again: two more extra rounds run at -64/-48
+
+2:	fwd_rnd( -64(%r6) ,ft_tab)	// 14 rounds for 256-bit key
+	fwd_rnd( -48(%r6) ,ft_tab)
+3:	fwd_rnd( -32(%r6) ,ft_tab)	// 12 rounds for 192-bit key
+	fwd_rnd( -16(%r6) ,ft_tab)
+4:	fwd_rnd(    (%r6) ,ft_tab)	// 10 rounds for 128-bit key
+	fwd_rnd( +16(%r6) ,ft_tab)
+	fwd_rnd( +32(%r6) ,ft_tab)
+	fwd_rnd( +48(%r6) ,ft_tab)
+	fwd_rnd( +64(%r6) ,ft_tab)
+	fwd_rnd( +80(%r6) ,ft_tab)
+	fwd_rnd( +96(%r6) ,ft_tab)
+	fwd_rnd(+112(%r6) ,ft_tab)
+	fwd_rnd(+128(%r6) ,ft_tab)
+	fwd_rnd(+144(%r6) ,fl_tab)	// last round uses a different table
+
+// move final values to the output array.  CAUTION: the stores are interleaved
+// with the register restores, so their order relies on the register mappings
+
+	add     $8,%esp           // release the register-save space reserved above
+	mov     out_blk+12(%esp),%r6   // output block pointer; +12 compensates for the %ebx/%esi/%edi pushes
+	mov     %r5,12(%r6)
+	pop     %edi
+	mov     %r4,8(%r6)
+	pop     %esi
+	mov     %r1,4(%r6)
+	pop     %ebx
+	mov     %r0,(%r6)
+	pop     %ebp
+	mov     $1,%eax           // leave 1 in %eax (presumably the return value; confirm against the C prototype)
 	ret
 
-// AES (Rijndael) Key Schedule Subroutine
-
-// input/output parameters
+// AES (Rijndael) Decryption Subroutine
 
-#define aes_cx	12	// AES context
-#define in_key	16	// key input array address
-#define key_ln	20	// key length, bytes (16,24,32) or bits (128,192,256)
-#define ed_flg	24	// 0=create both encr/decr keys, 1=create encr key only
-
-// offsets for locals
-
-#define cnt	-4
-#define kpf	-8
-#define slen	8
-
-// This macro performs a column mixing operation on an input 32-bit
-// word to give a 32-bit result. It uses each of the 4 bytes in the
-// the input column to index 4 different tables of 256 32-bit words
-// that are xored together to form the output value.
-
-#define mix_col(p1)			 \
-	movzbl	%bl,%ecx		;\
-	mov	p1(,%ecx,4),%eax	;\
-	movzbl	%bh,%ecx		;\
-	ror	$16,%ebx		;\
-	xor	p1+tlen(,%ecx,4),%eax	;\
-	movzbl	%bl,%ecx		;\
-	xor	p1+2*tlen(,%ecx,4),%eax	;\
-	movzbl	%bh,%ecx		;\
-	xor	p1+3*tlen(,%ecx,4),%eax
-
-// Key Schedule Macros
-
-#define ksc4(p1)			 \
-	rol	$24,%ebx		;\
-	mix_col(aes_fl_tab)		;\
-	ror	$8,%ebx			;\
-	xor	4*p1+aes_rcon_tab,%eax	;\
-	xor	%eax,%esi		;\
-	xor	%esi,%ebp		;\
-	mov	%esi,16*p1(%edi)	;\
-	mov	%ebp,16*p1+4(%edi)	;\
-	xor	%ebp,%edx		;\
-	xor	%edx,%ebx		;\
-	mov	%edx,16*p1+8(%edi)	;\
-	mov	%ebx,16*p1+12(%edi)
-
-#define ksc6(p1)			 \
-	rol	$24,%ebx		;\
-	mix_col(aes_fl_tab)		;\
-	ror	$8,%ebx			;\
-	xor	4*p1+aes_rcon_tab,%eax	;\
-	xor	24*p1-24(%edi),%eax	;\
-	mov	%eax,24*p1(%edi)	;\
-	xor	24*p1-20(%edi),%eax	;\
-	mov	%eax,24*p1+4(%edi)	;\
-	xor	%eax,%esi		;\
-	xor	%esi,%ebp		;\
-	mov	%esi,24*p1+8(%edi)	;\
-	mov	%ebp,24*p1+12(%edi)	;\
-	xor	%ebp,%edx		;\
-	xor	%edx,%ebx		;\
-	mov	%edx,24*p1+16(%edi)	;\
-	mov	%ebx,24*p1+20(%edi)
-
-#define ksc8(p1)			 \
-	rol	$24,%ebx		;\
-	mix_col(aes_fl_tab)		;\
-	ror	$8,%ebx			;\
-	xor	4*p1+aes_rcon_tab,%eax	;\
-	xor	32*p1-32(%edi),%eax	;\
-	mov	%eax,32*p1(%edi)	;\
-	xor	32*p1-28(%edi),%eax	;\
-	mov	%eax,32*p1+4(%edi)	;\
-	xor	32*p1-24(%edi),%eax	;\
-	mov	%eax,32*p1+8(%edi)	;\
-	xor	32*p1-20(%edi),%eax	;\
-	mov	%eax,32*p1+12(%edi)	;\
-	push	%ebx			;\
-	mov	%eax,%ebx		;\
-	mix_col(aes_fl_tab)		;\
-	pop	%ebx			;\
-	xor	%eax,%esi		;\
-	xor	%esi,%ebp		;\
-	mov	%esi,32*p1+16(%edi)	;\
-	mov	%ebp,32*p1+20(%edi)	;\
-	xor	%ebp,%edx		;\
-	xor	%edx,%ebx		;\
-	mov	%edx,32*p1+24(%edi)	;\
-	mov	%ebx,32*p1+28(%edi)
-
-	.align	ALIGN32BYTES
-aes_set_key:
-	pushfl
-	push	%ebp
-	mov	%esp,%ebp
-	sub	$slen,%esp
-	push	%ebx
-	push	%esi
-	push	%edi
-
-	mov	aes_cx(%ebp),%edx	// edx -> AES context
-
-	mov	key_ln(%ebp),%ecx	// key length
-	cmpl	$128,%ecx
-	jb	aes_30
-	shr	$3,%ecx
-aes_30:	cmpl	$32,%ecx
-	je	aes_32
-	cmpl	$24,%ecx
-	je	aes_32
-	mov	$16,%ecx
-aes_32:	shr	$2,%ecx
-	mov	%ecx,nkey(%edx)
-
-	lea	6(%ecx),%eax		// 10/12/14 for 4/6/8 32-bit key length
-	mov	%eax,nrnd(%edx)
-
-	mov	in_key(%ebp),%esi	// key input array
-	lea	ekey(%edx),%edi		// key position in AES context
-	cld
-	push	%ebp
-	mov	%ecx,%eax		// save key length in eax
-	rep ;	movsl			// words in the key schedule
-	mov	-4(%esi),%ebx		// put some values in registers
-	mov	-8(%esi),%edx		// to allow faster code
-	mov	-12(%esi),%ebp
-	mov	-16(%esi),%esi
-
-	cmpl	$4,%eax			// jump on key size
-	je	aes_36
-	cmpl	$6,%eax
-	je	aes_35
-
-	ksc8(0)
-	ksc8(1)
-	ksc8(2)
-	ksc8(3)
-	ksc8(4)
-	ksc8(5)
-	ksc8(6)
-	jmp	aes_37
-aes_35:	ksc6(0)
-	ksc6(1)
-	ksc6(2)
-	ksc6(3)
-	ksc6(4)
-	ksc6(5)
-	ksc6(6)
-	ksc6(7)
-	jmp	aes_37
-aes_36:	ksc4(0)
-	ksc4(1)
-	ksc4(2)
-	ksc4(3)
-	ksc4(4)
-	ksc4(5)
-	ksc4(6)
-	ksc4(7)
-	ksc4(8)
-	ksc4(9)
-aes_37:	pop	%ebp
-	mov	aes_cx(%ebp),%edx	// edx -> AES context
-	cmpl	$0,ed_flg(%ebp)
-	jne	aes_39
-
-// compile decryption key schedule from encryption schedule - reverse
-// order and do mix_column operation on round keys except first and last
-
-	mov	nrnd(%edx),%eax		// kt = cx->d_key + nc * cx->Nrnd
-	shl	$2,%eax
-	lea	dkey(%edx,%eax,4),%edi
-	lea	ekey(%edx),%esi		// kf = cx->e_key
-
-	movsl				// copy first round key (unmodified)
-	movsl
-	movsl
-	movsl
-	sub	$32,%edi
-	movl	$1,cnt(%ebp)
-aes_38:					// do mix column on each column of
-	lodsl				// each round key
-	mov	%eax,%ebx
-	mix_col(aes_im_tab)
-	stosl
-	lodsl
-	mov	%eax,%ebx
-	mix_col(aes_im_tab)
-	stosl
-	lodsl
-	mov	%eax,%ebx
-	mix_col(aes_im_tab)
-	stosl
-	lodsl
-	mov	%eax,%ebx
-	mix_col(aes_im_tab)
-	stosl
-	sub	$32,%edi
-
-	incl	cnt(%ebp)
-	mov	cnt(%ebp),%eax
-	cmp	nrnd(%edx),%eax
-	jb	aes_38
-
-	movsl				// copy last round key (unmodified)
-	movsl
-	movsl
-	movsl
-aes_39:	pop	%edi
-	pop	%esi
-	pop	%ebx
-	mov	%ebp,%esp
-	pop	%ebp
-	popfl
-	ret
+.global  aes_dec_blk
 
+.extern  it_tab
+.extern  il_tab
 
-// finite field multiplies by {02}, {04} and {08}
+.align 4
 
-#define f2(x)	((x<<1)^(((x>>7)&1)*0x11b))
-#define f4(x)	((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
-#define f8(x)	((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))
-
-// finite field multiplies required in table generation
-
-#define f3(x)	(f2(x) ^ x)
-#define f9(x)	(f8(x) ^ x)
-#define fb(x)	(f8(x) ^ f2(x) ^ x)
-#define fd(x)	(f8(x) ^ f4(x) ^ x)
-#define fe(x)	(f8(x) ^ f4(x) ^ f2(x))
-
-// These defines generate the forward table entries
-
-#define u0(x)	((f3(x) << 24) | (x << 16) | (x << 8) | f2(x))
-#define u1(x)	((x << 24) | (x << 16) | (f2(x) << 8) | f3(x))
-#define u2(x)	((x << 24) | (f2(x) << 16) | (f3(x) << 8) | x)
-#define u3(x)	((f2(x) << 24) | (f3(x) << 16) | (x << 8) | x)
-
-// These defines generate the inverse table entries
-
-#define v0(x)	((fb(x) << 24) | (fd(x) << 16) | (f9(x) << 8) | fe(x))
-#define v1(x)	((fd(x) << 24) | (f9(x) << 16) | (fe(x) << 8) | fb(x))
-#define v2(x)	((f9(x) << 24) | (fe(x) << 16) | (fb(x) << 8) | fd(x))
-#define v3(x)	((fe(x) << 24) | (fb(x) << 16) | (fd(x) << 8) | f9(x))
-
-// These defines generate entries for the last round tables
-
-#define w0(x)	(x)
-#define w1(x)	(x <<  8)
-#define w2(x)	(x << 16)
-#define w3(x)	(x << 24)
-
-// macro to generate inverse mix column tables (needed for the key schedule)
-
-#define im_data0(p1) \
-	.long	p1(0x00),p1(0x01),p1(0x02),p1(0x03),p1(0x04),p1(0x05),p1(0x06),p1(0x07) ;\
-	.long	p1(0x08),p1(0x09),p1(0x0a),p1(0x0b),p1(0x0c),p1(0x0d),p1(0x0e),p1(0x0f) ;\
-	.long	p1(0x10),p1(0x11),p1(0x12),p1(0x13),p1(0x14),p1(0x15),p1(0x16),p1(0x17) ;\
-	.long	p1(0x18),p1(0x19),p1(0x1a),p1(0x1b),p1(0x1c),p1(0x1d),p1(0x1e),p1(0x1f)
-#define im_data1(p1) \
-	.long	p1(0x20),p1(0x21),p1(0x22),p1(0x23),p1(0x24),p1(0x25),p1(0x26),p1(0x27) ;\
-	.long	p1(0x28),p1(0x29),p1(0x2a),p1(0x2b),p1(0x2c),p1(0x2d),p1(0x2e),p1(0x2f) ;\
-	.long	p1(0x30),p1(0x31),p1(0x32),p1(0x33),p1(0x34),p1(0x35),p1(0x36),p1(0x37) ;\
-	.long	p1(0x38),p1(0x39),p1(0x3a),p1(0x3b),p1(0x3c),p1(0x3d),p1(0x3e),p1(0x3f)
-#define im_data2(p1) \
-	.long	p1(0x40),p1(0x41),p1(0x42),p1(0x43),p1(0x44),p1(0x45),p1(0x46),p1(0x47) ;\
-	.long	p1(0x48),p1(0x49),p1(0x4a),p1(0x4b),p1(0x4c),p1(0x4d),p1(0x4e),p1(0x4f) ;\
-	.long	p1(0x50),p1(0x51),p1(0x52),p1(0x53),p1(0x54),p1(0x55),p1(0x56),p1(0x57) ;\
-	.long	p1(0x58),p1(0x59),p1(0x5a),p1(0x5b),p1(0x5c),p1(0x5d),p1(0x5e),p1(0x5f)
-#define im_data3(p1) \
-	.long	p1(0x60),p1(0x61),p1(0x62),p1(0x63),p1(0x64),p1(0x65),p1(0x66),p1(0x67) ;\
-	.long	p1(0x68),p1(0x69),p1(0x6a),p1(0x6b),p1(0x6c),p1(0x6d),p1(0x6e),p1(0x6f) ;\
-	.long	p1(0x70),p1(0x71),p1(0x72),p1(0x73),p1(0x74),p1(0x75),p1(0x76),p1(0x77) ;\
-	.long	p1(0x78),p1(0x79),p1(0x7a),p1(0x7b),p1(0x7c),p1(0x7d),p1(0x7e),p1(0x7f)
-#define im_data4(p1) \
-	.long	p1(0x80),p1(0x81),p1(0x82),p1(0x83),p1(0x84),p1(0x85),p1(0x86),p1(0x87) ;\
-	.long	p1(0x88),p1(0x89),p1(0x8a),p1(0x8b),p1(0x8c),p1(0x8d),p1(0x8e),p1(0x8f) ;\
-	.long	p1(0x90),p1(0x91),p1(0x92),p1(0x93),p1(0x94),p1(0x95),p1(0x96),p1(0x97) ;\
-	.long	p1(0x98),p1(0x99),p1(0x9a),p1(0x9b),p1(0x9c),p1(0x9d),p1(0x9e),p1(0x9f)
-#define im_data5(p1) \
-	.long	p1(0xa0),p1(0xa1),p1(0xa2),p1(0xa3),p1(0xa4),p1(0xa5),p1(0xa6),p1(0xa7) ;\
-	.long	p1(0xa8),p1(0xa9),p1(0xaa),p1(0xab),p1(0xac),p1(0xad),p1(0xae),p1(0xaf) ;\
-	.long	p1(0xb0),p1(0xb1),p1(0xb2),p1(0xb3),p1(0xb4),p1(0xb5),p1(0xb6),p1(0xb7) ;\
-	.long	p1(0xb8),p1(0xb9),p1(0xba),p1(0xbb),p1(0xbc),p1(0xbd),p1(0xbe),p1(0xbf)
-#define im_data6(p1) \
-	.long	p1(0xc0),p1(0xc1),p1(0xc2),p1(0xc3),p1(0xc4),p1(0xc5),p1(0xc6),p1(0xc7) ;\
-	.long	p1(0xc8),p1(0xc9),p1(0xca),p1(0xcb),p1(0xcc),p1(0xcd),p1(0xce),p1(0xcf) ;\
-	.long	p1(0xd0),p1(0xd1),p1(0xd2),p1(0xd3),p1(0xd4),p1(0xd5),p1(0xd6),p1(0xd7) ;\
-	.long	p1(0xd8),p1(0xd9),p1(0xda),p1(0xdb),p1(0xdc),p1(0xdd),p1(0xde),p1(0xdf)
-#define im_data7(p1) \
-	.long	p1(0xe0),p1(0xe1),p1(0xe2),p1(0xe3),p1(0xe4),p1(0xe5),p1(0xe6),p1(0xe7) ;\
-	.long	p1(0xe8),p1(0xe9),p1(0xea),p1(0xeb),p1(0xec),p1(0xed),p1(0xee),p1(0xef) ;\
-	.long	p1(0xf0),p1(0xf1),p1(0xf2),p1(0xf3),p1(0xf4),p1(0xf5),p1(0xf6),p1(0xf7) ;\
-	.long	p1(0xf8),p1(0xf9),p1(0xfa),p1(0xfb),p1(0xfc),p1(0xfd),p1(0xfe),p1(0xff)
-
-// S-box data - 256 entries
-
-#define sb_data0(p1) \
-	.long	p1(0x63),p1(0x7c),p1(0x77),p1(0x7b),p1(0xf2),p1(0x6b),p1(0x6f),p1(0xc5) ;\
-	.long	p1(0x30),p1(0x01),p1(0x67),p1(0x2b),p1(0xfe),p1(0xd7),p1(0xab),p1(0x76) ;\
-	.long	p1(0xca),p1(0x82),p1(0xc9),p1(0x7d),p1(0xfa),p1(0x59),p1(0x47),p1(0xf0) ;\
-	.long	p1(0xad),p1(0xd4),p1(0xa2),p1(0xaf),p1(0x9c),p1(0xa4),p1(0x72),p1(0xc0)
-#define sb_data1(p1) \
-	.long	p1(0xb7),p1(0xfd),p1(0x93),p1(0x26),p1(0x36),p1(0x3f),p1(0xf7),p1(0xcc) ;\
-	.long	p1(0x34),p1(0xa5),p1(0xe5),p1(0xf1),p1(0x71),p1(0xd8),p1(0x31),p1(0x15) ;\
-	.long	p1(0x04),p1(0xc7),p1(0x23),p1(0xc3),p1(0x18),p1(0x96),p1(0x05),p1(0x9a) ;\
-	.long	p1(0x07),p1(0x12),p1(0x80),p1(0xe2),p1(0xeb),p1(0x27),p1(0xb2),p1(0x75)
-#define sb_data2(p1) \
-	.long	p1(0x09),p1(0x83),p1(0x2c),p1(0x1a),p1(0x1b),p1(0x6e),p1(0x5a),p1(0xa0) ;\
-	.long	p1(0x52),p1(0x3b),p1(0xd6),p1(0xb3),p1(0x29),p1(0xe3),p1(0x2f),p1(0x84) ;\
-	.long	p1(0x53),p1(0xd1),p1(0x00),p1(0xed),p1(0x20),p1(0xfc),p1(0xb1),p1(0x5b) ;\
-	.long	p1(0x6a),p1(0xcb),p1(0xbe),p1(0x39),p1(0x4a),p1(0x4c),p1(0x58),p1(0xcf)
-#define sb_data3(p1) \
-	.long	p1(0xd0),p1(0xef),p1(0xaa),p1(0xfb),p1(0x43),p1(0x4d),p1(0x33),p1(0x85) ;\
-	.long	p1(0x45),p1(0xf9),p1(0x02),p1(0x7f),p1(0x50),p1(0x3c),p1(0x9f),p1(0xa8) ;\
-	.long	p1(0x51),p1(0xa3),p1(0x40),p1(0x8f),p1(0x92),p1(0x9d),p1(0x38),p1(0xf5) ;\
-	.long	p1(0xbc),p1(0xb6),p1(0xda),p1(0x21),p1(0x10),p1(0xff),p1(0xf3),p1(0xd2)
-#define sb_data4(p1) \
-	.long	p1(0xcd),p1(0x0c),p1(0x13),p1(0xec),p1(0x5f),p1(0x97),p1(0x44),p1(0x17) ;\
-	.long	p1(0xc4),p1(0xa7),p1(0x7e),p1(0x3d),p1(0x64),p1(0x5d),p1(0x19),p1(0x73) ;\
-	.long	p1(0x60),p1(0x81),p1(0x4f),p1(0xdc),p1(0x22),p1(0x2a),p1(0x90),p1(0x88) ;\
-	.long	p1(0x46),p1(0xee),p1(0xb8),p1(0x14),p1(0xde),p1(0x5e),p1(0x0b),p1(0xdb)
-#define sb_data5(p1) \
-	.long	p1(0xe0),p1(0x32),p1(0x3a),p1(0x0a),p1(0x49),p1(0x06),p1(0x24),p1(0x5c) ;\
-	.long	p1(0xc2),p1(0xd3),p1(0xac),p1(0x62),p1(0x91),p1(0x95),p1(0xe4),p1(0x79) ;\
-	.long	p1(0xe7),p1(0xc8),p1(0x37),p1(0x6d),p1(0x8d),p1(0xd5),p1(0x4e),p1(0xa9) ;\
-	.long	p1(0x6c),p1(0x56),p1(0xf4),p1(0xea),p1(0x65),p1(0x7a),p1(0xae),p1(0x08)
-#define sb_data6(p1) \
-	.long	p1(0xba),p1(0x78),p1(0x25),p1(0x2e),p1(0x1c),p1(0xa6),p1(0xb4),p1(0xc6) ;\
-	.long	p1(0xe8),p1(0xdd),p1(0x74),p1(0x1f),p1(0x4b),p1(0xbd),p1(0x8b),p1(0x8a) ;\
-	.long	p1(0x70),p1(0x3e),p1(0xb5),p1(0x66),p1(0x48),p1(0x03),p1(0xf6),p1(0x0e) ;\
-	.long	p1(0x61),p1(0x35),p1(0x57),p1(0xb9),p1(0x86),p1(0xc1),p1(0x1d),p1(0x9e)
-#define sb_data7(p1) \
-	.long	p1(0xe1),p1(0xf8),p1(0x98),p1(0x11),p1(0x69),p1(0xd9),p1(0x8e),p1(0x94) ;\
-	.long	p1(0x9b),p1(0x1e),p1(0x87),p1(0xe9),p1(0xce),p1(0x55),p1(0x28),p1(0xdf) ;\
-	.long	p1(0x8c),p1(0xa1),p1(0x89),p1(0x0d),p1(0xbf),p1(0xe6),p1(0x42),p1(0x68) ;\
-	.long	p1(0x41),p1(0x99),p1(0x2d),p1(0x0f),p1(0xb0),p1(0x54),p1(0xbb),p1(0x16)
-
-// Inverse S-box data - 256 entries
-
-#define ib_data0(p1) \
-	.long	p1(0x52),p1(0x09),p1(0x6a),p1(0xd5),p1(0x30),p1(0x36),p1(0xa5),p1(0x38) ;\
-	.long	p1(0xbf),p1(0x40),p1(0xa3),p1(0x9e),p1(0x81),p1(0xf3),p1(0xd7),p1(0xfb) ;\
-	.long	p1(0x7c),p1(0xe3),p1(0x39),p1(0x82),p1(0x9b),p1(0x2f),p1(0xff),p1(0x87) ;\
-	.long	p1(0x34),p1(0x8e),p1(0x43),p1(0x44),p1(0xc4),p1(0xde),p1(0xe9),p1(0xcb)
-#define ib_data1(p1) \
-	.long	p1(0x54),p1(0x7b),p1(0x94),p1(0x32),p1(0xa6),p1(0xc2),p1(0x23),p1(0x3d) ;\
-	.long	p1(0xee),p1(0x4c),p1(0x95),p1(0x0b),p1(0x42),p1(0xfa),p1(0xc3),p1(0x4e) ;\
-	.long	p1(0x08),p1(0x2e),p1(0xa1),p1(0x66),p1(0x28),p1(0xd9),p1(0x24),p1(0xb2) ;\
-	.long	p1(0x76),p1(0x5b),p1(0xa2),p1(0x49),p1(0x6d),p1(0x8b),p1(0xd1),p1(0x25)
-#define ib_data2(p1) \
-	.long	p1(0x72),p1(0xf8),p1(0xf6),p1(0x64),p1(0x86),p1(0x68),p1(0x98),p1(0x16) ;\
-	.long	p1(0xd4),p1(0xa4),p1(0x5c),p1(0xcc),p1(0x5d),p1(0x65),p1(0xb6),p1(0x92) ;\
-	.long	p1(0x6c),p1(0x70),p1(0x48),p1(0x50),p1(0xfd),p1(0xed),p1(0xb9),p1(0xda) ;\
-	.long	p1(0x5e),p1(0x15),p1(0x46),p1(0x57),p1(0xa7),p1(0x8d),p1(0x9d),p1(0x84)
-#define ib_data3(p1) \
-	.long	p1(0x90),p1(0xd8),p1(0xab),p1(0x00),p1(0x8c),p1(0xbc),p1(0xd3),p1(0x0a) ;\
-	.long	p1(0xf7),p1(0xe4),p1(0x58),p1(0x05),p1(0xb8),p1(0xb3),p1(0x45),p1(0x06) ;\
-	.long	p1(0xd0),p1(0x2c),p1(0x1e),p1(0x8f),p1(0xca),p1(0x3f),p1(0x0f),p1(0x02) ;\
-	.long	p1(0xc1),p1(0xaf),p1(0xbd),p1(0x03),p1(0x01),p1(0x13),p1(0x8a),p1(0x6b)
-#define ib_data4(p1) \
-	.long	p1(0x3a),p1(0x91),p1(0x11),p1(0x41),p1(0x4f),p1(0x67),p1(0xdc),p1(0xea) ;\
-	.long	p1(0x97),p1(0xf2),p1(0xcf),p1(0xce),p1(0xf0),p1(0xb4),p1(0xe6),p1(0x73) ;\
-	.long	p1(0x96),p1(0xac),p1(0x74),p1(0x22),p1(0xe7),p1(0xad),p1(0x35),p1(0x85) ;\
-	.long	p1(0xe2),p1(0xf9),p1(0x37),p1(0xe8),p1(0x1c),p1(0x75),p1(0xdf),p1(0x6e)
-#define ib_data5(p1) \
-	.long	p1(0x47),p1(0xf1),p1(0x1a),p1(0x71),p1(0x1d),p1(0x29),p1(0xc5),p1(0x89) ;\
-	.long	p1(0x6f),p1(0xb7),p1(0x62),p1(0x0e),p1(0xaa),p1(0x18),p1(0xbe),p1(0x1b) ;\
-	.long	p1(0xfc),p1(0x56),p1(0x3e),p1(0x4b),p1(0xc6),p1(0xd2),p1(0x79),p1(0x20) ;\
-	.long	p1(0x9a),p1(0xdb),p1(0xc0),p1(0xfe),p1(0x78),p1(0xcd),p1(0x5a),p1(0xf4)
-#define ib_data6(p1) \
-	.long	p1(0x1f),p1(0xdd),p1(0xa8),p1(0x33),p1(0x88),p1(0x07),p1(0xc7),p1(0x31) ;\
-	.long	p1(0xb1),p1(0x12),p1(0x10),p1(0x59),p1(0x27),p1(0x80),p1(0xec),p1(0x5f) ;\
-	.long	p1(0x60),p1(0x51),p1(0x7f),p1(0xa9),p1(0x19),p1(0xb5),p1(0x4a),p1(0x0d) ;\
-	.long	p1(0x2d),p1(0xe5),p1(0x7a),p1(0x9f),p1(0x93),p1(0xc9),p1(0x9c),p1(0xef)
-#define ib_data7(p1) \
-	.long	p1(0xa0),p1(0xe0),p1(0x3b),p1(0x4d),p1(0xae),p1(0x2a),p1(0xf5),p1(0xb0) ;\
-	.long	p1(0xc8),p1(0xeb),p1(0xbb),p1(0x3c),p1(0x83),p1(0x53),p1(0x99),p1(0x61) ;\
-	.long	p1(0x17),p1(0x2b),p1(0x04),p1(0x7e),p1(0xba),p1(0x77),p1(0xd6),p1(0x26) ;\
-	.long	p1(0xe1),p1(0x69),p1(0x14),p1(0x63),p1(0x55),p1(0x21),p1(0x0c),p1(0x7d)
+aes_dec_blk:	// decrypt one 16-byte block from in_blk to out_blk using the dkey schedule in ctx
+	push    %ebp                 // restored just before ret
+	mov     ctx(%esp),%ebp       // pointer to context
+	xor     %eax,%eax
+
+// CAUTION: the order of, and the stack offsets used in, the loads
+// below rely on the register mappings and the interleaved pushes
+
+1:	push    %ebx
+	mov     in_blk+4(%esp),%r2   // input block pointer; +4 compensates for the %ebx push
+	push    %esi
+	mov     nrnd(%ebp),%r3   // number of rounds
+	push    %edi
+	lea     dkey(%ebp),%r6   // key pointer
+	mov     %r3,%r0
+	shl     $4,%r0           // %r0 = 16 * nrnd (round keys are 16 bytes apart)
+	add     %r0,%r6          // %r6 = dkey + 16*nrnd; the schedule is walked downward from here
+	
+// input four columns and xor in first round key
 
-// The rcon_table (needed for the key schedule)
-//
-// Here is original Dr Brian Gladman's source code:
-//	_rcon_tab:
-//	%assign x   1
-//	%rep 29
-//	    dd  x
-//	%assign x f2(x)
-//	%endrep
-//
-// Here is precomputed output (it's more portable this way):
+	mov     (%r2),%r0
+	mov     4(%r2),%r1
+	mov     8(%r2),%r4
+	mov     12(%r2),%r5
+	xor     (%r6),%r0
+	xor     4(%r6),%r1
+	xor     8(%r6),%r4
+	xor     12(%r6),%r5
+
+	sub     $8,%esp           // space for register saves on stack
+	sub     $16,%r6           // step down to next round key
+	sub     $10,%r3          // %r3 = nrnd - 10
+	je      4f              // 10 rounds for 128-bit key
+	sub     $32,%r6         // bias key pointer: two extra rounds run at +32/+16
+	sub     $2,%r3
+	je      3f              // 12 rounds for 192-bit key
+	sub     $32,%r6         // bias again: two more extra rounds run at +64/+48
+
+2:	inv_rnd( +64(%r6), it_tab)	// 14 rounds for 256-bit key
+	inv_rnd( +48(%r6), it_tab)
+3:	inv_rnd( +32(%r6), it_tab)	// 12 rounds for 192-bit key
+	inv_rnd( +16(%r6), it_tab)
+4:	inv_rnd(    (%r6), it_tab)	// 10 rounds for 128-bit key
+	inv_rnd( -16(%r6), it_tab)
+	inv_rnd( -32(%r6), it_tab)
+	inv_rnd( -48(%r6), it_tab)
+	inv_rnd( -64(%r6), it_tab)
+	inv_rnd( -80(%r6), it_tab)
+	inv_rnd( -96(%r6), it_tab)
+	inv_rnd(-112(%r6), it_tab)
+	inv_rnd(-128(%r6), it_tab)
+	inv_rnd(-144(%r6), il_tab)	// last round uses a different table
+
+// move final values to the output array.  CAUTION: the stores are interleaved
+// with the register restores, so their order relies on the register mappings
+
+	add     $8,%esp           // release the register-save space reserved above
+	mov     out_blk+12(%esp),%r6   // output block pointer; +12 compensates for the %ebx/%esi/%edi pushes
+	mov     %r5,12(%r6)
+	pop     %edi
+	mov     %r4,8(%r6)
+	pop     %esi
+	mov     %r1,4(%r6)
+	pop     %ebx
+	mov     %r0,(%r6)
+	pop     %ebp
+	mov     $1,%eax           // leave 1 in %eax (presumably the return value; confirm against the C prototype)
+	ret
 
-	.align	ALIGN32BYTES
-aes_rcon_tab:
-	.long	0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80
-	.long	0x1b,0x36,0x6c,0xd8,0xab,0x4d,0x9a,0x2f
-	.long	0x5e,0xbc,0x63,0xc6,0x97,0x35,0x6a,0xd4
-	.long	0xb3,0x7d,0xfa,0xef,0xc5
-
-// The forward xor tables
-
-	.align	ALIGN32BYTES
-aes_ft_tab:
-	sb_data0(u0)
-	sb_data1(u0)
-	sb_data2(u0)
-	sb_data3(u0)
-	sb_data4(u0)
-	sb_data5(u0)
-	sb_data6(u0)
-	sb_data7(u0)
-
-	sb_data0(u1)
-	sb_data1(u1)
-	sb_data2(u1)
-	sb_data3(u1)
-	sb_data4(u1)
-	sb_data5(u1)
-	sb_data6(u1)
-	sb_data7(u1)
-
-	sb_data0(u2)
-	sb_data1(u2)
-	sb_data2(u2)
-	sb_data3(u2)
-	sb_data4(u2)
-	sb_data5(u2)
-	sb_data6(u2)
-	sb_data7(u2)
-
-	sb_data0(u3)
-	sb_data1(u3)
-	sb_data2(u3)
-	sb_data3(u3)
-	sb_data4(u3)
-	sb_data5(u3)
-	sb_data6(u3)
-	sb_data7(u3)
-
-	.align	ALIGN32BYTES
-aes_fl_tab:
-	sb_data0(w0)
-	sb_data1(w0)
-	sb_data2(w0)
-	sb_data3(w0)
-	sb_data4(w0)
-	sb_data5(w0)
-	sb_data6(w0)
-	sb_data7(w0)
-
-	sb_data0(w1)
-	sb_data1(w1)
-	sb_data2(w1)
-	sb_data3(w1)
-	sb_data4(w1)
-	sb_data5(w1)
-	sb_data6(w1)
-	sb_data7(w1)
-
-	sb_data0(w2)
-	sb_data1(w2)
-	sb_data2(w2)
-	sb_data3(w2)
-	sb_data4(w2)
-	sb_data5(w2)
-	sb_data6(w2)
-	sb_data7(w2)
-
-	sb_data0(w3)
-	sb_data1(w3)
-	sb_data2(w3)
-	sb_data3(w3)
-	sb_data4(w3)
-	sb_data5(w3)
-	sb_data6(w3)
-	sb_data7(w3)
-
-// The inverse xor tables
-
-	.align	ALIGN32BYTES
-aes_it_tab:
-	ib_data0(v0)
-	ib_data1(v0)
-	ib_data2(v0)
-	ib_data3(v0)
-	ib_data4(v0)
-	ib_data5(v0)
-	ib_data6(v0)
-	ib_data7(v0)
-
-	ib_data0(v1)
-	ib_data1(v1)
-	ib_data2(v1)
-	ib_data3(v1)
-	ib_data4(v1)
-	ib_data5(v1)
-	ib_data6(v1)
-	ib_data7(v1)
-
-	ib_data0(v2)
-	ib_data1(v2)
-	ib_data2(v2)
-	ib_data3(v2)
-	ib_data4(v2)
-	ib_data5(v2)
-	ib_data6(v2)
-	ib_data7(v2)
-
-	ib_data0(v3)
-	ib_data1(v3)
-	ib_data2(v3)
-	ib_data3(v3)
-	ib_data4(v3)
-	ib_data5(v3)
-	ib_data6(v3)
-	ib_data7(v3)
-
-	.align	ALIGN32BYTES
-aes_il_tab:
-	ib_data0(w0)
-	ib_data1(w0)
-	ib_data2(w0)
-	ib_data3(w0)
-	ib_data4(w0)
-	ib_data5(w0)
-	ib_data6(w0)
-	ib_data7(w0)
-
-	ib_data0(w1)
-	ib_data1(w1)
-	ib_data2(w1)
-	ib_data3(w1)
-	ib_data4(w1)
-	ib_data5(w1)
-	ib_data6(w1)
-	ib_data7(w1)
-
-	ib_data0(w2)
-	ib_data1(w2)
-	ib_data2(w2)
-	ib_data3(w2)
-	ib_data4(w2)
-	ib_data5(w2)
-	ib_data6(w2)
-	ib_data7(w2)
-
-	ib_data0(w3)
-	ib_data1(w3)
-	ib_data2(w3)
-	ib_data3(w3)
-	ib_data4(w3)
-	ib_data5(w3)
-	ib_data6(w3)
-	ib_data7(w3)
-
-// The inverse mix column tables
-
-	.align	ALIGN32BYTES
-aes_im_tab:
-	im_data0(v0)
-	im_data1(v0)
-	im_data2(v0)
-	im_data3(v0)
-	im_data4(v0)
-	im_data5(v0)
-	im_data6(v0)
-	im_data7(v0)
-
-	im_data0(v1)
-	im_data1(v1)
-	im_data2(v1)
-	im_data3(v1)
-	im_data4(v1)
-	im_data5(v1)
-	im_data6(v1)
-	im_data7(v1)
-
-	im_data0(v2)
-	im_data1(v2)
-	im_data2(v2)
-	im_data3(v2)
-	im_data4(v2)
-	im_data5(v2)
-	im_data6(v2)
-	im_data7(v2)
-
-	im_data0(v3)
-	im_data1(v3)
-	im_data2(v3)
-	im_data3(v3)
-	im_data4(v3)
-	im_data5(v3)
-	im_data6(v3)
-	im_data7(v3)