diff -u linux-2.6.8/arch/i386/crypto/aes.c linux-2.6.8.orig/arch/i386/crypto/aes.c --- linux-2.6.8/arch/i386/crypto/aes.c 2004-09-24 09:42:34.642803416 +0200 +++ linux-2.6.8.orig/arch/i386/crypto/aes.c 2004-08-14 07:36:59.000000000 +0200 @@ -2,92 +2,507 @@ * * Glue Code for optimized 586 assembler version of AES * - * Copyright (c) 2001, Dr Brian Gladman , Worcester, UK. + * Copyright (c) 2002, Dr Brian Gladman <>, Worcester, UK. + * All rights reserved. + * + * LICENSE TERMS + * + * The free distribution and use of this software in both source and binary + * form is allowed (with or without changes) provided that: + * + * 1. distributions of this source code include the above copyright + * notice, this list of conditions and the following disclaimer; + * + * 2. distributions in binary form include the above copyright + * notice, this list of conditions and the following disclaimer + * in the documentation and/or other associated materials; + * + * 3. the copyright holder's name is not used to endorse products + * built using this software without specific written permission. + * + * ALTERNATIVELY, provided that this notice is retained in full, this product + * may be distributed under the terms of the GNU General Public License (GPL), + * in which case the provisions of the GPL apply INSTEAD OF those given above. + * + * DISCLAIMER + * + * This software is provided 'as is' with no explicit or implied warranties + * in respect of its properties, including, but not limited to, correctness + * and/or fitness for purpose. + * * Copyright (c) 2003, Adam J. Richter (conversion to * 2.5 API). 
* Copyright (c) 2003, 2004 Fruhwirth Clemens -*/ - + * Copyright (c) 2004 Red Hat, Inc., James Morris + * + */ +#include #include #include #include #include #include +asmlinkage void aes_enc_blk(const u8 *src, u8 *dst, void *ctx); +asmlinkage void aes_dec_blk(const u8 *src, u8 *dst, void *ctx); + #define AES_MIN_KEY_SIZE 16 #define AES_MAX_KEY_SIZE 32 #define AES_BLOCK_SIZE 16 -#define AES_KS_LENGTH 4 * AES_BLOCK_SIZE -#define AES_RC_LENGTH (9 * AES_BLOCK_SIZE) / 8 - 8 +#define AES_KS_LENGTH 4 * AES_BLOCK_SIZE +#define RC_LENGTH 29 + +struct aes_ctx { + u32 ekey[AES_KS_LENGTH]; + u32 rounds; + u32 dkey[AES_KS_LENGTH]; +}; + +#define WPOLY 0x011b +#define u32_in(x) le32_to_cpu(*(const u32 *)(x)) +#define bytes2word(b0, b1, b2, b3) \ + (((u32)(b3) << 24) | ((u32)(b2) << 16) | ((u32)(b1) << 8) | (b0)) + +/* define the finite field multiplies required for Rijndael */ +#define f2(x) ((x) ? pow[log[x] + 0x19] : 0) +#define f3(x) ((x) ? pow[log[x] + 0x01] : 0) +#define f9(x) ((x) ? pow[log[x] + 0xc7] : 0) +#define fb(x) ((x) ? pow[log[x] + 0x68] : 0) +#define fd(x) ((x) ? pow[log[x] + 0xee] : 0) +#define fe(x) ((x) ? pow[log[x] + 0xdf] : 0) +#define fi(x) ((x) ? 
pow[255 - log[x]]: 0) -typedef struct +static inline u32 upr(u32 x, int n) { - u_int32_t aes_Nkey; // the number of words in the key input block - u_int32_t aes_Nrnd; // the number of cipher rounds - u_int32_t aes_e_key[AES_KS_LENGTH]; // the encryption key schedule - u_int32_t aes_d_key[AES_KS_LENGTH]; // the decryption key schedule - u_int32_t aes_Ncol; // the number of columns in the cipher state -} aes_context; + return (x << 8 * n) | (x >> (32 - 8 * n)); +} -/* - * The Cipher Interface - */ - -asmlinkage void aes_set_key(void *, const unsigned char [], const int, const int); +static inline u8 bval(u32 x, int n) +{ + return x >> 8 * n; +} +/* The forward and inverse affine transformations used in the S-box */ +#define fwd_affine(x) \ + (w = (u32)x, w ^= (w<<1)^(w<<2)^(w<<3)^(w<<4), 0x63^(u8)(w^(w>>8))) + +#define inv_affine(x) \ + (w = (u32)x, w = (w<<1)^(w<<3)^(w<<6), 0x05^(u8)(w^(w>>8))) + +static u32 rcon_tab[RC_LENGTH]; + +u32 ft_tab[4][256]; +u32 fl_tab[4][256]; +u32 ls_tab[4][256]; +u32 im_tab[4][256]; +u32 il_tab[4][256]; +u32 it_tab[4][256]; +void gen_tabs(void) +{ + u32 i, w; + u8 pow[512], log[256]; + + /* + * log and power tables for GF(2^8) finite field with + * WPOLY as modular polynomial - the simplest primitive + * root is 0x03, used here to generate the tables. + */ + i = 0; w = 1; + + do { + pow[i] = (u8)w; + pow[i + 255] = (u8)w; + log[w] = (u8)i++; + w ^= (w << 1) ^ (w & 0x80 ? 
WPOLY : 0); + } while (w != 1); + + for(i = 0, w = 1; i < RC_LENGTH; ++i) { + rcon_tab[i] = bytes2word(w, 0, 0, 0); + w = f2(w); + } -/* Actually: - * extern void aes_encrypt(const aes_context *, unsigned char [], const unsigned char []); - * extern void aes_decrypt(const aes_context *, unsigned char [], const unsigned char []); -*/ - -asmlinkage void aes_encrypt(void*, unsigned char [], const unsigned char []); -asmlinkage void aes_decrypt(void*, unsigned char [], const unsigned char []); + for(i = 0; i < 256; ++i) { + u8 b; + + b = fwd_affine(fi((u8)i)); + w = bytes2word(f2(b), b, b, f3(b)); + + /* tables for a normal encryption round */ + ft_tab[0][i] = w; + ft_tab[1][i] = upr(w, 1); + ft_tab[2][i] = upr(w, 2); + ft_tab[3][i] = upr(w, 3); + w = bytes2word(b, 0, 0, 0); + + /* + * tables for last encryption round + * (may also be used in the key schedule) + */ + fl_tab[0][i] = w; + fl_tab[1][i] = upr(w, 1); + fl_tab[2][i] = upr(w, 2); + fl_tab[3][i] = upr(w, 3); + + /* + * table for key schedule if fl_tab above is + * not of the required form + */ + ls_tab[0][i] = w; + ls_tab[1][i] = upr(w, 1); + ls_tab[2][i] = upr(w, 2); + ls_tab[3][i] = upr(w, 3); + + b = fi(inv_affine((u8)i)); + w = bytes2word(fe(b), f9(b), fd(b), fb(b)); + + /* tables for the inverse mix column operation */ + im_tab[0][b] = w; + im_tab[1][b] = upr(w, 1); + im_tab[2][b] = upr(w, 2); + im_tab[3][b] = upr(w, 3); + + /* tables for a normal decryption round */ + it_tab[0][i] = w; + it_tab[1][i] = upr(w,1); + it_tab[2][i] = upr(w,2); + it_tab[3][i] = upr(w,3); + + w = bytes2word(b, 0, 0, 0); + + /* tables for last decryption round */ + il_tab[0][i] = w; + il_tab[1][i] = upr(w,1); + il_tab[2][i] = upr(w,2); + il_tab[3][i] = upr(w,3); + } +} -static int aes_set_key_glue(void *cx, const u8 *key,unsigned int key_length, u32 *flags) +#define four_tables(x,tab,vf,rf,c) \ +( tab[0][bval(vf(x,0,c),rf(0,c))] ^ \ + tab[1][bval(vf(x,1,c),rf(1,c))] ^ \ + tab[2][bval(vf(x,2,c),rf(2,c))] ^ \ + 
tab[3][bval(vf(x,3,c),rf(3,c))] \ +) + +#define vf1(x,r,c) (x) +#define rf1(r,c) (r) +#define rf2(r,c) ((r-c)&3) + +#define inv_mcol(x) four_tables(x,im_tab,vf1,rf1,0) +#define ls_box(x,c) four_tables(x,fl_tab,vf1,rf2,c) + +#define ff(x) inv_mcol(x) + +#define ke4(k,i) \ +{ \ + k[4*(i)+4] = ss[0] ^= ls_box(ss[3],3) ^ rcon_tab[i]; \ + k[4*(i)+5] = ss[1] ^= ss[0]; \ + k[4*(i)+6] = ss[2] ^= ss[1]; \ + k[4*(i)+7] = ss[3] ^= ss[2]; \ +} + +#define kel4(k,i) \ +{ \ + k[4*(i)+4] = ss[0] ^= ls_box(ss[3],3) ^ rcon_tab[i]; \ + k[4*(i)+5] = ss[1] ^= ss[0]; \ + k[4*(i)+6] = ss[2] ^= ss[1]; k[4*(i)+7] = ss[3] ^= ss[2]; \ +} + +#define ke6(k,i) \ +{ \ + k[6*(i)+ 6] = ss[0] ^= ls_box(ss[5],3) ^ rcon_tab[i]; \ + k[6*(i)+ 7] = ss[1] ^= ss[0]; \ + k[6*(i)+ 8] = ss[2] ^= ss[1]; \ + k[6*(i)+ 9] = ss[3] ^= ss[2]; \ + k[6*(i)+10] = ss[4] ^= ss[3]; \ + k[6*(i)+11] = ss[5] ^= ss[4]; \ +} + +#define kel6(k,i) \ +{ \ + k[6*(i)+ 6] = ss[0] ^= ls_box(ss[5],3) ^ rcon_tab[i]; \ + k[6*(i)+ 7] = ss[1] ^= ss[0]; \ + k[6*(i)+ 8] = ss[2] ^= ss[1]; \ + k[6*(i)+ 9] = ss[3] ^= ss[2]; \ +} + +#define ke8(k,i) \ +{ \ + k[8*(i)+ 8] = ss[0] ^= ls_box(ss[7],3) ^ rcon_tab[i]; \ + k[8*(i)+ 9] = ss[1] ^= ss[0]; \ + k[8*(i)+10] = ss[2] ^= ss[1]; \ + k[8*(i)+11] = ss[3] ^= ss[2]; \ + k[8*(i)+12] = ss[4] ^= ls_box(ss[3],0); \ + k[8*(i)+13] = ss[5] ^= ss[4]; \ + k[8*(i)+14] = ss[6] ^= ss[5]; \ + k[8*(i)+15] = ss[7] ^= ss[6]; \ +} + +#define kel8(k,i) \ +{ \ + k[8*(i)+ 8] = ss[0] ^= ls_box(ss[7],3) ^ rcon_tab[i]; \ + k[8*(i)+ 9] = ss[1] ^= ss[0]; \ + k[8*(i)+10] = ss[2] ^= ss[1]; \ + k[8*(i)+11] = ss[3] ^= ss[2]; \ +} + +#define kdf4(k,i) \ +{ \ + ss[0] = ss[0] ^ ss[2] ^ ss[1] ^ ss[3]; \ + ss[1] = ss[1] ^ ss[3]; \ + ss[2] = ss[2] ^ ss[3]; \ + ss[3] = ss[3]; \ + ss[4] = ls_box(ss[(i+3) % 4], 3) ^ rcon_tab[i]; \ + ss[i % 4] ^= ss[4]; \ + ss[4] ^= k[4*(i)]; \ + k[4*(i)+4] = ff(ss[4]); \ + ss[4] ^= k[4*(i)+1]; \ + k[4*(i)+5] = ff(ss[4]); \ + ss[4] ^= k[4*(i)+2]; \ + k[4*(i)+6] = ff(ss[4]); \ + ss[4] ^= k[4*(i)+3]; \ + 
k[4*(i)+7] = ff(ss[4]); \ +} + +#define kd4(k,i) \ +{ \ + ss[4] = ls_box(ss[(i+3) % 4], 3) ^ rcon_tab[i]; \ + ss[i % 4] ^= ss[4]; \ + ss[4] = ff(ss[4]); \ + k[4*(i)+4] = ss[4] ^= k[4*(i)]; \ + k[4*(i)+5] = ss[4] ^= k[4*(i)+1]; \ + k[4*(i)+6] = ss[4] ^= k[4*(i)+2]; \ + k[4*(i)+7] = ss[4] ^= k[4*(i)+3]; \ +} + +#define kdl4(k,i) \ +{ \ + ss[4] = ls_box(ss[(i+3) % 4], 3) ^ rcon_tab[i]; \ + ss[i % 4] ^= ss[4]; \ + k[4*(i)+4] = (ss[0] ^= ss[1]) ^ ss[2] ^ ss[3]; \ + k[4*(i)+5] = ss[1] ^ ss[3]; \ + k[4*(i)+6] = ss[0]; \ + k[4*(i)+7] = ss[1]; \ +} + +#define kdf6(k,i) \ +{ \ + ss[0] ^= ls_box(ss[5],3) ^ rcon_tab[i]; \ + k[6*(i)+ 6] = ff(ss[0]); \ + ss[1] ^= ss[0]; \ + k[6*(i)+ 7] = ff(ss[1]); \ + ss[2] ^= ss[1]; \ + k[6*(i)+ 8] = ff(ss[2]); \ + ss[3] ^= ss[2]; \ + k[6*(i)+ 9] = ff(ss[3]); \ + ss[4] ^= ss[3]; \ + k[6*(i)+10] = ff(ss[4]); \ + ss[5] ^= ss[4]; \ + k[6*(i)+11] = ff(ss[5]); \ +} + +#define kd6(k,i) \ +{ \ + ss[6] = ls_box(ss[5],3) ^ rcon_tab[i]; \ + ss[0] ^= ss[6]; ss[6] = ff(ss[6]); \ + k[6*(i)+ 6] = ss[6] ^= k[6*(i)]; \ + ss[1] ^= ss[0]; \ + k[6*(i)+ 7] = ss[6] ^= k[6*(i)+ 1]; \ + ss[2] ^= ss[1]; \ + k[6*(i)+ 8] = ss[6] ^= k[6*(i)+ 2]; \ + ss[3] ^= ss[2]; \ + k[6*(i)+ 9] = ss[6] ^= k[6*(i)+ 3]; \ + ss[4] ^= ss[3]; \ + k[6*(i)+10] = ss[6] ^= k[6*(i)+ 4]; \ + ss[5] ^= ss[4]; \ + k[6*(i)+11] = ss[6] ^= k[6*(i)+ 5]; \ +} + +#define kdl6(k,i) \ +{ \ + ss[0] ^= ls_box(ss[5],3) ^ rcon_tab[i]; \ + k[6*(i)+ 6] = ss[0]; \ + ss[1] ^= ss[0]; \ + k[6*(i)+ 7] = ss[1]; \ + ss[2] ^= ss[1]; \ + k[6*(i)+ 8] = ss[2]; \ + ss[3] ^= ss[2]; \ + k[6*(i)+ 9] = ss[3]; \ +} + +#define kdf8(k,i) \ +{ \ + ss[0] ^= ls_box(ss[7],3) ^ rcon_tab[i]; \ + k[8*(i)+ 8] = ff(ss[0]); \ + ss[1] ^= ss[0]; \ + k[8*(i)+ 9] = ff(ss[1]); \ + ss[2] ^= ss[1]; \ + k[8*(i)+10] = ff(ss[2]); \ + ss[3] ^= ss[2]; \ + k[8*(i)+11] = ff(ss[3]); \ + ss[4] ^= ls_box(ss[3],0); \ + k[8*(i)+12] = ff(ss[4]); \ + ss[5] ^= ss[4]; \ + k[8*(i)+13] = ff(ss[5]); \ + ss[6] ^= ss[5]; \ + k[8*(i)+14] = ff(ss[6]); \ + ss[7] ^= 
ss[6]; \ + k[8*(i)+15] = ff(ss[7]); \ +} + +#define kd8(k,i) \ +{ \ + u32 __g = ls_box(ss[7],3) ^ rcon_tab[i]; \ + ss[0] ^= __g; \ + __g = ff(__g); \ + k[8*(i)+ 8] = __g ^= k[8*(i)]; \ + ss[1] ^= ss[0]; \ + k[8*(i)+ 9] = __g ^= k[8*(i)+ 1]; \ + ss[2] ^= ss[1]; \ + k[8*(i)+10] = __g ^= k[8*(i)+ 2]; \ + ss[3] ^= ss[2]; \ + k[8*(i)+11] = __g ^= k[8*(i)+ 3]; \ + __g = ls_box(ss[3],0); \ + ss[4] ^= __g; \ + __g = ff(__g); \ + k[8*(i)+12] = __g ^= k[8*(i)+ 4]; \ + ss[5] ^= ss[4]; \ + k[8*(i)+13] = __g ^= k[8*(i)+ 5]; \ + ss[6] ^= ss[5]; \ + k[8*(i)+14] = __g ^= k[8*(i)+ 6]; \ + ss[7] ^= ss[6]; \ + k[8*(i)+15] = __g ^= k[8*(i)+ 7]; \ +} + +#define kdl8(k,i) \ +{ \ + ss[0] ^= ls_box(ss[7],3) ^ rcon_tab[i]; \ + k[8*(i)+ 8] = ss[0]; \ + ss[1] ^= ss[0]; \ + k[8*(i)+ 9] = ss[1]; \ + ss[2] ^= ss[1]; \ + k[8*(i)+10] = ss[2]; \ + ss[3] ^= ss[2]; \ + k[8*(i)+11] = ss[3]; \ +} + +static int +aes_set_key(void *ctx_arg, const u8 *in_key, unsigned int key_len, u32 *flags) { - if(key_length != 16 && key_length != 24 && key_length != 32) - { - *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + int i; + u32 ss[8]; + struct aes_ctx *ctx = ctx_arg; + + /* encryption schedule */ + + ctx->ekey[0] = ss[0] = u32_in(in_key); + ctx->ekey[1] = ss[1] = u32_in(in_key + 4); + ctx->ekey[2] = ss[2] = u32_in(in_key + 8); + ctx->ekey[3] = ss[3] = u32_in(in_key + 12); + + switch(key_len) { + case 16: + for (i = 0; i < 9; i++) + ke4(ctx->ekey, i); + kel4(ctx->ekey, 9); + ctx->rounds = 10; + break; + + case 24: + ctx->ekey[4] = ss[4] = u32_in(in_key + 16); + ctx->ekey[5] = ss[5] = u32_in(in_key + 20); + for (i = 0; i < 7; i++) + ke6(ctx->ekey, i); + kel6(ctx->ekey, 7); + ctx->rounds = 12; + break; + + case 32: + ctx->ekey[4] = ss[4] = u32_in(in_key + 16); + ctx->ekey[5] = ss[5] = u32_in(in_key + 20); + ctx->ekey[6] = ss[6] = u32_in(in_key + 24); + ctx->ekey[7] = ss[7] = u32_in(in_key + 28); + for (i = 0; i < 6; i++) + ke8(ctx->ekey, i); + kel8(ctx->ekey, 6); + ctx->rounds = 14; + break; + + default: + *flags |= 
CRYPTO_TFM_RES_BAD_KEY_LEN; return -EINVAL; } - aes_set_key(cx, key,key_length,0); + + /* decryption schedule */ + + ctx->dkey[0] = ss[0] = u32_in(in_key); + ctx->dkey[1] = ss[1] = u32_in(in_key + 4); + ctx->dkey[2] = ss[2] = u32_in(in_key + 8); + ctx->dkey[3] = ss[3] = u32_in(in_key + 12); + + switch (key_len) { + case 16: + kdf4(ctx->dkey, 0); + for (i = 1; i < 9; i++) + kd4(ctx->dkey, i); + kdl4(ctx->dkey, 9); + break; + + case 24: + ctx->dkey[4] = ff(ss[4] = u32_in(in_key + 16)); + ctx->dkey[5] = ff(ss[5] = u32_in(in_key + 20)); + kdf6(ctx->dkey, 0); + for (i = 1; i < 7; i++) + kd6(ctx->dkey, i); + kdl6(ctx->dkey, 7); + break; + + case 32: + ctx->dkey[4] = ff(ss[4] = u32_in(in_key + 16)); + ctx->dkey[5] = ff(ss[5] = u32_in(in_key + 20)); + ctx->dkey[6] = ff(ss[6] = u32_in(in_key + 24)); + ctx->dkey[7] = ff(ss[7] = u32_in(in_key + 28)); + kdf8(ctx->dkey, 0); + for (i = 1; i < 6; i++) + kd8(ctx->dkey, i); + kdl8(ctx->dkey, 6); + break; + } return 0; } -#ifdef CONFIG_REGPARM -static void aes_encrypt_glue(void* a, unsigned char b[], const unsigned char c[]) { - aes_encrypt(a,b,c); -} -static void aes_decrypt_glue(void* a, unsigned char b[], const unsigned char c[]) { - aes_decrypt(a,b,c); -} -#else -#define aes_encrypt_glue aes_encrypt -#define aes_decrypt_glue aes_decrypt -#endif /* CONFIG_REGPARM */ +static inline void aes_encrypt(void *ctx, u8 *dst, const u8 *src) +{ + aes_enc_blk(src, dst, ctx); +} +static inline void aes_decrypt(void *ctx, u8 *dst, const u8 *src) +{ + aes_dec_blk(src, dst, ctx); +} + static struct crypto_alg aes_alg = { .cra_name = "aes", .cra_flags = CRYPTO_ALG_TYPE_CIPHER, .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(aes_context), + .cra_ctxsize = sizeof(struct aes_ctx), .cra_module = THIS_MODULE, .cra_list = LIST_HEAD_INIT(aes_alg.cra_list), .cra_u = { .cipher = { .cia_min_keysize = AES_MIN_KEY_SIZE, .cia_max_keysize = AES_MAX_KEY_SIZE, - .cia_setkey = aes_set_key_glue, - .cia_encrypt = aes_encrypt_glue, - .cia_decrypt = 
aes_decrypt_glue + .cia_setkey = aes_set_key, + .cia_encrypt = aes_encrypt, + .cia_decrypt = aes_decrypt } } }; static int __init aes_init(void) { + gen_tabs(); return crypto_register_alg(&aes_alg); } @@ -101,5 +516,5 @@ MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm, i586 asm optimized"); MODULE_LICENSE("Dual BSD/GPL"); -MODULE_AUTHOR("Fruhwirth Clemens"); +MODULE_AUTHOR("Fruhwirth Clemens, James Morris, Brian Gladman, Adam Richter"); MODULE_ALIAS("aes"); Only in linux-2.6.8/arch/i386/crypto/: aes-i586-asm.o Only in linux-2.6.8/arch/i386/crypto/: .aes-i586-asm.o.cmd diff -u linux-2.6.8/arch/i386/crypto/aes-i586-asm.S linux-2.6.8.orig/arch/i386/crypto/aes-i586-asm.S --- linux-2.6.8/arch/i386/crypto/aes-i586-asm.S 2004-09-24 09:42:34.641803568 +0200 +++ linux-2.6.8.orig/arch/i386/crypto/aes-i586-asm.S 2004-08-14 07:37:15.000000000 +0200 @@ -1,918 +1,341 @@ -// -// Copyright (c) 2001, Dr Brian Gladman , Worcester, UK. +// ------------------------------------------------------------------------- +// Copyright (c) 2001, Dr Brian Gladman < >, Worcester, UK. // All rights reserved. // -// TERMS +// LICENSE TERMS // -// Redistribution and use in source and binary forms, with or without -// modification, are permitted subject to the following conditions: +// The free distribution and use of this software in both source and binary +// form is allowed (with or without changes) provided that: // -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. +// 1. distributions of this source code include the above copyright +// notice, this list of conditions and the following disclaimer// // -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. +// 2. 
distributions in binary form include the above copyright +// notice, this list of conditions and the following disclaimer +// in the documentation and/or other associated materials// // -// 3. The copyright holder's name must not be used to endorse or promote -// any products derived from this software without his specific prior -// written permission. +// 3. the copyright holder's name is not used to endorse products +// built using this software without specific written permission. // -// This software is provided 'as is' with no express or implied warranties -// of correctness or fitness for purpose. - -// Modified by Jari Ruusu, December 24 2001 -// - Converted syntax to GNU CPP/assembler syntax -// - C programming interface converted back to "old" API -// - Minor portability cleanups and speed optimizations - -// Modified by Jari Ruusu, April 11 2002 -// - Added above copyright and terms to resulting object code so that -// binary distributions can avoid legal trouble - -// Modified by Clemens Fruhwirth, Feb 04 2003 -// - Switched in/out to fit CryptoAPI calls. - -// An AES (Rijndael) implementation for the Pentium. This version only -// implements the standard AES block length (128 bits, 16 bytes). This code -// does not preserve the eax, ecx or edx registers or the artihmetic status -// flags. However, the ebx, esi, edi, and ebp registers are preserved across -// calls. 
- -// void aes_set_key(aes_context *cx, const unsigned char key[], const int key_len, const int f) -// void aes_encrypt(const aes_context *cx, unsigned char out_blk[], const unsigned char in_blk[]) -// void aes_decrypt(const aes_context *cx, unsigned char out_blk[], const unsigned char in_blk[]) - -# define ALIGN32BYTES 32 - - .file "aes-i586.S" - .globl aes_set_key - .globl aes_encrypt - .globl aes_decrypt - - .text -copyright: - .ascii " \000" - .ascii "Copyright (c) 2001, Dr Brian Gladman , Worcester, UK.\000" - .ascii "All rights reserved.\000" - .ascii " \000" - .ascii "TERMS\000" - .ascii " \000" - .ascii " Redistribution and use in source and binary forms, with or without\000" - .ascii " modification, are permitted subject to the following conditions:\000" - .ascii " \000" - .ascii " 1. Redistributions of source code must retain the above copyright\000" - .ascii " notice, this list of conditions and the following disclaimer.\000" - .ascii " \000" - .ascii " 2. Redistributions in binary form must reproduce the above copyright\000" - .ascii " notice, this list of conditions and the following disclaimer in the\000" - .ascii " documentation and/or other materials provided with the distribution.\000" - .ascii " \000" - .ascii " 3. The copyright holder's name must not be used to endorse or promote\000" - .ascii " any products derived from this software without his specific prior\000" - .ascii " written permission.\000" - .ascii " \000" - .ascii " This software is provided 'as is' with no express or implied warranties\000" - .ascii " of correctness or fitness for purpose.\000" - .ascii " \000" +// +// ALTERNATIVELY, provided that this notice is retained in full, this product +// may be distributed under the terms of the GNU General Public License (GPL), +// in which case the provisions of the GPL apply INSTEAD OF those given above. 
+// +// Copyright (c) 2004 Linus Torvalds +// Copyright (c) 2004 Red Hat, Inc., James Morris -#define tlen 1024 // length of each of 4 'xor' arrays (256 32-bit words) +// DISCLAIMER +// +// This software is provided 'as is' with no explicit or implied warranties +// in respect of its properties including, but not limited to, correctness +// and fitness for purpose. +// ------------------------------------------------------------------------- +// Issue Date: 29/07/2002 + +.file "aes-i586-asm.S" +.text + +// aes_rval aes_enc_blk(const unsigned char in_blk[], unsigned char out_blk[], const aes_ctx cx[1])// +// aes_rval aes_dec_blk(const unsigned char in_blk[], unsigned char out_blk[], const aes_ctx cx[1])// + +#define tlen 1024 // length of each of 4 'xor' arrays (256 32-bit words) // offsets to parameters with one register pushed onto stack -#define ctx 8 // AES context structure -#define out_blk 12 // output byte array address parameter -#define in_blk 16 // input byte array address parameter +#define in_blk 8 // input byte array address parameter +#define out_blk 12 // output byte array address parameter +#define ctx 16 // AES context structure // offsets in context structure -#define nkey 0 // key length, size 4 -#define nrnd 4 // number of rounds, size 4 -#define ekey 8 // encryption key schedule base address, size 256 -#define dkey 264 // decryption key schedule base address, size 256 +#define ekey 0 // encryption key schedule base address +#define nrnd 256 // number of rounds +#define dkey 260 // decryption key schedule base address + +// register mapping for encrypt and decrypt subroutines + +#define r0 eax +#define r1 ebx +#define r2 ecx +#define r3 edx +#define r4 esi +#define r5 edi +#define r6 ebp + +#define eaxl al +#define eaxh ah +#define ebxl bl +#define ebxh bh +#define ecxl cl +#define ecxh ch +#define edxl dl +#define edxh dh + +#define _h(reg) reg##h +#define h(reg) _h(reg) + +#define _l(reg) reg##l +#define l(reg) _l(reg) + +// This macro takes a 
32-bit word representing a column and uses +// each of its four bytes to index into four tables of 256 32-bit +// words to obtain values that are then xored into the appropriate +// output registers r0, r1, r4 or r5. + +// Parameters: +// %1 out_state[0] +// %2 out_state[1] +// %3 out_state[2] +// %4 out_state[3] +// %5 table base address +// %6 input register for the round (destroyed) +// %7 scratch register for the round + +#define do_col(a1, a2, a3, a4, a5, a6, a7) \ + movzx %l(a6),%a7; \ + xor a5(,%a7,4),%a1; \ + movzx %h(a6),%a7; \ + shr $16,%a6; \ + xor a5+tlen(,%a7,4),%a2; \ + movzx %l(a6),%a7; \ + movzx %h(a6),%a6; \ + xor a5+2*tlen(,%a7,4),%a3; \ + xor a5+3*tlen(,%a6,4),%a4; + +// initialise output registers from the key schedule + +#define do_fcol(a1, a2, a3, a4, a5, a6, a7, a8) \ + mov 0 a8,%a1; \ + movzx %l(a6),%a7; \ + mov 12 a8,%a2; \ + xor a5(,%a7,4),%a1; \ + mov 4 a8,%a4; \ + movzx %h(a6),%a7; \ + shr $16,%a6; \ + xor a5+tlen(,%a7,4),%a2; \ + movzx %l(a6),%a7; \ + movzx %h(a6),%a6; \ + xor a5+3*tlen(,%a6,4),%a4; \ + mov %a3,%a6; \ + mov 8 a8,%a3; \ + xor a5+2*tlen(,%a7,4),%a3; + +// initialise output registers from the key schedule + +#define do_icol(a1, a2, a3, a4, a5, a6, a7, a8) \ + mov 0 a8,%a1; \ + movzx %l(a6),%a7; \ + mov 4 a8,%a2; \ + xor a5(,%a7,4),%a1; \ + mov 12 a8,%a4; \ + movzx %h(a6),%a7; \ + shr $16,%a6; \ + xor a5+tlen(,%a7,4),%a2; \ + movzx %l(a6),%a7; \ + movzx %h(a6),%a6; \ + xor a5+3*tlen(,%a6,4),%a4; \ + mov %a3,%a6; \ + mov 8 a8,%a3; \ + xor a5+2*tlen(,%a7,4),%a3; + + +// original Gladman had conditional saves to MMX regs. +#define save(a1, a2) \ + mov %a2,4*a1(%esp) -// This macro performs a forward encryption cycle. It is entered with -// the first previous round column values in %eax, %ebx, %esi and %edi and -// exits with the final values in the same registers. 
+#define restore(a1, a2) \ + mov 4*a2(%esp),%a1 -#define fwd_rnd(p1,p2) \ - mov %ebx,(%esp) ;\ - movzbl %al,%edx ;\ - mov %eax,%ecx ;\ - mov p2(%ebp),%eax ;\ - mov %edi,4(%esp) ;\ - mov p2+12(%ebp),%edi ;\ - xor p1(,%edx,4),%eax ;\ - movzbl %ch,%edx ;\ - shr $16,%ecx ;\ - mov p2+4(%ebp),%ebx ;\ - xor p1+tlen(,%edx,4),%edi ;\ - movzbl %cl,%edx ;\ - movzbl %ch,%ecx ;\ - xor p1+3*tlen(,%ecx,4),%ebx ;\ - mov %esi,%ecx ;\ - mov p1+2*tlen(,%edx,4),%esi ;\ - movzbl %cl,%edx ;\ - xor p1(,%edx,4),%esi ;\ - movzbl %ch,%edx ;\ - shr $16,%ecx ;\ - xor p1+tlen(,%edx,4),%ebx ;\ - movzbl %cl,%edx ;\ - movzbl %ch,%ecx ;\ - xor p1+2*tlen(,%edx,4),%eax ;\ - mov (%esp),%edx ;\ - xor p1+3*tlen(,%ecx,4),%edi ;\ - movzbl %dl,%ecx ;\ - xor p2+8(%ebp),%esi ;\ - xor p1(,%ecx,4),%ebx ;\ - movzbl %dh,%ecx ;\ - shr $16,%edx ;\ - xor p1+tlen(,%ecx,4),%eax ;\ - movzbl %dl,%ecx ;\ - movzbl %dh,%edx ;\ - xor p1+2*tlen(,%ecx,4),%edi ;\ - mov 4(%esp),%ecx ;\ - xor p1+3*tlen(,%edx,4),%esi ;\ - movzbl %cl,%edx ;\ - xor p1(,%edx,4),%edi ;\ - movzbl %ch,%edx ;\ - shr $16,%ecx ;\ - xor p1+tlen(,%edx,4),%esi ;\ - movzbl %cl,%edx ;\ - movzbl %ch,%ecx ;\ - xor p1+2*tlen(,%edx,4),%ebx ;\ - xor p1+3*tlen(,%ecx,4),%eax +// This macro performs a forward encryption cycle. It is entered with +// the first previous round column values in r0, r1, r4 and r5 and +// exits with the final values in the same registers, using two +// 32-bit stack slots at (%esp) and 4(%esp) for temporary storage + +// copy r0 into r2, then save r1 and r5 in stack slots 0 and 1 +#define fwd_rnd(arg, table) \ + /* copy r0 into r2; save r1 and r5 in stack slots 0 and 1 */ \ + mov %r0,%r2; \ + save (0,r1); \ + save (1,r5); \ + \ + /* compute new column values */ \ + do_fcol(r0,r5,r4,r1,table, r2,r3, arg); \ + do_col (r4,r1,r0,r5,table, r2,r3); \ + restore(r2,0); \ + do_col (r1,r0,r5,r4,table, r2,r3); \ + restore(r2,1); \ + do_col (r5,r4,r1,r0,table, r2,r3); // This macro performs an inverse encryption cycle. 
It is entered with -// the first previous round column values in %eax, %ebx, %esi and %edi and -// exits with the final values in the same registers. - -#define inv_rnd(p1,p2) \ - movzbl %al,%edx ;\ - mov %ebx,(%esp) ;\ - mov %eax,%ecx ;\ - mov p2(%ebp),%eax ;\ - mov %edi,4(%esp) ;\ - mov p2+4(%ebp),%ebx ;\ - xor p1(,%edx,4),%eax ;\ - movzbl %ch,%edx ;\ - shr $16,%ecx ;\ - mov p2+12(%ebp),%edi ;\ - xor p1+tlen(,%edx,4),%ebx ;\ - movzbl %cl,%edx ;\ - movzbl %ch,%ecx ;\ - xor p1+3*tlen(,%ecx,4),%edi ;\ - mov %esi,%ecx ;\ - mov p1+2*tlen(,%edx,4),%esi ;\ - movzbl %cl,%edx ;\ - xor p1(,%edx,4),%esi ;\ - movzbl %ch,%edx ;\ - shr $16,%ecx ;\ - xor p1+tlen(,%edx,4),%edi ;\ - movzbl %cl,%edx ;\ - movzbl %ch,%ecx ;\ - xor p1+2*tlen(,%edx,4),%eax ;\ - mov (%esp),%edx ;\ - xor p1+3*tlen(,%ecx,4),%ebx ;\ - movzbl %dl,%ecx ;\ - xor p2+8(%ebp),%esi ;\ - xor p1(,%ecx,4),%ebx ;\ - movzbl %dh,%ecx ;\ - shr $16,%edx ;\ - xor p1+tlen(,%ecx,4),%esi ;\ - movzbl %dl,%ecx ;\ - movzbl %dh,%edx ;\ - xor p1+2*tlen(,%ecx,4),%edi ;\ - mov 4(%esp),%ecx ;\ - xor p1+3*tlen(,%edx,4),%eax ;\ - movzbl %cl,%edx ;\ - xor p1(,%edx,4),%edi ;\ - movzbl %ch,%edx ;\ - shr $16,%ecx ;\ - xor p1+tlen(,%edx,4),%eax ;\ - movzbl %cl,%edx ;\ - movzbl %ch,%ecx ;\ - xor p1+2*tlen(,%edx,4),%ebx ;\ - xor p1+3*tlen(,%ecx,4),%esi +// the first previous round column values in r0, r1, r4 and r5 and +// exits with the final values in the same registers, using the MMX +// registers mm0-mm1 or the stack for temporary storage + +#define inv_rnd(arg, table) \ + /* mov current column values into the MMX registers */ \ + mov %r0,%r2; \ + save (0,r1); \ + save (1,r5); \ + \ + /* compute new column values */ \ + do_icol(r0,r1,r4,r5, table, r2,r3, arg); \ + do_col (r4,r5,r0,r1, table, r2,r3); \ + restore(r2,0); \ + do_col (r1,r4,r5,r0, table, r2,r3); \ + restore(r2,1); \ + do_col (r5,r0,r1,r4, table, r2,r3); // AES (Rijndael) Encryption Subroutine - .text - .align ALIGN32BYTES -aes_encrypt: - push %ebp - mov ctx(%esp),%ebp // 
pointer to context - mov in_blk(%esp),%ecx - push %ebx - push %esi - push %edi - mov nrnd(%ebp),%edx // number of rounds - lea ekey+16(%ebp),%ebp // key pointer - -// input four columns and xor in first round key - - mov (%ecx),%eax - mov 4(%ecx),%ebx - mov 8(%ecx),%esi - mov 12(%ecx),%edi - xor -16(%ebp),%eax - xor -12(%ebp),%ebx - xor -8(%ebp),%esi - xor -4(%ebp),%edi - - sub $8,%esp // space for register saves on stack - - sub $10,%edx - je aes_15 - add $32,%ebp - sub $2,%edx - je aes_13 - add $32,%ebp - - fwd_rnd(aes_ft_tab,-64) // 14 rounds for 256-bit key - fwd_rnd(aes_ft_tab,-48) -aes_13: fwd_rnd(aes_ft_tab,-32) // 12 rounds for 192-bit key - fwd_rnd(aes_ft_tab,-16) -aes_15: fwd_rnd(aes_ft_tab,0) // 10 rounds for 128-bit key - fwd_rnd(aes_ft_tab,16) - fwd_rnd(aes_ft_tab,32) - fwd_rnd(aes_ft_tab,48) - fwd_rnd(aes_ft_tab,64) - fwd_rnd(aes_ft_tab,80) - fwd_rnd(aes_ft_tab,96) - fwd_rnd(aes_ft_tab,112) - fwd_rnd(aes_ft_tab,128) - fwd_rnd(aes_fl_tab,144) // last round uses a different table - -// move final values to the output array. 
- - mov out_blk+20(%esp),%ebp - add $8,%esp - mov %eax,(%ebp) - mov %ebx,4(%ebp) - mov %esi,8(%ebp) - mov %edi,12(%ebp) - pop %edi - pop %esi - pop %ebx - pop %ebp - ret +.global aes_enc_blk +.extern ft_tab +.extern fl_tab -// AES (Rijndael) Decryption Subroutine +.align 4 - .align ALIGN32BYTES -aes_decrypt: - push %ebp - mov ctx(%esp),%ebp // pointer to context - mov in_blk(%esp),%ecx - push %ebx - push %esi - push %edi - mov nrnd(%ebp),%edx // number of rounds - lea dkey+16(%ebp),%ebp // key pointer +aes_enc_blk: + push %ebp + mov ctx(%esp),%ebp // pointer to context + xor %eax,%eax + +// CAUTION: the order and the values used in these assigns +// rely on the register mappings + +1: push %ebx + mov in_blk+4(%esp),%r2 + push %esi + mov nrnd(%ebp),%r3 // number of rounds + push %edi + lea ekey(%ebp),%r6 // key pointer // input four columns and xor in first round key - mov (%ecx),%eax - mov 4(%ecx),%ebx - mov 8(%ecx),%esi - mov 12(%ecx),%edi - xor -16(%ebp),%eax - xor -12(%ebp),%ebx - xor -8(%ebp),%esi - xor -4(%ebp),%edi - - sub $8,%esp // space for register saves on stack - - sub $10,%edx - je aes_25 - add $32,%ebp - sub $2,%edx - je aes_23 - add $32,%ebp - - inv_rnd(aes_it_tab,-64) // 14 rounds for 256-bit key - inv_rnd(aes_it_tab,-48) -aes_23: inv_rnd(aes_it_tab,-32) // 12 rounds for 192-bit key - inv_rnd(aes_it_tab,-16) -aes_25: inv_rnd(aes_it_tab,0) // 10 rounds for 128-bit key - inv_rnd(aes_it_tab,16) - inv_rnd(aes_it_tab,32) - inv_rnd(aes_it_tab,48) - inv_rnd(aes_it_tab,64) - inv_rnd(aes_it_tab,80) - inv_rnd(aes_it_tab,96) - inv_rnd(aes_it_tab,112) - inv_rnd(aes_it_tab,128) - inv_rnd(aes_il_tab,144) // last round uses a different table - -// move final values to the output array. 
- - mov out_blk+20(%esp),%ebp - add $8,%esp - mov %eax,(%ebp) - mov %ebx,4(%ebp) - mov %esi,8(%ebp) - mov %edi,12(%ebp) - pop %edi - pop %esi - pop %ebx - pop %ebp + mov (%r2),%r0 + mov 4(%r2),%r1 + mov 8(%r2),%r4 + mov 12(%r2),%r5 + xor (%r6),%r0 + xor 4(%r6),%r1 + xor 8(%r6),%r4 + xor 12(%r6),%r5 + + sub $8,%esp // space for register saves on stack + add $16,%r6 // increment to next round key + sub $10,%r3 + je 4f // 10 rounds for 128-bit key + add $32,%r6 + sub $2,%r3 + je 3f // 12 rounds for 192-bit key + add $32,%r6 + +2: fwd_rnd( -64(%r6) ,ft_tab) // 14 rounds for 256-bit key + fwd_rnd( -48(%r6) ,ft_tab) +3: fwd_rnd( -32(%r6) ,ft_tab) // 12 rounds for 192-bit key + fwd_rnd( -16(%r6) ,ft_tab) +4: fwd_rnd( (%r6) ,ft_tab) // 10 rounds for 128-bit key + fwd_rnd( +16(%r6) ,ft_tab) + fwd_rnd( +32(%r6) ,ft_tab) + fwd_rnd( +48(%r6) ,ft_tab) + fwd_rnd( +64(%r6) ,ft_tab) + fwd_rnd( +80(%r6) ,ft_tab) + fwd_rnd( +96(%r6) ,ft_tab) + fwd_rnd(+112(%r6) ,ft_tab) + fwd_rnd(+128(%r6) ,ft_tab) + fwd_rnd(+144(%r6) ,fl_tab) // last round uses a different table + +// move final values to the output array. CAUTION: the +// order of these assigns relies on the register mappings + + add $8,%esp + mov out_blk+12(%esp),%r6 + mov %r5,12(%r6) + pop %edi + mov %r4,8(%r6) + pop %esi + mov %r1,4(%r6) + pop %ebx + mov %r0,(%r6) + pop %ebp + mov $1,%eax ret -// AES (Rijndael) Key Schedule Subroutine - -// input/output parameters +// AES (Rijndael) Decryption Subroutine -#define aes_cx 12 // AES context -#define in_key 16 // key input array address -#define key_ln 20 // key length, bytes (16,24,32) or bits (128,192,256) -#define ed_flg 24 // 0=create both encr/decr keys, 1=create encr key only - -// offsets for locals - -#define cnt -4 -#define kpf -8 -#define slen 8 - -// This macro performs a column mixing operation on an input 32-bit -// word to give a 32-bit result. 
It uses each of the 4 bytes in the -// the input column to index 4 different tables of 256 32-bit words -// that are xored together to form the output value. - -#define mix_col(p1) \ - movzbl %bl,%ecx ;\ - mov p1(,%ecx,4),%eax ;\ - movzbl %bh,%ecx ;\ - ror $16,%ebx ;\ - xor p1+tlen(,%ecx,4),%eax ;\ - movzbl %bl,%ecx ;\ - xor p1+2*tlen(,%ecx,4),%eax ;\ - movzbl %bh,%ecx ;\ - xor p1+3*tlen(,%ecx,4),%eax - -// Key Schedule Macros - -#define ksc4(p1) \ - rol $24,%ebx ;\ - mix_col(aes_fl_tab) ;\ - ror $8,%ebx ;\ - xor 4*p1+aes_rcon_tab,%eax ;\ - xor %eax,%esi ;\ - xor %esi,%ebp ;\ - mov %esi,16*p1(%edi) ;\ - mov %ebp,16*p1+4(%edi) ;\ - xor %ebp,%edx ;\ - xor %edx,%ebx ;\ - mov %edx,16*p1+8(%edi) ;\ - mov %ebx,16*p1+12(%edi) - -#define ksc6(p1) \ - rol $24,%ebx ;\ - mix_col(aes_fl_tab) ;\ - ror $8,%ebx ;\ - xor 4*p1+aes_rcon_tab,%eax ;\ - xor 24*p1-24(%edi),%eax ;\ - mov %eax,24*p1(%edi) ;\ - xor 24*p1-20(%edi),%eax ;\ - mov %eax,24*p1+4(%edi) ;\ - xor %eax,%esi ;\ - xor %esi,%ebp ;\ - mov %esi,24*p1+8(%edi) ;\ - mov %ebp,24*p1+12(%edi) ;\ - xor %ebp,%edx ;\ - xor %edx,%ebx ;\ - mov %edx,24*p1+16(%edi) ;\ - mov %ebx,24*p1+20(%edi) - -#define ksc8(p1) \ - rol $24,%ebx ;\ - mix_col(aes_fl_tab) ;\ - ror $8,%ebx ;\ - xor 4*p1+aes_rcon_tab,%eax ;\ - xor 32*p1-32(%edi),%eax ;\ - mov %eax,32*p1(%edi) ;\ - xor 32*p1-28(%edi),%eax ;\ - mov %eax,32*p1+4(%edi) ;\ - xor 32*p1-24(%edi),%eax ;\ - mov %eax,32*p1+8(%edi) ;\ - xor 32*p1-20(%edi),%eax ;\ - mov %eax,32*p1+12(%edi) ;\ - push %ebx ;\ - mov %eax,%ebx ;\ - mix_col(aes_fl_tab) ;\ - pop %ebx ;\ - xor %eax,%esi ;\ - xor %esi,%ebp ;\ - mov %esi,32*p1+16(%edi) ;\ - mov %ebp,32*p1+20(%edi) ;\ - xor %ebp,%edx ;\ - xor %edx,%ebx ;\ - mov %edx,32*p1+24(%edi) ;\ - mov %ebx,32*p1+28(%edi) - - .align ALIGN32BYTES -aes_set_key: - pushfl - push %ebp - mov %esp,%ebp - sub $slen,%esp - push %ebx - push %esi - push %edi - - mov aes_cx(%ebp),%edx // edx -> AES context - - mov key_ln(%ebp),%ecx // key length - cmpl $128,%ecx - jb aes_30 - shr 
$3,%ecx -aes_30: cmpl $32,%ecx - je aes_32 - cmpl $24,%ecx - je aes_32 - mov $16,%ecx -aes_32: shr $2,%ecx - mov %ecx,nkey(%edx) - - lea 6(%ecx),%eax // 10/12/14 for 4/6/8 32-bit key length - mov %eax,nrnd(%edx) - - mov in_key(%ebp),%esi // key input array - lea ekey(%edx),%edi // key position in AES context - cld - push %ebp - mov %ecx,%eax // save key length in eax - rep ; movsl // words in the key schedule - mov -4(%esi),%ebx // put some values in registers - mov -8(%esi),%edx // to allow faster code - mov -12(%esi),%ebp - mov -16(%esi),%esi - - cmpl $4,%eax // jump on key size - je aes_36 - cmpl $6,%eax - je aes_35 - - ksc8(0) - ksc8(1) - ksc8(2) - ksc8(3) - ksc8(4) - ksc8(5) - ksc8(6) - jmp aes_37 -aes_35: ksc6(0) - ksc6(1) - ksc6(2) - ksc6(3) - ksc6(4) - ksc6(5) - ksc6(6) - ksc6(7) - jmp aes_37 -aes_36: ksc4(0) - ksc4(1) - ksc4(2) - ksc4(3) - ksc4(4) - ksc4(5) - ksc4(6) - ksc4(7) - ksc4(8) - ksc4(9) -aes_37: pop %ebp - mov aes_cx(%ebp),%edx // edx -> AES context - cmpl $0,ed_flg(%ebp) - jne aes_39 - -// compile decryption key schedule from encryption schedule - reverse -// order and do mix_column operation on round keys except first and last - - mov nrnd(%edx),%eax // kt = cx->d_key + nc * cx->Nrnd - shl $2,%eax - lea dkey(%edx,%eax,4),%edi - lea ekey(%edx),%esi // kf = cx->e_key - - movsl // copy first round key (unmodified) - movsl - movsl - movsl - sub $32,%edi - movl $1,cnt(%ebp) -aes_38: // do mix column on each column of - lodsl // each round key - mov %eax,%ebx - mix_col(aes_im_tab) - stosl - lodsl - mov %eax,%ebx - mix_col(aes_im_tab) - stosl - lodsl - mov %eax,%ebx - mix_col(aes_im_tab) - stosl - lodsl - mov %eax,%ebx - mix_col(aes_im_tab) - stosl - sub $32,%edi - - incl cnt(%ebp) - mov cnt(%ebp),%eax - cmp nrnd(%edx),%eax - jb aes_38 - - movsl // copy last round key (unmodified) - movsl - movsl - movsl -aes_39: pop %edi - pop %esi - pop %ebx - mov %ebp,%esp - pop %ebp - popfl - ret +.global aes_dec_blk +.extern it_tab +.extern il_tab -// finite 
field multiplies by {02}, {04} and {08} +.align 4 -#define f2(x) ((x<<1)^(((x>>7)&1)*0x11b)) -#define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b)) -#define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b)) - -// finite field multiplies required in table generation - -#define f3(x) (f2(x) ^ x) -#define f9(x) (f8(x) ^ x) -#define fb(x) (f8(x) ^ f2(x) ^ x) -#define fd(x) (f8(x) ^ f4(x) ^ x) -#define fe(x) (f8(x) ^ f4(x) ^ f2(x)) - -// These defines generate the forward table entries - -#define u0(x) ((f3(x) << 24) | (x << 16) | (x << 8) | f2(x)) -#define u1(x) ((x << 24) | (x << 16) | (f2(x) << 8) | f3(x)) -#define u2(x) ((x << 24) | (f2(x) << 16) | (f3(x) << 8) | x) -#define u3(x) ((f2(x) << 24) | (f3(x) << 16) | (x << 8) | x) - -// These defines generate the inverse table entries - -#define v0(x) ((fb(x) << 24) | (fd(x) << 16) | (f9(x) << 8) | fe(x)) -#define v1(x) ((fd(x) << 24) | (f9(x) << 16) | (fe(x) << 8) | fb(x)) -#define v2(x) ((f9(x) << 24) | (fe(x) << 16) | (fb(x) << 8) | fd(x)) -#define v3(x) ((fe(x) << 24) | (fb(x) << 16) | (fd(x) << 8) | f9(x)) - -// These defines generate entries for the last round tables - -#define w0(x) (x) -#define w1(x) (x << 8) -#define w2(x) (x << 16) -#define w3(x) (x << 24) - -// macro to generate inverse mix column tables (needed for the key schedule) - -#define im_data0(p1) \ - .long p1(0x00),p1(0x01),p1(0x02),p1(0x03),p1(0x04),p1(0x05),p1(0x06),p1(0x07) ;\ - .long p1(0x08),p1(0x09),p1(0x0a),p1(0x0b),p1(0x0c),p1(0x0d),p1(0x0e),p1(0x0f) ;\ - .long p1(0x10),p1(0x11),p1(0x12),p1(0x13),p1(0x14),p1(0x15),p1(0x16),p1(0x17) ;\ - .long p1(0x18),p1(0x19),p1(0x1a),p1(0x1b),p1(0x1c),p1(0x1d),p1(0x1e),p1(0x1f) -#define im_data1(p1) \ - .long p1(0x20),p1(0x21),p1(0x22),p1(0x23),p1(0x24),p1(0x25),p1(0x26),p1(0x27) ;\ - .long p1(0x28),p1(0x29),p1(0x2a),p1(0x2b),p1(0x2c),p1(0x2d),p1(0x2e),p1(0x2f) ;\ - .long p1(0x30),p1(0x31),p1(0x32),p1(0x33),p1(0x34),p1(0x35),p1(0x36),p1(0x37) ;\ - .long 
p1(0x38),p1(0x39),p1(0x3a),p1(0x3b),p1(0x3c),p1(0x3d),p1(0x3e),p1(0x3f) -#define im_data2(p1) \ - .long p1(0x40),p1(0x41),p1(0x42),p1(0x43),p1(0x44),p1(0x45),p1(0x46),p1(0x47) ;\ - .long p1(0x48),p1(0x49),p1(0x4a),p1(0x4b),p1(0x4c),p1(0x4d),p1(0x4e),p1(0x4f) ;\ - .long p1(0x50),p1(0x51),p1(0x52),p1(0x53),p1(0x54),p1(0x55),p1(0x56),p1(0x57) ;\ - .long p1(0x58),p1(0x59),p1(0x5a),p1(0x5b),p1(0x5c),p1(0x5d),p1(0x5e),p1(0x5f) -#define im_data3(p1) \ - .long p1(0x60),p1(0x61),p1(0x62),p1(0x63),p1(0x64),p1(0x65),p1(0x66),p1(0x67) ;\ - .long p1(0x68),p1(0x69),p1(0x6a),p1(0x6b),p1(0x6c),p1(0x6d),p1(0x6e),p1(0x6f) ;\ - .long p1(0x70),p1(0x71),p1(0x72),p1(0x73),p1(0x74),p1(0x75),p1(0x76),p1(0x77) ;\ - .long p1(0x78),p1(0x79),p1(0x7a),p1(0x7b),p1(0x7c),p1(0x7d),p1(0x7e),p1(0x7f) -#define im_data4(p1) \ - .long p1(0x80),p1(0x81),p1(0x82),p1(0x83),p1(0x84),p1(0x85),p1(0x86),p1(0x87) ;\ - .long p1(0x88),p1(0x89),p1(0x8a),p1(0x8b),p1(0x8c),p1(0x8d),p1(0x8e),p1(0x8f) ;\ - .long p1(0x90),p1(0x91),p1(0x92),p1(0x93),p1(0x94),p1(0x95),p1(0x96),p1(0x97) ;\ - .long p1(0x98),p1(0x99),p1(0x9a),p1(0x9b),p1(0x9c),p1(0x9d),p1(0x9e),p1(0x9f) -#define im_data5(p1) \ - .long p1(0xa0),p1(0xa1),p1(0xa2),p1(0xa3),p1(0xa4),p1(0xa5),p1(0xa6),p1(0xa7) ;\ - .long p1(0xa8),p1(0xa9),p1(0xaa),p1(0xab),p1(0xac),p1(0xad),p1(0xae),p1(0xaf) ;\ - .long p1(0xb0),p1(0xb1),p1(0xb2),p1(0xb3),p1(0xb4),p1(0xb5),p1(0xb6),p1(0xb7) ;\ - .long p1(0xb8),p1(0xb9),p1(0xba),p1(0xbb),p1(0xbc),p1(0xbd),p1(0xbe),p1(0xbf) -#define im_data6(p1) \ - .long p1(0xc0),p1(0xc1),p1(0xc2),p1(0xc3),p1(0xc4),p1(0xc5),p1(0xc6),p1(0xc7) ;\ - .long p1(0xc8),p1(0xc9),p1(0xca),p1(0xcb),p1(0xcc),p1(0xcd),p1(0xce),p1(0xcf) ;\ - .long p1(0xd0),p1(0xd1),p1(0xd2),p1(0xd3),p1(0xd4),p1(0xd5),p1(0xd6),p1(0xd7) ;\ - .long p1(0xd8),p1(0xd9),p1(0xda),p1(0xdb),p1(0xdc),p1(0xdd),p1(0xde),p1(0xdf) -#define im_data7(p1) \ - .long p1(0xe0),p1(0xe1),p1(0xe2),p1(0xe3),p1(0xe4),p1(0xe5),p1(0xe6),p1(0xe7) ;\ - .long 
p1(0xe8),p1(0xe9),p1(0xea),p1(0xeb),p1(0xec),p1(0xed),p1(0xee),p1(0xef) ;\ - .long p1(0xf0),p1(0xf1),p1(0xf2),p1(0xf3),p1(0xf4),p1(0xf5),p1(0xf6),p1(0xf7) ;\ - .long p1(0xf8),p1(0xf9),p1(0xfa),p1(0xfb),p1(0xfc),p1(0xfd),p1(0xfe),p1(0xff) - -// S-box data - 256 entries - -#define sb_data0(p1) \ - .long p1(0x63),p1(0x7c),p1(0x77),p1(0x7b),p1(0xf2),p1(0x6b),p1(0x6f),p1(0xc5) ;\ - .long p1(0x30),p1(0x01),p1(0x67),p1(0x2b),p1(0xfe),p1(0xd7),p1(0xab),p1(0x76) ;\ - .long p1(0xca),p1(0x82),p1(0xc9),p1(0x7d),p1(0xfa),p1(0x59),p1(0x47),p1(0xf0) ;\ - .long p1(0xad),p1(0xd4),p1(0xa2),p1(0xaf),p1(0x9c),p1(0xa4),p1(0x72),p1(0xc0) -#define sb_data1(p1) \ - .long p1(0xb7),p1(0xfd),p1(0x93),p1(0x26),p1(0x36),p1(0x3f),p1(0xf7),p1(0xcc) ;\ - .long p1(0x34),p1(0xa5),p1(0xe5),p1(0xf1),p1(0x71),p1(0xd8),p1(0x31),p1(0x15) ;\ - .long p1(0x04),p1(0xc7),p1(0x23),p1(0xc3),p1(0x18),p1(0x96),p1(0x05),p1(0x9a) ;\ - .long p1(0x07),p1(0x12),p1(0x80),p1(0xe2),p1(0xeb),p1(0x27),p1(0xb2),p1(0x75) -#define sb_data2(p1) \ - .long p1(0x09),p1(0x83),p1(0x2c),p1(0x1a),p1(0x1b),p1(0x6e),p1(0x5a),p1(0xa0) ;\ - .long p1(0x52),p1(0x3b),p1(0xd6),p1(0xb3),p1(0x29),p1(0xe3),p1(0x2f),p1(0x84) ;\ - .long p1(0x53),p1(0xd1),p1(0x00),p1(0xed),p1(0x20),p1(0xfc),p1(0xb1),p1(0x5b) ;\ - .long p1(0x6a),p1(0xcb),p1(0xbe),p1(0x39),p1(0x4a),p1(0x4c),p1(0x58),p1(0xcf) -#define sb_data3(p1) \ - .long p1(0xd0),p1(0xef),p1(0xaa),p1(0xfb),p1(0x43),p1(0x4d),p1(0x33),p1(0x85) ;\ - .long p1(0x45),p1(0xf9),p1(0x02),p1(0x7f),p1(0x50),p1(0x3c),p1(0x9f),p1(0xa8) ;\ - .long p1(0x51),p1(0xa3),p1(0x40),p1(0x8f),p1(0x92),p1(0x9d),p1(0x38),p1(0xf5) ;\ - .long p1(0xbc),p1(0xb6),p1(0xda),p1(0x21),p1(0x10),p1(0xff),p1(0xf3),p1(0xd2) -#define sb_data4(p1) \ - .long p1(0xcd),p1(0x0c),p1(0x13),p1(0xec),p1(0x5f),p1(0x97),p1(0x44),p1(0x17) ;\ - .long p1(0xc4),p1(0xa7),p1(0x7e),p1(0x3d),p1(0x64),p1(0x5d),p1(0x19),p1(0x73) ;\ - .long p1(0x60),p1(0x81),p1(0x4f),p1(0xdc),p1(0x22),p1(0x2a),p1(0x90),p1(0x88) ;\ - .long 
p1(0x46),p1(0xee),p1(0xb8),p1(0x14),p1(0xde),p1(0x5e),p1(0x0b),p1(0xdb) -#define sb_data5(p1) \ - .long p1(0xe0),p1(0x32),p1(0x3a),p1(0x0a),p1(0x49),p1(0x06),p1(0x24),p1(0x5c) ;\ - .long p1(0xc2),p1(0xd3),p1(0xac),p1(0x62),p1(0x91),p1(0x95),p1(0xe4),p1(0x79) ;\ - .long p1(0xe7),p1(0xc8),p1(0x37),p1(0x6d),p1(0x8d),p1(0xd5),p1(0x4e),p1(0xa9) ;\ - .long p1(0x6c),p1(0x56),p1(0xf4),p1(0xea),p1(0x65),p1(0x7a),p1(0xae),p1(0x08) -#define sb_data6(p1) \ - .long p1(0xba),p1(0x78),p1(0x25),p1(0x2e),p1(0x1c),p1(0xa6),p1(0xb4),p1(0xc6) ;\ - .long p1(0xe8),p1(0xdd),p1(0x74),p1(0x1f),p1(0x4b),p1(0xbd),p1(0x8b),p1(0x8a) ;\ - .long p1(0x70),p1(0x3e),p1(0xb5),p1(0x66),p1(0x48),p1(0x03),p1(0xf6),p1(0x0e) ;\ - .long p1(0x61),p1(0x35),p1(0x57),p1(0xb9),p1(0x86),p1(0xc1),p1(0x1d),p1(0x9e) -#define sb_data7(p1) \ - .long p1(0xe1),p1(0xf8),p1(0x98),p1(0x11),p1(0x69),p1(0xd9),p1(0x8e),p1(0x94) ;\ - .long p1(0x9b),p1(0x1e),p1(0x87),p1(0xe9),p1(0xce),p1(0x55),p1(0x28),p1(0xdf) ;\ - .long p1(0x8c),p1(0xa1),p1(0x89),p1(0x0d),p1(0xbf),p1(0xe6),p1(0x42),p1(0x68) ;\ - .long p1(0x41),p1(0x99),p1(0x2d),p1(0x0f),p1(0xb0),p1(0x54),p1(0xbb),p1(0x16) - -// Inverse S-box data - 256 entries - -#define ib_data0(p1) \ - .long p1(0x52),p1(0x09),p1(0x6a),p1(0xd5),p1(0x30),p1(0x36),p1(0xa5),p1(0x38) ;\ - .long p1(0xbf),p1(0x40),p1(0xa3),p1(0x9e),p1(0x81),p1(0xf3),p1(0xd7),p1(0xfb) ;\ - .long p1(0x7c),p1(0xe3),p1(0x39),p1(0x82),p1(0x9b),p1(0x2f),p1(0xff),p1(0x87) ;\ - .long p1(0x34),p1(0x8e),p1(0x43),p1(0x44),p1(0xc4),p1(0xde),p1(0xe9),p1(0xcb) -#define ib_data1(p1) \ - .long p1(0x54),p1(0x7b),p1(0x94),p1(0x32),p1(0xa6),p1(0xc2),p1(0x23),p1(0x3d) ;\ - .long p1(0xee),p1(0x4c),p1(0x95),p1(0x0b),p1(0x42),p1(0xfa),p1(0xc3),p1(0x4e) ;\ - .long p1(0x08),p1(0x2e),p1(0xa1),p1(0x66),p1(0x28),p1(0xd9),p1(0x24),p1(0xb2) ;\ - .long p1(0x76),p1(0x5b),p1(0xa2),p1(0x49),p1(0x6d),p1(0x8b),p1(0xd1),p1(0x25) -#define ib_data2(p1) \ - .long p1(0x72),p1(0xf8),p1(0xf6),p1(0x64),p1(0x86),p1(0x68),p1(0x98),p1(0x16) ;\ - .long 
p1(0xd4),p1(0xa4),p1(0x5c),p1(0xcc),p1(0x5d),p1(0x65),p1(0xb6),p1(0x92) ;\ - .long p1(0x6c),p1(0x70),p1(0x48),p1(0x50),p1(0xfd),p1(0xed),p1(0xb9),p1(0xda) ;\ - .long p1(0x5e),p1(0x15),p1(0x46),p1(0x57),p1(0xa7),p1(0x8d),p1(0x9d),p1(0x84) -#define ib_data3(p1) \ - .long p1(0x90),p1(0xd8),p1(0xab),p1(0x00),p1(0x8c),p1(0xbc),p1(0xd3),p1(0x0a) ;\ - .long p1(0xf7),p1(0xe4),p1(0x58),p1(0x05),p1(0xb8),p1(0xb3),p1(0x45),p1(0x06) ;\ - .long p1(0xd0),p1(0x2c),p1(0x1e),p1(0x8f),p1(0xca),p1(0x3f),p1(0x0f),p1(0x02) ;\ - .long p1(0xc1),p1(0xaf),p1(0xbd),p1(0x03),p1(0x01),p1(0x13),p1(0x8a),p1(0x6b) -#define ib_data4(p1) \ - .long p1(0x3a),p1(0x91),p1(0x11),p1(0x41),p1(0x4f),p1(0x67),p1(0xdc),p1(0xea) ;\ - .long p1(0x97),p1(0xf2),p1(0xcf),p1(0xce),p1(0xf0),p1(0xb4),p1(0xe6),p1(0x73) ;\ - .long p1(0x96),p1(0xac),p1(0x74),p1(0x22),p1(0xe7),p1(0xad),p1(0x35),p1(0x85) ;\ - .long p1(0xe2),p1(0xf9),p1(0x37),p1(0xe8),p1(0x1c),p1(0x75),p1(0xdf),p1(0x6e) -#define ib_data5(p1) \ - .long p1(0x47),p1(0xf1),p1(0x1a),p1(0x71),p1(0x1d),p1(0x29),p1(0xc5),p1(0x89) ;\ - .long p1(0x6f),p1(0xb7),p1(0x62),p1(0x0e),p1(0xaa),p1(0x18),p1(0xbe),p1(0x1b) ;\ - .long p1(0xfc),p1(0x56),p1(0x3e),p1(0x4b),p1(0xc6),p1(0xd2),p1(0x79),p1(0x20) ;\ - .long p1(0x9a),p1(0xdb),p1(0xc0),p1(0xfe),p1(0x78),p1(0xcd),p1(0x5a),p1(0xf4) -#define ib_data6(p1) \ - .long p1(0x1f),p1(0xdd),p1(0xa8),p1(0x33),p1(0x88),p1(0x07),p1(0xc7),p1(0x31) ;\ - .long p1(0xb1),p1(0x12),p1(0x10),p1(0x59),p1(0x27),p1(0x80),p1(0xec),p1(0x5f) ;\ - .long p1(0x60),p1(0x51),p1(0x7f),p1(0xa9),p1(0x19),p1(0xb5),p1(0x4a),p1(0x0d) ;\ - .long p1(0x2d),p1(0xe5),p1(0x7a),p1(0x9f),p1(0x93),p1(0xc9),p1(0x9c),p1(0xef) -#define ib_data7(p1) \ - .long p1(0xa0),p1(0xe0),p1(0x3b),p1(0x4d),p1(0xae),p1(0x2a),p1(0xf5),p1(0xb0) ;\ - .long p1(0xc8),p1(0xeb),p1(0xbb),p1(0x3c),p1(0x83),p1(0x53),p1(0x99),p1(0x61) ;\ - .long p1(0x17),p1(0x2b),p1(0x04),p1(0x7e),p1(0xba),p1(0x77),p1(0xd6),p1(0x26) ;\ - .long 
p1(0xe1),p1(0x69),p1(0x14),p1(0x63),p1(0x55),p1(0x21),p1(0x0c),p1(0x7d) +aes_dec_blk: + push %ebp + mov ctx(%esp),%ebp // pointer to context + xor %eax,%eax + +// CAUTION: the order and the values used in these assigns +// rely on the register mappings + +1: push %ebx + mov in_blk+4(%esp),%r2 + push %esi + mov nrnd(%ebp),%r3 // number of rounds + push %edi + lea dkey(%ebp),%r6 // key pointer + mov %r3,%r0 + shl $4,%r0 + add %r0,%r6 // advance key pointer by 16 bytes per round + +// input four columns and xor in first round key -// The rcon_table (needed for the key schedule) -// -// Here is original Dr Brian Gladman's source code: -// _rcon_tab: -// %assign x 1 -// %rep 29 -// dd x -// %assign x f2(x) -// %endrep -// -// Here is precomputed output (it's more portable this way): + mov (%r2),%r0 + mov 4(%r2),%r1 + mov 8(%r2),%r4 + mov 12(%r2),%r5 + xor (%r6),%r0 + xor 4(%r6),%r1 + xor 8(%r6),%r4 + xor 12(%r6),%r5 + + sub $8,%esp // space for register saves on stack + sub $16,%r6 // decrement to next round key (key pointer moves downwards) + sub $10,%r3 + je 4f // 10 rounds for 128-bit key + sub $32,%r6 + sub $2,%r3 + je 3f // 12 rounds for 192-bit key + sub $32,%r6 + +2: inv_rnd( +64(%r6), it_tab) // 14 rounds for 256-bit key + inv_rnd( +48(%r6), it_tab) +3: inv_rnd( +32(%r6), it_tab) // 12 rounds for 192-bit key + inv_rnd( +16(%r6), it_tab) +4: inv_rnd( (%r6), it_tab) // 10 rounds for 128-bit key + inv_rnd( -16(%r6), it_tab) + inv_rnd( -32(%r6), it_tab) + inv_rnd( -48(%r6), it_tab) + inv_rnd( -64(%r6), it_tab) + inv_rnd( -80(%r6), it_tab) + inv_rnd( -96(%r6), it_tab) + inv_rnd(-112(%r6), it_tab) + inv_rnd(-128(%r6), it_tab) + inv_rnd(-144(%r6), il_tab) // last round uses a different table + +// move final values to the output array.
CAUTION: the +// order of these assigns relies on the register mappings + + add $8,%esp + mov out_blk+12(%esp),%r6 + mov %r5,12(%r6) + pop %edi + mov %r4,8(%r6) + pop %esi + mov %r1,4(%r6) + pop %ebx + mov %r0,(%r6) + pop %ebp + mov $1,%eax + ret - .align ALIGN32BYTES -aes_rcon_tab: - .long 0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80 - .long 0x1b,0x36,0x6c,0xd8,0xab,0x4d,0x9a,0x2f - .long 0x5e,0xbc,0x63,0xc6,0x97,0x35,0x6a,0xd4 - .long 0xb3,0x7d,0xfa,0xef,0xc5 - -// The forward xor tables - - .align ALIGN32BYTES -aes_ft_tab: - sb_data0(u0) - sb_data1(u0) - sb_data2(u0) - sb_data3(u0) - sb_data4(u0) - sb_data5(u0) - sb_data6(u0) - sb_data7(u0) - - sb_data0(u1) - sb_data1(u1) - sb_data2(u1) - sb_data3(u1) - sb_data4(u1) - sb_data5(u1) - sb_data6(u1) - sb_data7(u1) - - sb_data0(u2) - sb_data1(u2) - sb_data2(u2) - sb_data3(u2) - sb_data4(u2) - sb_data5(u2) - sb_data6(u2) - sb_data7(u2) - - sb_data0(u3) - sb_data1(u3) - sb_data2(u3) - sb_data3(u3) - sb_data4(u3) - sb_data5(u3) - sb_data6(u3) - sb_data7(u3) - - .align ALIGN32BYTES -aes_fl_tab: - sb_data0(w0) - sb_data1(w0) - sb_data2(w0) - sb_data3(w0) - sb_data4(w0) - sb_data5(w0) - sb_data6(w0) - sb_data7(w0) - - sb_data0(w1) - sb_data1(w1) - sb_data2(w1) - sb_data3(w1) - sb_data4(w1) - sb_data5(w1) - sb_data6(w1) - sb_data7(w1) - - sb_data0(w2) - sb_data1(w2) - sb_data2(w2) - sb_data3(w2) - sb_data4(w2) - sb_data5(w2) - sb_data6(w2) - sb_data7(w2) - - sb_data0(w3) - sb_data1(w3) - sb_data2(w3) - sb_data3(w3) - sb_data4(w3) - sb_data5(w3) - sb_data6(w3) - sb_data7(w3) - -// The inverse xor tables - - .align ALIGN32BYTES -aes_it_tab: - ib_data0(v0) - ib_data1(v0) - ib_data2(v0) - ib_data3(v0) - ib_data4(v0) - ib_data5(v0) - ib_data6(v0) - ib_data7(v0) - - ib_data0(v1) - ib_data1(v1) - ib_data2(v1) - ib_data3(v1) - ib_data4(v1) - ib_data5(v1) - ib_data6(v1) - ib_data7(v1) - - ib_data0(v2) - ib_data1(v2) - ib_data2(v2) - ib_data3(v2) - ib_data4(v2) - ib_data5(v2) - ib_data6(v2) - ib_data7(v2) - - ib_data0(v3) - ib_data1(v3) -
ib_data2(v3) - ib_data3(v3) - ib_data4(v3) - ib_data5(v3) - ib_data6(v3) - ib_data7(v3) - - .align ALIGN32BYTES -aes_il_tab: - ib_data0(w0) - ib_data1(w0) - ib_data2(w0) - ib_data3(w0) - ib_data4(w0) - ib_data5(w0) - ib_data6(w0) - ib_data7(w0) - - ib_data0(w1) - ib_data1(w1) - ib_data2(w1) - ib_data3(w1) - ib_data4(w1) - ib_data5(w1) - ib_data6(w1) - ib_data7(w1) - - ib_data0(w2) - ib_data1(w2) - ib_data2(w2) - ib_data3(w2) - ib_data4(w2) - ib_data5(w2) - ib_data6(w2) - ib_data7(w2) - - ib_data0(w3) - ib_data1(w3) - ib_data2(w3) - ib_data3(w3) - ib_data4(w3) - ib_data5(w3) - ib_data6(w3) - ib_data7(w3) - -// The inverse mix column tables - - .align ALIGN32BYTES -aes_im_tab: - im_data0(v0) - im_data1(v0) - im_data2(v0) - im_data3(v0) - im_data4(v0) - im_data5(v0) - im_data6(v0) - im_data7(v0) - - im_data0(v1) - im_data1(v1) - im_data2(v1) - im_data3(v1) - im_data4(v1) - im_data5(v1) - im_data6(v1) - im_data7(v1) - - im_data0(v2) - im_data1(v2) - im_data2(v2) - im_data3(v2) - im_data4(v2) - im_data5(v2) - im_data6(v2) - im_data7(v2) - - im_data0(v3) - im_data1(v3) - im_data2(v3) - im_data3(v3) - im_data4(v3) - im_data5(v3) - im_data6(v3) - im_data7(v3)