/*
 * Copyright (C) 2015 Michael Brown.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA.
 *
 * You can also choose to distribute this program under the terms of
 * the Unmodified Binary Distribution Licence (as given in the file
 * COPYING.UBDL), provided that you have satisfied its requirements.
 */

FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );

/** @file
 *
 * AES algorithm
 *
 */

#include <stdint.h>
#include <string.h>
#include <errno.h>
#include <assert.h>
#include <byteswap.h>
#include <ipxe/rotate.h>
#include <ipxe/crypto.h>
#include <ipxe/ecb.h>
#include <ipxe/cbc.h>
#include <ipxe/aes.h>

/** AES strides
 *
 * These are the strides (modulo 16) used to walk through the AES
 * input state bytes in order of byte position after [Inv]ShiftRows.
 */
enum aes_stride {
        /** Input stride for ShiftRows
         *
         *    0 4 8 c
         *     \ \ \
         *    1 5 9 d
         *      \ \ \
         *    2 6 a e
         *       \ \ \
         *    3 7 b f
         */
        AES_STRIDE_SHIFTROWS = +5,

        /** Input stride for InvShiftRows
         *
         *    0 4 8 c
         *       / / /
         *    1 5 9 d
         *      / / /
         *    2 6 a e
         *     / / /
         *    3 7 b f
         */
        AES_STRIDE_INVSHIFTROWS = -3,
};

/** A single AES lookup table entry
 *
 * This represents the product (in the Galois field GF(2^8)) of an
 * eight-byte vector multiplier with a single scalar multiplicand.
 *
 * The vector multipliers used for AES will be {1,1,1,3,2,1,1,3} for
 * MixColumns and {1,9,13,11,14,9,13,11} for InvMixColumns.  This
 * allows for the result of multiplying any single column of the
 * [Inv]MixColumns matrix by a scalar value to be obtained simply by
 * extracting the relevant four-byte subset from the lookup table
 * entry.
 *
 * For example, to find the result of multiplying the second column of
 * the MixColumns matrix by the scalar value 0x80:
 *
 *   MixColumns column[0]:  { 2, 1, 1, 3 }
 *   MixColumns column[1]:  { 3, 2, 1, 1 }
 *   MixColumns column[2]:  { 1, 3, 2, 1 }
 *   MixColumns column[3]:  { 1, 1, 3, 2 }
 *   Vector multiplier:     { 1, 1, 1, 3, 2, 1, 1, 3 }
 *   Scalar multiplicand:   0x80
 *   Lookup table entry:    { 0x80, 0x80, 0x80, 0x9b, 0x1b, 0x80, 0x80, 0x9b }
 *
 * The second column of the MixColumns matrix is {3,2,1,1}.  The
 * product of this column with the scalar value 0x80 can be obtained
 * by extracting the relevant four-byte subset of the lookup table
 * entry:
 *
 *   MixColumns column[1]:  { 3, 2, 1, 1 }
 *   Vector multiplier:     { 1, 1, 1, 3, 2, 1, 1, 3 }
 *   Lookup table entry:    { 0x80, 0x80, 0x80, 0x9b, 0x1b, 0x80, 0x80, 0x9b }
 *   Product:               { 0x9b, 0x1b, 0x80, 0x80 }
 *
 * The column lookups require only seven bytes of the eight-byte
 * entry: the remaining (first) byte is used to hold the scalar
 * multiplicand itself (i.e. the first byte of the vector multiplier
 * is always chosen to be 1).
 */
union aes_table_entry {
        /** Viewed as an array of bytes */
        uint8_t byte[8];
} __attribute__ (( packed ));
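/* Illustrative sketch, kept disabled: verifies the worked example
 * above by computing the GF(2^8) products 2*0x80 and 3*0x80 directly
 * with the standard "xtime" doubling step.  The helper and function
 * names below are hypothetical and are not used elsewhere in this
 * file.
 */
#if 0
static uint8_t xtime_example ( uint8_t value ) {
        /* Multiply by (x), reducing modulo the AES polynomial 0x11b */
        return ( ( value << 1 ) ^ ( ( value & 0x80 ) ? 0x1b : 0x00 ) );
}

static void aes_table_entry_example ( void ) {
        uint8_t two_x80 = xtime_example ( 0x80 );       /* 2 * 0x80 */
        uint8_t three_x80 = ( two_x80 ^ 0x80 );         /* 3 * 0x80 */

        /* Matches the lookup table entry documented above */
        assert ( two_x80 == 0x1b );
        assert ( three_x80 == 0x9b );
}
#endif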
/** An AES lookup table
 *
 * This represents the products (in the Galois field GF(2^8)) of a
 * constant eight-byte vector multiplier with all possible 256 scalar
 * multiplicands.
 *
 * The entries are indexed by the AES [Inv]SubBytes S-box output
 * values (denoted S(N)).  This allows for the result of multiplying
 * any single column of the [Inv]MixColumns matrix by S(N) to be
 * obtained simply by extracting the relevant four-byte subset from
 * the Nth table entry.  For example:
 *
 *   Input byte (N):          0x3a
 *   SubBytes output S(N):    0x80
 *   MixColumns column[1]:    { 3, 2, 1, 1 }
 *   Vector multiplier:       { 1, 1, 1, 3, 2, 1, 1, 3 }
 *   Table entry[0x3a]:       { 0x80, 0x80, 0x80, 0x9b, 0x1b, 0x80, 0x80, 0x9b }
 *   Product:                 { 0x9b, 0x1b, 0x80, 0x80 }
 *
 * Since the first byte of the eight-byte vector multiplier is always
 * chosen to be 1, the value of S(N) may be looked up by extracting
 * the first byte of the Nth table entry.
 */
struct aes_table {
        /** Table entries, indexed by S(N) */
        union aes_table_entry entry[256];
} __attribute__ (( aligned ( 8 ) ));

/** AES MixColumns lookup table */
static struct aes_table aes_mixcolumns;

/** AES InvMixColumns lookup table */
static struct aes_table aes_invmixcolumns;

/**
 * Multiply [Inv]MixColumns matrix column by scalar multiplicand
 *
 * @v entry             AES lookup table entry for scalar multiplicand
 * @v column            [Inv]MixColumns matrix column index
 * @ret product         Product of matrix column with scalar multiplicand
 */
static inline __attribute__ (( always_inline )) uint32_t
aes_entry_column ( const union aes_table_entry *entry, unsigned int column ) {
        const union {
                uint8_t byte;
                uint32_t column;
        } __attribute__ (( may_alias )) *product;

        /* Locate relevant four-byte subset */
        product = container_of ( &entry->byte[ 4 - column ],
                                 typeof ( *product ), byte );

        /* Extract this four-byte subset */
        return product->column;
}

/**
 * Multiply [Inv]MixColumns matrix column by S-boxed input byte
 *
 * @v table             AES lookup table
 * @v stride            AES row shift stride
 * @v in                AES input state
 * @v offset            Output byte offset (after [Inv]ShiftRows)
 * @ret product         Product of matrix column with S(input byte)
 *
 * Note that the specified offset is not the offset of the input byte;
 * it is the offset of the output byte which corresponds to the input
 * byte.  This output byte offset is used both to calculate the input
 * byte offset and to select the appropriate matrix column.
 *
 * With a compile-time constant offset, this function will optimise
 * down to a single "movzbl" (to extract the input byte) and will
 * generate a single x86 memory reference expression which can then be
 * used directly within a single "xorl" instruction.
 */
static inline __attribute__ (( always_inline )) uint32_t
aes_column ( const struct aes_table *table, size_t stride,
             const union aes_matrix *in, size_t offset ) {
        const union aes_table_entry *entry;
        unsigned int byte;

        /* Extract input byte corresponding to this output byte offset
         * (i.e. perform [Inv]ShiftRows).
         */
        byte = in->byte[ ( stride * offset ) & 0xf ];

        /* Locate lookup table entry for this input byte (i.e. perform
         * [Inv]SubBytes).
         */
        entry = &table->entry[byte];

        /* Multiply appropriate matrix column by this input byte
         * (i.e. perform [Inv]MixColumns).
         */
        return aes_entry_column ( entry, ( offset & 0x3 ) );
}
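/* Illustrative sketch, kept disabled: shows how the
 * (stride * offset) & 0xf calculation in aes_column() reproduces the
 * [Inv]ShiftRows byte ordering.  For output column 0 (offsets 0-3),
 * ShiftRows reads input bytes {0,5,10,15} and InvShiftRows reads
 * input bytes {0,13,10,7}.  The function name below is hypothetical.
 */
#if 0
static void aes_stride_example ( void ) {
        static const uint8_t shiftrows[4] = { 0, 5, 10, 15 };
        static const uint8_t invshiftrows[4] = { 0, 13, 10, 7 };
        unsigned int offset;

        for ( offset = 0 ; offset < 4 ; offset++ ) {
                assert ( ( ( AES_STRIDE_SHIFTROWS * offset ) & 0xf )
                         == shiftrows[offset] );
                assert ( ( ( AES_STRIDE_INVSHIFTROWS * offset ) & 0xf )
                         == invshiftrows[offset] );
        }
}
#endif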
/**
 * Calculate intermediate round output column
 *
 * @v table             AES lookup table
 * @v stride            AES row shift stride
 * @v in                AES input state
 * @v key               AES round key
 * @v column            Column index
 * @ret output          Output column value
 */
static inline __attribute__ (( always_inline )) uint32_t
aes_output ( const struct aes_table *table, size_t stride,
             const union aes_matrix *in, const union aes_matrix *key,
             unsigned int column ) {
        size_t offset = ( column * 4 );

        /* Perform [Inv]ShiftRows, [Inv]SubBytes, [Inv]MixColumns, and
         * AddRoundKey for this column.  The loop is unrolled to allow
         * for the required compile-time constant optimisations.
         */
        return ( aes_column ( table, stride, in, ( offset + 0 ) ) ^
                 aes_column ( table, stride, in, ( offset + 1 ) ) ^
                 aes_column ( table, stride, in, ( offset + 2 ) ) ^
                 aes_column ( table, stride, in, ( offset + 3 ) ) ^
                 key->column[column] );
}

/**
 * Perform a single intermediate round
 *
 * @v table             AES lookup table
 * @v stride            AES row shift stride
 * @v in                AES input state
 * @v out               AES output state
 * @v key               AES round key
 */
static inline __attribute__ (( always_inline )) void
aes_round ( const struct aes_table *table, size_t stride,
            const union aes_matrix *in, union aes_matrix *out,
            const union aes_matrix *key ) {

        /* Perform [Inv]ShiftRows, [Inv]SubBytes, [Inv]MixColumns, and
         * AddRoundKey for all columns.  The loop is unrolled to allow
         * for the required compile-time constant optimisations.
         */
        out->column[0] = aes_output ( table, stride, in, key, 0 );
        out->column[1] = aes_output ( table, stride, in, key, 1 );
        out->column[2] = aes_output ( table, stride, in, key, 2 );
        out->column[3] = aes_output ( table, stride, in, key, 3 );
}

/**
 * Perform encryption intermediate rounds
 *
 * @v in                AES input state
 * @v out               AES output state
 * @v key               Round keys
 * @v rounds            Number of rounds (must be odd)
 *
 * This function is deliberately marked as non-inlinable to ensure
 * maximal availability of registers for GCC's register allocator,
 * which has a tendency to otherwise spill performance-critical
 * registers to the stack.
 */
static __attribute__ (( noinline )) void
aes_encrypt_rounds ( union aes_matrix *in, union aes_matrix *out,
                     const union aes_matrix *key, unsigned int rounds ) {
        union aes_matrix *tmp;

        /* Perform intermediate rounds */
        do {
                /* Perform one intermediate round */
                aes_round ( &aes_mixcolumns, AES_STRIDE_SHIFTROWS,
                            in, out, key++ );

                /* Swap input and output states for next round */
                tmp = in;
                in = out;
                out = tmp;
        } while ( --rounds );
}
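/* The requirement for an odd number of rounds in aes_encrypt_rounds()
 * and aes_decrypt_rounds() arises because the input and output states
 * are swapped after every round: an odd round count guarantees that
 * the final state is left in the buffer that the caller supplied as
 * the output state.
 */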
/**
 * Perform decryption intermediate rounds
 *
 * @v in                AES input state
 * @v out               AES output state
 * @v key               Round keys
 * @v rounds            Number of rounds (must be odd)
 *
 * As with aes_encrypt_rounds(), this function is deliberately marked
 * as non-inlinable.
 *
 * This function could potentially use the same binary code as is used
 * for encryption.  To compensate for the difference between ShiftRows
 * and InvShiftRows, half of the input byte offsets would have to be
 * modifiable at runtime (half by an offset of +4/-4, half by an
 * offset of -4/+4 for ShiftRows/InvShiftRows).  This can be
 * accomplished in x86 assembly within the number of available
 * registers, but GCC's register allocator struggles to do so,
 * resulting in a significant performance decrease due to registers
 * being spilled to the stack.  We therefore use two separate but very
 * similar binary functions based on the same C source.
 */
static __attribute__ (( noinline )) void
aes_decrypt_rounds ( union aes_matrix *in, union aes_matrix *out,
                     const union aes_matrix *key, unsigned int rounds ) {
        union aes_matrix *tmp;

        /* Perform intermediate rounds */
        do {
                /* Perform one intermediate round */
                aes_round ( &aes_invmixcolumns, AES_STRIDE_INVSHIFTROWS,
                            in, out, key++ );

                /* Swap input and output states for next round */
                tmp = in;
                in = out;
                out = tmp;
        } while ( --rounds );
}

/**
 * Perform standalone AddRoundKey
 *
 * @v state             AES state
 * @v key               AES round key
 */
static inline __attribute__ (( always_inline )) void
aes_addroundkey ( union aes_matrix *state, const union aes_matrix *key ) {

        state->column[0] ^= key->column[0];
        state->column[1] ^= key->column[1];
        state->column[2] ^= key->column[2];
        state->column[3] ^= key->column[3];
}

/**
 * Perform final round
 *
 * @v table             AES lookup table
 * @v stride            AES row shift stride
 * @v in                AES input state
 * @v out               AES output state
 * @v key               AES round key
 */
static void aes_final ( const struct aes_table *table, size_t stride,
                        const union aes_matrix *in, union aes_matrix *out,
                        const union aes_matrix *key ) {
        const union aes_table_entry *entry;
        unsigned int byte;
        size_t out_offset;
        size_t in_offset;

        /* Perform [Inv]ShiftRows and [Inv]SubBytes */
        for ( out_offset = 0, in_offset = 0 ; out_offset < 16 ;
              out_offset++, in_offset = ( ( in_offset + stride ) & 0xf ) ) {

                /* Extract input byte (i.e. perform [Inv]ShiftRows) */
                byte = in->byte[in_offset];

                /* Locate lookup table entry for this input byte
                 * (i.e. perform [Inv]SubBytes).
                 */
                entry = &table->entry[byte];

                /* Store output byte */
                out->byte[out_offset] = entry->byte[0];
        }

        /* Perform AddRoundKey */
        aes_addroundkey ( out, key );
}

/**
 * Encrypt data
 *
 * @v ctx               Context
 * @v src               Data to encrypt
 * @v dst               Buffer for encrypted data
 * @v len               Length of data
 */
static void aes_encrypt ( void *ctx, const void *src, void *dst, size_t len ) {
        struct aes_context *aes = ctx;
        union aes_matrix buffer[2];
        union aes_matrix *in = &buffer[0];
        union aes_matrix *out = &buffer[1];
        unsigned int rounds = aes->rounds;

        /* Sanity check */
        assert ( len == sizeof ( *in ) );

        /* Initialise input state */
        memcpy ( in, src, sizeof ( *in ) );

        /* Perform initial round (AddRoundKey) */
        aes_addroundkey ( in, &aes->encrypt.key[0] );

        /* Perform intermediate rounds (ShiftRows, SubBytes,
         * MixColumns, AddRoundKey).
         */
        aes_encrypt_rounds ( in, out, &aes->encrypt.key[1], ( rounds - 2 ) );
        in = out;

        /* Perform final round (ShiftRows, SubBytes, AddRoundKey) */
        out = dst;
        aes_final ( &aes_mixcolumns, AES_STRIDE_SHIFTROWS, in, out,
                    &aes->encrypt.key[ rounds - 1 ] );
}
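/* Note that aes_final() reads only entry->byte[0] (the plain
 * [Inv]SubBytes value) from each lookup table entry, so the final
 * round performs no [Inv]MixColumns; the encryption path above passes
 * &aes_mixcolumns purely as a source of S-box values.
 */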
/**
 * Decrypt data
 *
 * @v ctx               Context
 * @v src               Data to decrypt
 * @v dst               Buffer for decrypted data
 * @v len               Length of data
 */
static void aes_decrypt ( void *ctx, const void *src, void *dst, size_t len ) {
        struct aes_context *aes = ctx;
        union aes_matrix buffer[2];
        union aes_matrix *in = &buffer[0];
        union aes_matrix *out = &buffer[1];
        unsigned int rounds = aes->rounds;

        /* Sanity check */
        assert ( len == sizeof ( *in ) );

        /* Initialise input state */
        memcpy ( in, src, sizeof ( *in ) );

        /* Perform initial round (AddRoundKey) */
        aes_addroundkey ( in, &aes->decrypt.key[0] );

        /* Perform intermediate rounds (InvShiftRows, InvSubBytes,
         * InvMixColumns, AddRoundKey).
         */
        aes_decrypt_rounds ( in, out, &aes->decrypt.key[1], ( rounds - 2 ) );
        in = out;

        /* Perform final round (InvShiftRows, InvSubBytes, AddRoundKey) */
        out = dst;
        aes_final ( &aes_invmixcolumns, AES_STRIDE_INVSHIFTROWS, in, out,
                    &aes->decrypt.key[ rounds - 1 ] );
}

/**
 * Multiply a polynomial by (x) modulo (x^8 + x^4 + x^3 + x + 1) in GF(2^8)
 *
 * @v poly              Polynomial to be multiplied
 * @ret result          Result
 */
static __attribute__ (( const )) unsigned int aes_double ( unsigned int poly ) {

        /* Multiply polynomial by (x), placing the resulting x^8
         * coefficient in the LSB (i.e. rotate byte left by one).
         */
        poly = rol8 ( poly, 1 );

        /* If coefficient of x^8 (in LSB) is non-zero, then reduce by
         * subtracting (x^8 + x^4 + x^3 + x + 1) in GF(2^8).
         */
        if ( poly & 0x01 ) {
                poly ^= 0x01;   /* Subtract x^8 (currently in LSB) */
                poly ^= 0x1b;   /* Subtract (x^4 + x^3 + x + 1) */
        }

        return poly;
}

/**
 * Fill in MixColumns lookup table entry
 *
 * @v entry             AES lookup table entry for scalar multiplicand
 *
 * The MixColumns lookup table vector multiplier is {1,1,1,3,2,1,1,3}.
 */
static void aes_mixcolumns_entry ( union aes_table_entry *entry ) {
        unsigned int scalar_x_1;
        unsigned int scalar_x;
        unsigned int scalar;

        /* Retrieve scalar multiplicand */
        scalar = entry->byte[0];
        entry->byte[1] = scalar;
        entry->byte[2] = scalar;
        entry->byte[5] = scalar;
        entry->byte[6] = scalar;

        /* Calculate scalar multiplied by (x) */
        scalar_x = aes_double ( scalar );
        entry->byte[4] = scalar_x;

        /* Calculate scalar multiplied by (x + 1) */
        scalar_x_1 = ( scalar_x ^ scalar );
        entry->byte[3] = scalar_x_1;
        entry->byte[7] = scalar_x_1;
}

/**
 * Fill in InvMixColumns lookup table entry
 *
 * @v entry             AES lookup table entry for scalar multiplicand
 *
 * The InvMixColumns lookup table vector multiplier is {1,9,13,11,14,9,13,11}.
 */
static void aes_invmixcolumns_entry ( union aes_table_entry *entry ) {
        unsigned int scalar_x3_x2_x;
        unsigned int scalar_x3_x2_1;
        unsigned int scalar_x3_x2;
        unsigned int scalar_x3_x_1;
        unsigned int scalar_x3_1;
        unsigned int scalar_x3;
        unsigned int scalar_x2;
        unsigned int scalar_x;
        unsigned int scalar;

        /* Retrieve scalar multiplicand */
        scalar = entry->byte[0];

        /* Calculate scalar multiplied by (x) */
        scalar_x = aes_double ( scalar );

        /* Calculate scalar multiplied by (x^2) */
        scalar_x2 = aes_double ( scalar_x );

        /* Calculate scalar multiplied by (x^3) */
        scalar_x3 = aes_double ( scalar_x2 );

        /* Calculate scalar multiplied by (x^3 + 1) */
        scalar_x3_1 = ( scalar_x3 ^ scalar );
        entry->byte[1] = scalar_x3_1;
        entry->byte[5] = scalar_x3_1;

        /* Calculate scalar multiplied by (x^3 + x + 1) */
        scalar_x3_x_1 = ( scalar_x3_1 ^ scalar_x );
        entry->byte[3] = scalar_x3_x_1;
        entry->byte[7] = scalar_x3_x_1;

        /* Calculate scalar multiplied by (x^3 + x^2) */
        scalar_x3_x2 = ( scalar_x3 ^ scalar_x2 );

        /* Calculate scalar multiplied by (x^3 + x^2 + 1) */
        scalar_x3_x2_1 = ( scalar_x3_x2 ^ scalar );
        entry->byte[2] = scalar_x3_x2_1;
        entry->byte[6] = scalar_x3_x2_1;

        /* Calculate scalar multiplied by (x^3 + x^2 + x) */
        scalar_x3_x2_x = ( scalar_x3_x2 ^ scalar_x );
        entry->byte[4] = scalar_x3_x2_x;
}
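/* Illustrative sketch, kept disabled: for the scalar multiplicand 1,
 * the InvMixColumns lookup table entry is simply the vector
 * multiplier itself, {1,9,13,11,14,9,13,11}, since 9 = x^3+1,
 * 11 = x^3+x+1, 13 = x^3+x^2+1 and 14 = x^3+x^2+x.  The function name
 * below is hypothetical.
 */
#if 0
static void aes_invmixcolumns_entry_example ( void ) {
        static const uint8_t expected[8] = { 1, 9, 13, 11, 14, 9, 13, 11 };
        union aes_table_entry entry;

        entry.byte[0] = 1;
        aes_invmixcolumns_entry ( &entry );
        assert ( memcmp ( entry.byte, expected, sizeof ( expected ) ) == 0 );
}
#endif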
/**
 * Generate AES lookup tables
 *
 */
static void aes_generate ( void ) {
        union aes_table_entry *entry;
        union aes_table_entry *inventry;
        unsigned int poly = 0x01;
        unsigned int invpoly = 0x01;
        unsigned int transformed;
        unsigned int i;

        /* Iterate over non-zero values of GF(2^8) using generator (x + 1) */
        do {

                /* Multiply polynomial by (x + 1) */
                poly ^= aes_double ( poly );

                /* Divide inverse polynomial by (x + 1).  This code
                 * fragment is taken directly from the Wikipedia page
                 * on the Rijndael S-box.  An explanation of why it
                 * works would be greatly appreciated.
                 */
                invpoly ^= ( invpoly << 1 );
                invpoly ^= ( invpoly << 2 );
                invpoly ^= ( invpoly << 4 );
                if ( invpoly & 0x80 )
                        invpoly ^= 0x09;
                invpoly &= 0xff;

                /* Apply affine transformation */
                transformed = ( 0x63 ^ invpoly ^ rol8 ( invpoly, 1 ) ^
                                rol8 ( invpoly, 2 ) ^ rol8 ( invpoly, 3 ) ^
                                rol8 ( invpoly, 4 ) );

                /* Populate S-box (within MixColumns lookup table) */
                aes_mixcolumns.entry[poly].byte[0] = transformed;

        } while ( poly != 0x01 );

        /* Populate zeroth S-box entry (which has no inverse) */
        aes_mixcolumns.entry[0].byte[0] = 0x63;

        /* Fill in MixColumns and InvMixColumns lookup tables */
        for ( i = 0 ; i < 256 ; i++ ) {

                /* Fill in MixColumns lookup table entry */
                entry = &aes_mixcolumns.entry[i];
                aes_mixcolumns_entry ( entry );

                /* Populate inverse S-box (within InvMixColumns lookup table) */
                inventry = &aes_invmixcolumns.entry[ entry->byte[0] ];
                inventry->byte[0] = i;

                /* Fill in InvMixColumns lookup table entry */
                aes_invmixcolumns_entry ( inventry );
        }
}

/**
 * Rotate key column
 *
 * @v column            Key column
 * @ret column          Updated key column
 */
static inline __attribute__ (( always_inline )) uint32_t
aes_key_rotate ( uint32_t column ) {

        return ( ( __BYTE_ORDER == __LITTLE_ENDIAN ) ?
                 ror32 ( column, 8 ) : rol32 ( column, 8 ) );
}

/**
 * Apply S-box to key column
 *
 * @v column            Key column
 * @ret column          Updated key column
 */
static uint32_t aes_key_sbox ( uint32_t column ) {
        unsigned int i;
        uint8_t byte;

        for ( i = 0 ; i < 4 ; i++ ) {
                byte = ( column & 0xff );
                byte = aes_mixcolumns.entry[byte].byte[0];
                column = ( ( column & ~0xff ) | byte );
                column = rol32 ( column, 8 );
        }

        return column;
}

/**
 * Apply schedule round constant to key column
 *
 * @v column            Key column
 * @v rcon              Round constant
 * @ret column          Updated key column
 */
static inline __attribute__ (( always_inline )) uint32_t
aes_key_rcon ( uint32_t column, unsigned int rcon ) {

        return ( ( __BYTE_ORDER == __LITTLE_ENDIAN ) ?
                 ( column ^ rcon ) : ( column ^ ( rcon << 24 ) ) );
}
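/* Illustrative sketch, kept disabled: the key schedule round constant
 * starts at 0x01 and is doubled in GF(2^8) once per expanded key
 * block, giving the familiar sequence 01 02 04 08 10 20 40 80 1b 36
 * for AES-128.  The function name below is hypothetical.
 */
#if 0
static void aes_key_rcon_example ( void ) {
        static const uint8_t expected[10] =
                { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36 };
        unsigned int rcon = 0x01;
        unsigned int i;

        for ( i = 0 ; i < 10 ; i++ ) {
                assert ( rcon == expected[i] );
                rcon = aes_double ( rcon );
        }
}
#endif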
/**
 * Set key
 *
 * @v ctx               Context
 * @v key               Key
 * @v keylen            Key length
 * @ret rc              Return status code
 */
static int aes_setkey ( void *ctx, const void *key, size_t keylen ) {
        struct aes_context *aes = ctx;
        union aes_matrix *enc;
        union aes_matrix *dec;
        union aes_matrix temp;
        union aes_matrix zero;
        unsigned int rcon = 0x01;
        unsigned int rounds;
        size_t offset = 0;
        uint32_t *prev;
        uint32_t *next;
        uint32_t *end;
        uint32_t tmp;

        /* Generate lookup tables, if not already done */
        if ( ! aes_mixcolumns.entry[0].byte[0] )
                aes_generate();

        /* Validate key length and calculate number of intermediate rounds */
        switch ( keylen ) {
        case ( 128 / 8 ):
                rounds = 11;
                break;
        case ( 192 / 8 ):
                rounds = 13;
                break;
        case ( 256 / 8 ):
                rounds = 15;
                break;
        default:
                DBGC ( aes, "AES %p unsupported key length (%zd bits)\n",
                       aes, ( keylen * 8 ) );
                return -EINVAL;
        }
        aes->rounds = rounds;
        enc = aes->encrypt.key;
        end = enc[rounds].column;

        /* Copy raw key */
        memcpy ( enc, key, keylen );
        prev = enc->column;
        next = ( ( ( void * ) prev ) + keylen );
        tmp = next[-1];

        /* Construct expanded key */
        while ( next < end ) {

                /* If this is the first column of an expanded key
                 * block, or the middle column of an AES-256 key
                 * block, then apply the S-box.
                 */
                if ( ( offset == 0 ) || ( ( offset | keylen ) == 48 ) )
                        tmp = aes_key_sbox ( tmp );

                /* If this is the first column of an expanded key
                 * block then rotate and apply the round constant.
                 */
                if ( offset == 0 ) {
                        tmp = aes_key_rotate ( tmp );
                        tmp = aes_key_rcon ( tmp, rcon );
                        rcon = aes_double ( rcon );
                }

                /* XOR with previous key column */
                tmp ^= *prev;

                /* Store column */
                *next = tmp;

                /* Move to next column */
                offset += sizeof ( *next );
                if ( offset == keylen )
                        offset = 0;
                next++;
                prev++;
        }
        DBGC2 ( aes, "AES %p expanded %zd-bit key:\n", aes, ( keylen * 8 ) );
        DBGC2_HDA ( aes, 0, &aes->encrypt, ( rounds * sizeof ( *enc ) ) );

        /* Convert to decryption key */
        memset ( &zero, 0, sizeof ( zero ) );
        dec = &aes->decrypt.key[ rounds - 1 ];
        memcpy ( dec--, enc++, sizeof ( *dec ) );
        while ( dec > aes->decrypt.key ) {

                /* Perform InvMixColumns (by reusing the encryption
                 * final-round code to perform ShiftRows+SubBytes and
                 * reusing the decryption intermediate-round code to
                 * perform InvShiftRows+InvSubBytes+InvMixColumns, all
                 * with a zero encryption key).
                 */
                aes_final ( &aes_mixcolumns, AES_STRIDE_SHIFTROWS,
                            enc++, &temp, &zero );
                aes_decrypt_rounds ( &temp, dec--, &zero, 1 );
        }
        memcpy ( dec--, enc++, sizeof ( *dec ) );
        DBGC2 ( aes, "AES %p inverted %zd-bit key:\n", aes, ( keylen * 8 ) );
        DBGC2_HDA ( aes, 0, &aes->decrypt, ( rounds * sizeof ( *dec ) ) );

        return 0;
}

/**
 * Set initialisation vector
 *
 * @v ctx               Context
 * @v iv                Initialisation vector
 */
static void aes_setiv ( void *ctx __unused, const void *iv __unused ) {
        /* Nothing to do */
}

/** Basic AES algorithm */
struct cipher_algorithm aes_algorithm = {
        .name = "aes",
        .ctxsize = sizeof ( struct aes_context ),
        .blocksize = AES_BLOCKSIZE,
        .setkey = aes_setkey,
        .setiv = aes_setiv,
        .encrypt = aes_encrypt,
        .decrypt = aes_decrypt,
};

/* AES in Electronic Codebook mode */
ECB_CIPHER ( aes_ecb, aes_ecb_algorithm,
             aes_algorithm, struct aes_context, AES_BLOCKSIZE );

/* AES in Cipher Block Chaining mode */
CBC_CIPHER ( aes_cbc, aes_cbc_algorithm,
             aes_algorithm, struct aes_context, AES_BLOCKSIZE );
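/* Illustrative sketch, kept disabled: a known-answer check using the
 * example key and block from FIPS-197.  The function name below is
 * hypothetical and is not wired into iPXE's self-test framework.
 */
#if 0
static void aes_known_answer_example ( void ) {
        static const uint8_t key[16] = {
                0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6,
                0xab, 0xf7, 0x15, 0x88, 0x09, 0xcf, 0x4f, 0x3c
        };
        static const uint8_t plaintext[16] = {
                0x32, 0x43, 0xf6, 0xa8, 0x88, 0x5a, 0x30, 0x8d,
                0x31, 0x31, 0x98, 0xa2, 0xe0, 0x37, 0x07, 0x34
        };
        static const uint8_t expected[16] = {
                0x39, 0x25, 0x84, 0x1d, 0x02, 0xdc, 0x09, 0xfb,
                0xdc, 0x11, 0x85, 0x97, 0x19, 0x6a, 0x0b, 0x32
        };
        struct aes_context aes;
        uint8_t ciphertext[16];
        uint8_t recovered[16];

        /* Expand key and encrypt a single block */
        if ( aes_setkey ( &aes, key, sizeof ( key ) ) != 0 )
                return;
        aes_encrypt ( &aes, plaintext, ciphertext, sizeof ( ciphertext ) );
        assert ( memcmp ( ciphertext, expected, sizeof ( expected ) ) == 0 );

        /* Decrypting the result must recover the original block */
        aes_decrypt ( &aes, ciphertext, recovered, sizeof ( recovered ) );
        assert ( memcmp ( recovered, plaintext, sizeof ( plaintext ) ) == 0 );
}
#endif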