mirror of
https://github.com/rvtr/GodMode9i.git
synced 2025-06-19 03:05:43 -04:00
253 lines
6.6 KiB
C
253 lines
6.6 KiB
C
|
|
#include <nds.h>
|
|
#include <malloc.h>
|
|
#include "aes.h"
|
|
|
|
/* AES 128 ECB dug out from mbed TLS 2.5.1
 * https://github.com/ARMmbed/mbedtls/blob/development/include/mbedtls/aes.h
 * https://github.com/ARMmbed/mbedtls/blob/development/library/aes.c
 *
 * C style comments are mbed TLS comments
 * C++ style comments are mine
 */
|
|
|
|
// make VC happy
|
|
#ifdef _MSC_VER
|
|
#define DTCM_BSS
|
|
#define ITCM_CODE
|
|
#endif
|
|
|
|
// it's interesting they mix unsigned char with uint32_t
|
|
DTCM_BSS static unsigned char FSb[256];
|
|
DTCM_BSS static uint32_t FT0[256];
|
|
DTCM_BSS static uint32_t FT1[256];
|
|
DTCM_BSS static uint32_t FT2[256];
|
|
DTCM_BSS static uint32_t FT3[256];
|
|
|
|
// AES-CTR/CCM only uses encrypt, so R tables are not used
|
|
#define NO_R_TABLES
|
|
#ifndef NO_R_TABLES
|
|
static unsigned char RSb[256];
|
|
static uint32_t RT0[256];
|
|
static uint32_t RT1[256];
|
|
static uint32_t RT2[256];
|
|
static uint32_t RT3[256];
|
|
#endif
|
|
|
|
static uint32_t RCON[256];
|
|
|
|
/*
 * Tables generation code
 */
// Every argument and every expansion is fully parenthesized so the macros
// stay correct when given compound expressions or when the result is
// combined with further operators at the call site.

// rotate a 32-bit word left by 8 bits
#define ROTL8(x) ( ( ( (x) << 8 ) & 0xFFFFFFFF ) | ( (x) >> 24 ) )
// multiply by x in GF(2^8); callers mask the result with 0xFF
#define XTIME(x) ( ( (x) << 1 ) ^ ( ( (x) & 0x80 ) ? 0x1B : 0x00 ) )
// GF(2^8) multiply through the pow[]/log[] tables that must be in scope
// at the point of use
#define MUL(x,y) ( ( (x) && (y) ) ? pow[(log[(x)]+log[(y)]) % 255] : 0 )
|
|
|
|
void aes_gen_tables(void)
|
|
{
|
|
#ifdef NO_R_TABLES
|
|
unsigned char *RSb = memalign(32, 256);
|
|
uint32_t *RT0 = memalign(32, 256 * sizeof(uint32_t));
|
|
uint32_t *RT1 = memalign(32, 256 * sizeof(uint32_t));
|
|
uint32_t *RT2 = memalign(32, 256 * sizeof(uint32_t));
|
|
uint32_t *RT3 = memalign(32, 256 * sizeof(uint32_t));
|
|
#endif
|
|
|
|
int i, x, y, z;
|
|
int pow[256];
|
|
int log[256];
|
|
|
|
/*
|
|
* compute pow and log tables over GF(2^8)
|
|
*/
|
|
for (i = 0, x = 1; i < 256; i++)
|
|
{
|
|
pow[i] = x;
|
|
log[x] = i;
|
|
x = (x ^ XTIME(x)) & 0xFF;
|
|
}
|
|
|
|
/*
|
|
* calculate the round constants
|
|
*/
|
|
for (i = 0, x = 1; i < 10; i++)
|
|
{
|
|
RCON[i] = (uint32_t)x;
|
|
x = XTIME(x) & 0xFF;
|
|
}
|
|
|
|
/*
|
|
* generate the forward and reverse S-boxes
|
|
*/
|
|
FSb[0x00] = 0x63;
|
|
RSb[0x63] = 0x00;
|
|
|
|
for (i = 1; i < 256; i++)
|
|
{
|
|
x = pow[255 - log[i]];
|
|
|
|
y = x; y = ((y << 1) | (y >> 7)) & 0xFF;
|
|
x ^= y; y = ((y << 1) | (y >> 7)) & 0xFF;
|
|
x ^= y; y = ((y << 1) | (y >> 7)) & 0xFF;
|
|
x ^= y; y = ((y << 1) | (y >> 7)) & 0xFF;
|
|
x ^= y ^ 0x63;
|
|
|
|
FSb[i] = (unsigned char)x;
|
|
RSb[x] = (unsigned char)i;
|
|
}
|
|
|
|
/*
|
|
* generate the forward and reverse tables
|
|
*/
|
|
for (i = 0; i < 256; i++)
|
|
{
|
|
x = FSb[i];
|
|
y = XTIME(x) & 0xFF;
|
|
z = (y ^ x) & 0xFF;
|
|
|
|
FT0[i] = ((uint32_t)y) ^
|
|
((uint32_t)x << 8) ^
|
|
((uint32_t)x << 16) ^
|
|
((uint32_t)z << 24);
|
|
|
|
FT1[i] = ROTL8(FT0[i]);
|
|
FT2[i] = ROTL8(FT1[i]);
|
|
FT3[i] = ROTL8(FT2[i]);
|
|
|
|
x = RSb[i];
|
|
|
|
RT0[i] = ((uint32_t)MUL(0x0E, x)) ^
|
|
((uint32_t)MUL(0x09, x) << 8) ^
|
|
((uint32_t)MUL(0x0D, x) << 16) ^
|
|
((uint32_t)MUL(0x0B, x) << 24);
|
|
|
|
RT1[i] = ROTL8(RT0[i]);
|
|
RT2[i] = ROTL8(RT1[i]);
|
|
RT3[i] = ROTL8(RT2[i]);
|
|
}
|
|
#ifdef NO_R_TABLES
|
|
free(RSb);
|
|
free(RT0);
|
|
free(RT1);
|
|
free(RT2);
|
|
free(RT3);
|
|
#endif
|
|
}
|
|
|
|
// did a little counting to understand why the original mbed TLS buf is [68]
// in set key, they generated:
// 128 bits key: 10 rounds of += 4, plus 4 after, 44
// 192 bits key: 8 rounds of += 6, plus 6 after, 56
// 256 bits key: 7 rounds of += 8, plus 8 after, 64
// and in ecb encrypt, it used:
// 4 + 4 * 2 * 4 + 4 + 4 "++"s, 44
// 4 + 4 * 2 * 5 + 4 + 4 "++"s, 52
// 4 + 4 * 2 * 6 + 4 + 4 "++"s, 60
// so they generated several bytes more in 192 and 256 modes to simplify the loop
// "able to hold 32 extra bytes" in their comment makes sense now
|
|
|
|
void aes_set_key_enc_128_be(uint32_t rk[RK_LEN], const unsigned char *key) {
|
|
uint32_t *RK = rk;
|
|
|
|
GET_UINT32_BE(RK[0], key, 12);
|
|
GET_UINT32_BE(RK[1], key, 8);
|
|
GET_UINT32_BE(RK[2], key, 4);
|
|
GET_UINT32_BE(RK[3], key, 0);
|
|
|
|
for (unsigned i = 0; i < 10; ++i, RK += 4) {
|
|
RK[4] = RK[0] ^ RCON[i] ^
|
|
((uint32_t)FSb[(RK[3] >> 8) & 0xFF]) ^
|
|
((uint32_t)FSb[(RK[3] >> 16) & 0xFF] << 8) ^
|
|
((uint32_t)FSb[(RK[3] >> 24) & 0xFF] << 16) ^
|
|
((uint32_t)FSb[(RK[3]) & 0xFF] << 24);
|
|
|
|
RK[5] = RK[1] ^ RK[4];
|
|
RK[6] = RK[2] ^ RK[5];
|
|
RK[7] = RK[3] ^ RK[6];
|
|
}
|
|
}
|
|
|
|
// One forward AES round. Reads the state words Y0..Y3, consumes four round
// key words through the RK pointer in scope (advancing it by four), and
// writes the new state into X0..X3 using the FT0..FT3 tables.
// Use it as a statement followed by a semicolon.
#define AES_FROUND(X0,X1,X2,X3,Y0,Y1,Y2,Y3)          \
do {                                                  \
	(X0) = *RK++ ^ FT0[ ( (Y0)       ) & 0xFF ] ^     \
	               FT1[ ( (Y1) >>  8 ) & 0xFF ] ^     \
	               FT2[ ( (Y2) >> 16 ) & 0xFF ] ^     \
	               FT3[ ( (Y3) >> 24 ) & 0xFF ];      \
	                                                  \
	(X1) = *RK++ ^ FT0[ ( (Y1)       ) & 0xFF ] ^     \
	               FT1[ ( (Y2) >>  8 ) & 0xFF ] ^     \
	               FT2[ ( (Y3) >> 16 ) & 0xFF ] ^     \
	               FT3[ ( (Y0) >> 24 ) & 0xFF ];      \
	                                                  \
	(X2) = *RK++ ^ FT0[ ( (Y2)       ) & 0xFF ] ^     \
	               FT1[ ( (Y3) >>  8 ) & 0xFF ] ^     \
	               FT2[ ( (Y0) >> 16 ) & 0xFF ] ^     \
	               FT3[ ( (Y1) >> 24 ) & 0xFF ];      \
	                                                  \
	(X3) = *RK++ ^ FT0[ ( (Y3)       ) & 0xFF ] ^     \
	               FT1[ ( (Y0) >>  8 ) & 0xFF ] ^     \
	               FT2[ ( (Y1) >> 16 ) & 0xFF ] ^     \
	               FT3[ ( (Y2) >> 24 ) & 0xFF ];      \
} while (0)
|
|
|
|
DTCM_BSS uint32_t X0, X1, X2, X3, Y0, Y1, Y2, Y3;
|
|
DTCM_BSS const uint32_t *RK;
|
|
|
|
ITCM_CODE void aes_encrypt_128_be(const uint32_t rk[RK_LEN],
|
|
const unsigned char input[16], unsigned char output[16])
|
|
{
|
|
RK = rk;
|
|
|
|
GET_UINT32_BE(X0, input, 12);
|
|
GET_UINT32_BE(X1, input, 8);
|
|
GET_UINT32_BE(X2, input, 4);
|
|
GET_UINT32_BE(X3, input, 0);
|
|
|
|
X0 ^= *RK++;
|
|
X1 ^= *RK++;
|
|
X2 ^= *RK++;
|
|
X3 ^= *RK++;
|
|
|
|
// loop unrolled
|
|
AES_FROUND(Y0, Y1, Y2, Y3, X0, X1, X2, X3);
|
|
AES_FROUND(X0, X1, X2, X3, Y0, Y1, Y2, Y3);
|
|
AES_FROUND(Y0, Y1, Y2, Y3, X0, X1, X2, X3);
|
|
AES_FROUND(X0, X1, X2, X3, Y0, Y1, Y2, Y3);
|
|
AES_FROUND(Y0, Y1, Y2, Y3, X0, X1, X2, X3);
|
|
AES_FROUND(X0, X1, X2, X3, Y0, Y1, Y2, Y3);
|
|
AES_FROUND(Y0, Y1, Y2, Y3, X0, X1, X2, X3);
|
|
AES_FROUND(X0, X1, X2, X3, Y0, Y1, Y2, Y3);
|
|
AES_FROUND(Y0, Y1, Y2, Y3, X0, X1, X2, X3);
|
|
|
|
X0 = *RK++ ^ \
|
|
((uint32_t)FSb[(Y0) & 0xFF]) ^
|
|
((uint32_t)FSb[(Y1 >> 8) & 0xFF] << 8) ^
|
|
((uint32_t)FSb[(Y2 >> 16) & 0xFF] << 16) ^
|
|
((uint32_t)FSb[(Y3 >> 24) & 0xFF] << 24);
|
|
|
|
X1 = *RK++ ^ \
|
|
((uint32_t)FSb[(Y1) & 0xFF]) ^
|
|
((uint32_t)FSb[(Y2 >> 8) & 0xFF] << 8) ^
|
|
((uint32_t)FSb[(Y3 >> 16) & 0xFF] << 16) ^
|
|
((uint32_t)FSb[(Y0 >> 24) & 0xFF] << 24);
|
|
|
|
X2 = *RK++ ^ \
|
|
((uint32_t)FSb[(Y2) & 0xFF]) ^
|
|
((uint32_t)FSb[(Y3 >> 8) & 0xFF] << 8) ^
|
|
((uint32_t)FSb[(Y0 >> 16) & 0xFF] << 16) ^
|
|
((uint32_t)FSb[(Y1 >> 24) & 0xFF] << 24);
|
|
|
|
// removed a ++ here
|
|
X3 = *RK ^ \
|
|
((uint32_t)FSb[(Y3) & 0xFF]) ^
|
|
((uint32_t)FSb[(Y0 >> 8) & 0xFF] << 8) ^
|
|
((uint32_t)FSb[(Y1 >> 16) & 0xFF] << 16) ^
|
|
((uint32_t)FSb[(Y2 >> 24) & 0xFF] << 24);
|
|
|
|
PUT_UINT32_BE(X0, output, 12);
|
|
PUT_UINT32_BE(X1, output, 8);
|
|
PUT_UINT32_BE(X2, output, 4);
|
|
PUT_UINT32_BE(X3, output, 0);
|
|
}
|
|
|