Implement NAND browsing

Code from twlnf. The NAND write code is stripped for safety reasons.
2025-06-18 19:05:30 -04:00 · 2020-02-11 18:42:04 -07:00 · 2020-02-11 18:42:04 -07:00 · bf53a3c037
commit bf53a3c037
parent 2bd4e57081
30 changed files with 5938 additions and 70 deletions
--- a/README.md
+++ b/README.md
@ -43,5 +43,6 @@ Once everything is downloaded and installed, `git clone` this repository, naviga
 * [RocketRobz](https://github.com/RocketRobz): Creator of GodMode9i.
 * [zacchi4k](https://github.com/zacchi4k): Creator of the GodMode9i logo used in v1.3.1 and onwards.
 * [Edo9300](https://github.com/edo9300): Save reading code from his save manager tool.
+* [JimmyZ](https://github.com/JimmyZ): NAND code from twlnf (with writing code stripped for safety reasons).
 * [devkitPro](https://github.com/devkitPro): devkitARM, libnds, original nds-hb-menu code, and screenshot code.
 * [d0k3](https://github.com/d0k3): Original GM9 app and name for the Nintendo 3DS, which this is inspired by.
--- a/arm7/source/main.c
+++ b/arm7/source/main.c
@ -93,7 +93,11 @@ int main() {
 	irqEnable( IRQ_VBLANK | IRQ_VCOUNT );

 	setPowerButtonCB(powerButtonCB);
-	
+
+	for (int i = 0; i < 8; i++) {
+		*(u8*)(0x2FFFD00+i) = *(u8*)(0x4004D07-i);	// Get ConsoleID
+	}
+
 	fifoSendValue32(FIFO_USER_03, *SCFG_EXT);
 	fifoSendValue32(FIFO_USER_07, *(u16*)(0x4004700));
 	fifoSendValue32(FIFO_USER_06, 1);
@ -103,6 +107,10 @@ int main() {
 		if ( 0 == (REG_KEYINPUT & (KEY_SELECT | KEY_START | KEY_L | KEY_R))) {
 			exitflag = true;
 		}
+		if (*(u32*)(0x2FFFD0C) == 0x454D4D43) {
+			sdmmc_nand_cid((u32*)0x2FFD7BC);	// Get eMMC CID
+			*(u32*)(0x2FFFD0C) = 0;
+		}
 		resyncClock();
 		swiWaitForVBlank();
 	}
--- a/arm9/Makefile
+++ b/arm9/Makefile
@ -19,7 +19,7 @@ include $(DEVKITARM)/ds_rules
 #---------------------------------------------------------------------------------
 TARGET		:=	GodMode9i
 BUILD		:=	build
-SOURCES		:=	source dldi-include ramdrive-include
+SOURCES		:=	source dldi-include ramdrive-include mbedtls
 INCLUDES	:=	include dldi-include ramdrive-include source
 DATA		:=	../data
 GRAPHICS	:=  ../gfx
--- a/arm9/mbedtls/aes.c
+++ b/arm9/mbedtls/aes.c
@ -0,0 +1,252 @@
+
+#include <nds.h>
+#include <malloc.h>
+#include "aes.h"
+
+/* AES 128 ECB dug out from mbed TLS 2.5.1
+ * https://github.com/ARMmbed/mbedtls/blob/development/include/mbedtls/aes.h
+ * https://github.com/ARMmbed/mbedtls/blob/development/library/aes.c
+ *
+ * C style comments are mbed TLS comments
+ * C++ style comments are mine
+ */
+
+// make VC happy
+#ifdef _MSC_VER
+#define DTCM_BSS
+#define ITCM_CODE
+#endif
+
+// it's interesting they mix unsigned char with uint32_t
+DTCM_BSS static unsigned char FSb[256];
+DTCM_BSS static uint32_t FT0[256];
+DTCM_BSS static uint32_t FT1[256];
+DTCM_BSS static uint32_t FT2[256];
+DTCM_BSS static uint32_t FT3[256];
+
+// AES-CTR/CCM only uses encrypt, so R tables are not used
+#define NO_R_TABLES
+#ifndef NO_R_TABLES
+static unsigned char RSb[256];
+static uint32_t RT0[256];
+static uint32_t RT1[256];
+static uint32_t RT2[256];
+static uint32_t RT3[256];
+#endif
+
+// round constants for the AES-128 key schedule; aes_gen_tables() writes and
+// aes_set_key_enc_128_be() reads only indices 0..9, so ten entries suffice
+static uint32_t RCON[10];
+
+/*
+ * Tables generation code
+ */
+// helper macros for aes_gen_tables(); every argument and every expansion is
+// fully parenthesized so the macros stay correct inside larger expressions
+//   ROTL8(x): rotate a 32-bit value left by 8 bits
+//   XTIME(x): x << 1, XORed with 0x1B when bit 7 of x is set (result not masked)
+//   MUL(x,y): 0 if either operand is 0, otherwise a lookup through the
+//             pow[]/log[] arrays that must be in scope where the macro is used
+#define ROTL8(x) ( ( ( (x) << 8 ) & 0xFFFFFFFF ) | ( (x) >> 24 ) )
+#define XTIME(x) ( ( (x) << 1 ) ^ ( ( (x) & 0x80 ) ? 0x1B : 0x00 ) )
+#define MUL(x,y) ( ( (x) && (y) ) ? pow[(log[(x)]+log[(y)]) % 255] : 0 )
+
+// Fill the lookup tables used by the AES routines in this file: the round
+// constants RCON[0..9], the forward S-box FSb and the forward round tables
+// FT0..FT3. Must run once before aes_set_key_enc_128_be()/aes_encrypt_128_be().
+//
+// The reverse S-box and reverse round tables are only generated when
+// NO_R_TABLES is not defined. With NO_R_TABLES nothing reads them, so they are
+// skipped entirely: no temporary heap buffers are needed and there is no
+// allocation that could fail.
+void aes_gen_tables(void)
+{
+	int i, x, y, z;
+	int pow[256];
+	int log[256];
+
+	/*
+	 * compute pow and log tables over GF(2^8)
+	 */
+	for (i = 0, x = 1; i < 256; i++)
+	{
+		pow[i] = x;
+		log[x] = i;
+		x = (x ^ XTIME(x)) & 0xFF;
+	}
+
+	/*
+	 * calculate the round constants
+	 */
+	for (i = 0, x = 1; i < 10; i++)
+	{
+		RCON[i] = (uint32_t)x;
+		x = XTIME(x) & 0xFF;
+	}
+
+	/*
+	 * generate the forward (and, if enabled, reverse) S-boxes
+	 */
+	FSb[0x00] = 0x63;
+#ifndef NO_R_TABLES
+	RSb[0x63] = 0x00;
+#endif
+
+	for (i = 1; i < 256; i++)
+	{
+		x = pow[255 - log[i]];
+
+		y = x; y = ((y << 1) | (y >> 7)) & 0xFF;
+		x ^= y; y = ((y << 1) | (y >> 7)) & 0xFF;
+		x ^= y; y = ((y << 1) | (y >> 7)) & 0xFF;
+		x ^= y; y = ((y << 1) | (y >> 7)) & 0xFF;
+		x ^= y ^ 0x63;
+
+		FSb[i] = (unsigned char)x;
+#ifndef NO_R_TABLES
+		RSb[x] = (unsigned char)i;
+#endif
+	}
+
+	/*
+	 * generate the forward (and, if enabled, reverse) tables
+	 */
+	for (i = 0; i < 256; i++)
+	{
+		x = FSb[i];
+		y = XTIME(x) & 0xFF;
+		z = (y ^ x) & 0xFF;
+
+		FT0[i] = ((uint32_t)y) ^
+			((uint32_t)x << 8) ^
+			((uint32_t)x << 16) ^
+			((uint32_t)z << 24);
+
+		FT1[i] = ROTL8(FT0[i]);
+		FT2[i] = ROTL8(FT1[i]);
+		FT3[i] = ROTL8(FT2[i]);
+
+#ifndef NO_R_TABLES
+		x = RSb[i];
+
+		RT0[i] = ((uint32_t)MUL(0x0E, x)) ^
+			((uint32_t)MUL(0x09, x) << 8) ^
+			((uint32_t)MUL(0x0D, x) << 16) ^
+			((uint32_t)MUL(0x0B, x) << 24);
+
+		RT1[i] = ROTL8(RT0[i]);
+		RT2[i] = ROTL8(RT1[i]);
+		RT3[i] = ROTL8(RT2[i]);
+#endif
+	}
+}
+
+// did a little counting to understand why original mbedTLS buf is [68]
+// in set key, they generated:
+//     128 bits key: 10 rounds of += 4, plus 4 after, 44
+//     192 bits key: 8 rounds of += 6, plus 6 after, 56
+//     256 bits key: 7 rounds of += 8, plus 8 after, 64
+// and in ecb encrypt, it used:
+//     4 + 4 * 2 * 4 + 4 + 4 "++"s, 44
+//     4 + 4 * 2 * 5 + 4 + 4 "++"s, 52
+//     4 + 4 * 2 * 6 + 4 + 4 "++"s, 60
+// so they generated several bytes more in 192 and 256 modes to simplify the loop
+// "able to hold 32 extra bytes" in their comment makes sense now
+
+// Expand a 16-byte key into the RK_LEN-word AES-128 encryption key schedule.
+// The key is loaded through GET_UINT32_BE with the word offsets reversed
+// (12, 8, 4, 0), following the byte-reversed I/O convention of this file.
+// FSb and RCON are read here, so aes_gen_tables() has to run first.
+void aes_set_key_enc_128_be(uint32_t rk[RK_LEN], const unsigned char *key) {
+	GET_UINT32_BE(rk[0], key, 12);
+	GET_UINT32_BE(rk[1], key, 8);
+	GET_UINT32_BE(rk[2], key, 4);
+	GET_UINT32_BE(rk[3], key, 0);
+
+	// each of the 10 rounds derives the next four words from the previous four
+	for (unsigned round = 0; round < 10; round++) {
+		const uint32_t *cur = rk + 4 * round;
+		uint32_t *next = rk + 4 * (round + 1);
+		const uint32_t last = cur[3];
+
+		// S-box substitution of the last word with its bytes rotated
+		const uint32_t sub =
+			((uint32_t)FSb[(last >> 8) & 0xFF]) ^
+			((uint32_t)FSb[(last >> 16) & 0xFF] << 8) ^
+			((uint32_t)FSb[(last >> 24) & 0xFF] << 16) ^
+			((uint32_t)FSb[last & 0xFF] << 24);
+
+		next[0] = cur[0] ^ RCON[round] ^ sub;
+		next[1] = cur[1] ^ next[0];
+		next[2] = cur[2] ^ next[1];
+		next[3] = cur[3] ^ next[2];
+	}
+}
+
+// One forward round over four state words.
+// Reads the state Y0..Y3 and writes X0..X3: each output word is the XOR of the
+// next round-key word and four table lookups (FT0..FT3) indexed by one byte
+// taken from each of the input words.
+// Side effect: a variable named RK must be in scope at the expansion site; it
+// is dereferenced and post-incremented four times, i.e. advanced by one round.
+// The X arguments must be distinct from the Y arguments, because all four Y
+// words are still read after X0 has been assigned.
+#define AES_FROUND(X0,X1,X2,X3,Y0,Y1,Y2,Y3)     \
+{                                               \
+    X0 = *RK++ ^ FT0[ ( Y0       ) & 0xFF ] ^   \
+                 FT1[ ( Y1 >>  8 ) & 0xFF ] ^   \
+                 FT2[ ( Y2 >> 16 ) & 0xFF ] ^   \
+                 FT3[ ( Y3 >> 24 ) & 0xFF ];    \
+                                                \
+    X1 = *RK++ ^ FT0[ ( Y1       ) & 0xFF ] ^   \
+                 FT1[ ( Y2 >>  8 ) & 0xFF ] ^   \
+                 FT2[ ( Y3 >> 16 ) & 0xFF ] ^   \
+                 FT3[ ( Y0 >> 24 ) & 0xFF ];    \
+                                                \
+    X2 = *RK++ ^ FT0[ ( Y2       ) & 0xFF ] ^   \
+                 FT1[ ( Y3 >>  8 ) & 0xFF ] ^   \
+                 FT2[ ( Y0 >> 16 ) & 0xFF ] ^   \
+                 FT3[ ( Y1 >> 24 ) & 0xFF ];    \
+                                                \
+    X3 = *RK++ ^ FT0[ ( Y3       ) & 0xFF ] ^   \
+                 FT1[ ( Y0 >>  8 ) & 0xFF ] ^   \
+                 FT2[ ( Y1 >> 16 ) & 0xFF ] ^   \
+                 FT3[ ( Y2 >> 24 ) & 0xFF ];    \
+}
+
+// working state of aes_encrypt_128_be(): two sets of four state words and the
+// round-key cursor. They are file-scope objects with external linkage rather
+// than locals, so the function is not reentrant and overlapping calls would
+// share this state.
+// NOTE(review): DTCM_BSS / ITCM_CODE come from <nds.h>; presumably they place
+// data/code in the ARM9 tightly-coupled memories for speed - confirm in libnds.
+DTCM_BSS uint32_t X0, X1, X2, X3, Y0, Y1, Y2, Y3;
+DTCM_BSS const uint32_t *RK;
+
+// Encrypt one 16-byte block with an expanded AES-128 key.
+//   rk:     key schedule produced by aes_set_key_enc_128_be()
+//   input:  16 bytes to encrypt
+//   output: receives the 16 encrypted bytes
+// Input and output go through GET_UINT32_BE / PUT_UINT32_BE with the word
+// offsets reversed (12, 8, 4, 0), i.e. the block is handled byte-reversed.
+// Reads FSb and FT0..FT3, so aes_gen_tables() has to run first.
+ITCM_CODE void aes_encrypt_128_be(const uint32_t rk[RK_LEN],
+	const unsigned char input[16], unsigned char output[16])
+{
+	RK = rk;
+
+	GET_UINT32_BE(X0, input, 12);
+	GET_UINT32_BE(X1, input, 8);
+	GET_UINT32_BE(X2, input, 4);
+	GET_UINT32_BE(X3, input, 0);
+
+	// initial round-key addition (consumes rk[0..3])
+	X0 ^= *RK++;
+	X1 ^= *RK++;
+	X2 ^= *RK++;
+	X3 ^= *RK++;
+
+	// loop unrolled
+	// nine table-driven rounds, ping-ponging between the X and Y word sets;
+	// each AES_FROUND consumes four more round-key words
+	AES_FROUND(Y0, Y1, Y2, Y3, X0, X1, X2, X3);
+	AES_FROUND(X0, X1, X2, X3, Y0, Y1, Y2, Y3);
+	AES_FROUND(Y0, Y1, Y2, Y3, X0, X1, X2, X3);
+	AES_FROUND(X0, X1, X2, X3, Y0, Y1, Y2, Y3);
+	AES_FROUND(Y0, Y1, Y2, Y3, X0, X1, X2, X3);
+	AES_FROUND(X0, X1, X2, X3, Y0, Y1, Y2, Y3);
+	AES_FROUND(Y0, Y1, Y2, Y3, X0, X1, X2, X3);
+	AES_FROUND(X0, X1, X2, X3, Y0, Y1, Y2, Y3);
+	AES_FROUND(Y0, Y1, Y2, Y3, X0, X1, X2, X3);
+
+	// final round: uses the plain S-box FSb instead of the FT tables and
+	// consumes the last four round-key words (rk[40..43])
+	X0 = *RK++ ^ \
+		((uint32_t)FSb[(Y0) & 0xFF]) ^
+		((uint32_t)FSb[(Y1 >> 8) & 0xFF] << 8) ^
+		((uint32_t)FSb[(Y2 >> 16) & 0xFF] << 16) ^
+		((uint32_t)FSb[(Y3 >> 24) & 0xFF] << 24);
+
+	X1 = *RK++ ^ \
+		((uint32_t)FSb[(Y1) & 0xFF]) ^
+		((uint32_t)FSb[(Y2 >> 8) & 0xFF] << 8) ^
+		((uint32_t)FSb[(Y3 >> 16) & 0xFF] << 16) ^
+		((uint32_t)FSb[(Y0 >> 24) & 0xFF] << 24);
+
+	X2 = *RK++ ^ \
+		((uint32_t)FSb[(Y2) & 0xFF]) ^
+		((uint32_t)FSb[(Y3 >> 8) & 0xFF] << 8) ^
+		((uint32_t)FSb[(Y0 >> 16) & 0xFF] << 16) ^
+		((uint32_t)FSb[(Y1 >> 24) & 0xFF] << 24);
+
+	// removed a ++ here
+	// (RK already points at the last word of the schedule, so it is not advanced)
+	X3 = *RK ^ \
+		((uint32_t)FSb[(Y3) & 0xFF]) ^
+		((uint32_t)FSb[(Y0 >> 8) & 0xFF] << 8) ^
+		((uint32_t)FSb[(Y1 >> 16) & 0xFF] << 16) ^
+		((uint32_t)FSb[(Y2 >> 24) & 0xFF] << 24);
+
+	PUT_UINT32_BE(X0, output, 12);
+	PUT_UINT32_BE(X1, output, 8);
+	PUT_UINT32_BE(X2, output, 4);
+	PUT_UINT32_BE(X3, output, 0);
+}
+
--- a/arm9/mbedtls/aes.h
+++ b/arm9/mbedtls/aes.h
@ -0,0 +1,32 @@
+
+#pragma once
+
+#include <stdint.h>
+
+#define RK_LEN 44 //round key length
+
+// modified to work on reversed byte order input/output
+// it could also work by wrapping the original code between byte-reversed I/O, which would minimize modification to the actual AES code
+// this is just my OCD to eliminate some copy
+// original mbedTLS AES GET/PUT_UINT32 macros on little endian I/O regardless of CPU endianness
+// seems like Nintendo used big endian hardware AES with little endian CPU
+// by byte reversing on I/O, this mimics Nintendo behavior on little endian CPU
+// calling it BE is not very accurate, it becomes little endian on big endian CPU
+
+// GET_UINT32_BE(n, b, i): copy the four bytes b[i]..b[i + 3] into the 32-bit
+// object n in reverse order (b[i + 3] becomes the first byte of n in memory).
+// PUT_UINT32_BE(n, b, i): the inverse, storing the bytes of n into
+// b[i]..b[i + 3] in reverse order.
+// n must be an lvalue (its address is taken); it is evaluated several times.
+// Each macro is wrapped in do { } while (0) so it behaves as one statement
+// (e.g. under an unbraced if/else), and the index argument is parenthesized
+// so expressions can be passed safely.
+#define GET_UINT32_BE(n, b, i) \
+	do { \
+		((uint8_t*)&(n))[0] = (b)[(i) + 3]; \
+		((uint8_t*)&(n))[1] = (b)[(i) + 2]; \
+		((uint8_t*)&(n))[2] = (b)[(i) + 1]; \
+		((uint8_t*)&(n))[3] = (b)[(i) + 0]; \
+	} while (0)
+#define PUT_UINT32_BE(n, b, i) \
+	do { \
+		(b)[(i) + 0] = ((uint8_t*)&(n))[3]; \
+		(b)[(i) + 1] = ((uint8_t*)&(n))[2]; \
+		(b)[(i) + 2] = ((uint8_t*)&(n))[1]; \
+		(b)[(i) + 3] = ((uint8_t*)&(n))[0]; \
+	} while (0)
+
+void aes_gen_tables(void);
+
+void aes_set_key_enc_128_be(uint32_t rk[RK_LEN], const unsigned char *key);
+
+void aes_encrypt_128_be(const uint32_t rk[RK_LEN], const unsigned char input[16], unsigned char output[16]);
+
--- a/arm9/mbedtls/bignum.c
+++ b/arm9/mbedtls/bignum.c
--- a/arm9/mbedtls/bignum.h
+++ b/arm9/mbedtls/bignum.h
@ -0,0 +1,761 @@
+/**
+ * \file bignum.h
+ *
+ * \brief  Multi-precision integer library
+ *
+ *  Copyright (C) 2006-2015, ARM Limited, All Rights Reserved
+ *  SPDX-License-Identifier: Apache-2.0
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License"); you may
+ *  not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ *  WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ *  This file is part of mbed TLS (https://tls.mbed.org)
+ */
+#ifndef MBEDTLS_BIGNUM_H
+#define MBEDTLS_BIGNUM_H
+
+#if !defined(MBEDTLS_CONFIG_FILE)
+#include "config.h"
+#else
+#include MBEDTLS_CONFIG_FILE
+#endif
+
+#include <stddef.h>
+#include <stdint.h>
+
+#if defined(MBEDTLS_FS_IO)
+#include <stdio.h>
+#endif
+
+#define MBEDTLS_ERR_MPI_FILE_IO_ERROR                     -0x0002  /**< An error occurred while reading from or writing to a file. */
+#define MBEDTLS_ERR_MPI_BAD_INPUT_DATA                    -0x0004  /**< Bad input parameters to function. */
+#define MBEDTLS_ERR_MPI_INVALID_CHARACTER                 -0x0006  /**< There is an invalid character in the digit string. */
+#define MBEDTLS_ERR_MPI_BUFFER_TOO_SMALL                  -0x0008  /**< The buffer is too small to write to. */
+#define MBEDTLS_ERR_MPI_NEGATIVE_VALUE                    -0x000A  /**< The input arguments are negative or result in illegal output. */
+#define MBEDTLS_ERR_MPI_DIVISION_BY_ZERO                  -0x000C  /**< The input argument for division is zero, which is not allowed. */
+#define MBEDTLS_ERR_MPI_NOT_ACCEPTABLE                    -0x000E  /**< The input arguments are not acceptable. */
+#define MBEDTLS_ERR_MPI_ALLOC_FAILED                      -0x0010  /**< Memory allocation failed. */
+
+#define MBEDTLS_MPI_CHK(f) do { if( ( ret = f ) != 0 ) goto cleanup; } while( 0 )
+
+/*
+ * Maximum size MPIs are allowed to grow to in number of limbs.
+ */
+#define MBEDTLS_MPI_MAX_LIMBS                             10000
+
+#if !defined(MBEDTLS_MPI_WINDOW_SIZE)
+/*
+ * Maximum window size used for modular exponentiation. Default: 6
+ * Minimum value: 1. Maximum value: 6.
+ *
+ * Result is an array of ( 2 << MBEDTLS_MPI_WINDOW_SIZE ) MPIs used
+ * for the sliding window calculation. (So 64 by default)
+ *
+ * Reduction in size, reduces speed.
+ */
+#define MBEDTLS_MPI_WINDOW_SIZE                           6        /**< Maximum windows size used. */
+#endif /* !MBEDTLS_MPI_WINDOW_SIZE */
+
+#if !defined(MBEDTLS_MPI_MAX_SIZE)
+/*
+ * Maximum size of MPIs allowed in bits and bytes for user-MPIs.
+ * ( Default: 1024 bytes => 8192 bits, Maximum tested: 2048 bytes => 16384 bits )
+ *
+ * Note: Calculations can temporarily result in larger MPIs. So the number
+ * of limbs required (MBEDTLS_MPI_MAX_LIMBS) is higher.
+ */
+#define MBEDTLS_MPI_MAX_SIZE                              1024     /**< Maximum number of bytes for usable MPIs. */
+#endif /* !MBEDTLS_MPI_MAX_SIZE */
+
+#define MBEDTLS_MPI_MAX_BITS                              ( 8 * MBEDTLS_MPI_MAX_SIZE )    /**< Maximum number of bits for usable MPIs. */
+
+/*
+ * When reading from files with mbedtls_mpi_read_file() and writing to files with
+ * mbedtls_mpi_write_file() the buffer should have space
+ * for a (short) label, the MPI (in the provided radix), the newline
+ * characters and the '\0'.
+ *
+ * By default we assume at least a 10 char label, a minimum radix of 10
+ * (decimal) and a maximum of 4096 bit numbers (1234 decimal chars).
+ * Autosized at compile time for at least a 10 char label, a minimum radix
+ * of 10 (decimal) for a number of MBEDTLS_MPI_MAX_BITS size.
+ *
+ * This used to be statically sized to 1250 for a maximum of 4096 bit
+ * numbers (1234 decimal chars).
+ *
+ * Calculate using the formula:
+ *  MBEDTLS_MPI_RW_BUFFER_SIZE = ceil(MBEDTLS_MPI_MAX_BITS / ln(10) * ln(2)) +
+ *                                LabelSize + 6
+ */
+#define MBEDTLS_MPI_MAX_BITS_SCALE100          ( 100 * MBEDTLS_MPI_MAX_BITS )
+#define MBEDTLS_LN_2_DIV_LN_10_SCALE100                 332
+#define MBEDTLS_MPI_RW_BUFFER_SIZE             ( ((MBEDTLS_MPI_MAX_BITS_SCALE100 + MBEDTLS_LN_2_DIV_LN_10_SCALE100 - 1) / MBEDTLS_LN_2_DIV_LN_10_SCALE100) + 10 + 6 )
+
+/*
+ * Define the base integer type, architecture-wise.
+ *
+ * 32 or 64-bit integer types can be forced regardless of the underlying
+ * architecture by defining MBEDTLS_HAVE_INT32 or MBEDTLS_HAVE_INT64
+ * respectively and undefining MBEDTLS_HAVE_ASM.
+ *
+ * Double-width integers (e.g. 128-bit in 64-bit architectures) can be
+ * disabled by defining MBEDTLS_NO_UDBL_DIVISION.
+ */
+#if !defined(MBEDTLS_HAVE_INT32)
+    #if defined(_MSC_VER) && defined(_M_AMD64)
+        /* Always choose 64-bit when using MSC */
+        #if !defined(MBEDTLS_HAVE_INT64)
+            #define MBEDTLS_HAVE_INT64
+        #endif /* !MBEDTLS_HAVE_INT64 */
+        typedef  int64_t mbedtls_mpi_sint;
+        typedef uint64_t mbedtls_mpi_uint;
+    #elif defined(__GNUC__) && (                         \
+        defined(__amd64__) || defined(__x86_64__)     || \
+        defined(__ppc64__) || defined(__powerpc64__)  || \
+        defined(__ia64__)  || defined(__alpha__)      || \
+        ( defined(__sparc__) && defined(__arch64__) ) || \
+        defined(__s390x__) || defined(__mips64) )
+        #if !defined(MBEDTLS_HAVE_INT64)
+            #define MBEDTLS_HAVE_INT64
+        #endif /* MBEDTLS_HAVE_INT64 */
+        typedef  int64_t mbedtls_mpi_sint;
+        typedef uint64_t mbedtls_mpi_uint;
+        #if !defined(MBEDTLS_NO_UDBL_DIVISION)
+            /* mbedtls_t_udbl defined as 128-bit unsigned int */
+            typedef unsigned int mbedtls_t_udbl __attribute__((mode(TI)));
+            #define MBEDTLS_HAVE_UDBL
+        #endif /* !MBEDTLS_NO_UDBL_DIVISION */
+    #elif defined(__ARMCC_VERSION) && defined(__aarch64__)
+        /*
+         * __ARMCC_VERSION is defined for both armcc and armclang and
+         * __aarch64__ is only defined by armclang when compiling 64-bit code
+         */
+        #if !defined(MBEDTLS_HAVE_INT64)
+            #define MBEDTLS_HAVE_INT64
+        #endif /* !MBEDTLS_HAVE_INT64 */
+        typedef  int64_t mbedtls_mpi_sint;
+        typedef uint64_t mbedtls_mpi_uint;
+        #if !defined(MBEDTLS_NO_UDBL_DIVISION)
+            /* mbedtls_t_udbl defined as 128-bit unsigned int */
+            typedef __uint128_t mbedtls_t_udbl;
+            #define MBEDTLS_HAVE_UDBL
+        #endif /* !MBEDTLS_NO_UDBL_DIVISION */
+    #elif defined(MBEDTLS_HAVE_INT64)
+        /* Force 64-bit integers with unknown compiler */
+        typedef  int64_t mbedtls_mpi_sint;
+        typedef uint64_t mbedtls_mpi_uint;
+    #endif
+#endif /* !MBEDTLS_HAVE_INT32 */
+
+#if !defined(MBEDTLS_HAVE_INT64)
+    /* Default to 32-bit compilation */
+    #if !defined(MBEDTLS_HAVE_INT32)
+        #define MBEDTLS_HAVE_INT32
+    #endif /* !MBEDTLS_HAVE_INT32 */
+    typedef  int32_t mbedtls_mpi_sint;
+    typedef uint32_t mbedtls_mpi_uint;
+    #if !defined(MBEDTLS_NO_UDBL_DIVISION)
+        typedef uint64_t mbedtls_t_udbl;
+        #define MBEDTLS_HAVE_UDBL
+    #endif /* !MBEDTLS_NO_UDBL_DIVISION */
+#endif /* !MBEDTLS_HAVE_INT64 */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * \brief          MPI structure
+ */
+typedef struct
+{
+    int s;              /*!<  integer sign      */
+    size_t n;           /*!<  total # of limbs  */
+    mbedtls_mpi_uint *p;          /*!<  pointer to limbs  */
+}
+mbedtls_mpi;
+
+/**
+ * \brief           Initialize one MPI (make internal references valid)
+ *                  This just makes it ready to be set or freed,
+ *                  but does not define a value for the MPI.
+ *
+ * \param X         One MPI to initialize.
+ */
+void mbedtls_mpi_init( mbedtls_mpi *X );
+
+/**
+ * \brief          Unallocate one MPI
+ *
+ * \param X        One MPI to unallocate.
+ */
+void mbedtls_mpi_free( mbedtls_mpi *X );
+
+/**
+ * \brief          Enlarge to the specified number of limbs
+ *
+ * \param X        MPI to grow
+ * \param nblimbs  The target number of limbs
+ *
+ * \return         0 if successful,
+ *                 MBEDTLS_ERR_MPI_ALLOC_FAILED if memory allocation failed
+ */
+int mbedtls_mpi_grow( mbedtls_mpi *X, size_t nblimbs );
+
+/**
+ * \brief          Resize down, keeping at least the specified number of limbs
+ *
+ * \param X        MPI to shrink
+ * \param nblimbs  The minimum number of limbs to keep
+ *
+ * \return         0 if successful,
+ *                 MBEDTLS_ERR_MPI_ALLOC_FAILED if memory allocation failed
+ */
+int mbedtls_mpi_shrink( mbedtls_mpi *X, size_t nblimbs );
+
+/**
+ * \brief          Copy the contents of Y into X
+ *
+ * \param X        Destination MPI
+ * \param Y        Source MPI
+ *
+ * \return         0 if successful,
+ *                 MBEDTLS_ERR_MPI_ALLOC_FAILED if memory allocation failed
+ */
+int mbedtls_mpi_copy( mbedtls_mpi *X, const mbedtls_mpi *Y );
+
+/**
+ * \brief          Swap the contents of X and Y
+ *
+ * \param X        First MPI value
+ * \param Y        Second MPI value
+ */
+void mbedtls_mpi_swap( mbedtls_mpi *X, mbedtls_mpi *Y );
+
+/**
+ * \brief          Safe conditional assignment X = Y if assign is 1
+ *
+ * \param X        MPI to conditionally assign to
+ * \param Y        Value to be assigned
+ * \param assign   1: perform the assignment, 0: keep X's original value
+ *
+ * \return         0 if successful,
+ *                 MBEDTLS_ERR_MPI_ALLOC_FAILED if memory allocation failed,
+ *
+ * \note           This function is equivalent to
+ *                      if( assign ) mbedtls_mpi_copy( X, Y );
+ *                 except that it avoids leaking any information about whether
+ *                 the assignment was done or not (the above code may leak
+ *                 information through branch prediction and/or memory access
+ *                 patterns analysis).
+ */
+int mbedtls_mpi_safe_cond_assign( mbedtls_mpi *X, const mbedtls_mpi *Y, unsigned char assign );
+
+/**
+ * \brief          Safe conditional swap X <-> Y if swap is 1
+ *
+ * \param X        First mbedtls_mpi value
+ * \param Y        Second mbedtls_mpi value
+ * \param assign   1: perform the swap, 0: keep X and Y's original values
+ *
+ * \return         0 if successful,
+ *                 MBEDTLS_ERR_MPI_ALLOC_FAILED if memory allocation failed,
+ *
+ * \note           This function is equivalent to
+ *                      if( assign ) mbedtls_mpi_swap( X, Y );
+ *                 except that it avoids leaking any information about whether
+ *                 the assignment was done or not (the above code may leak
+ *                 information through branch prediction and/or memory access
+ *                 patterns analysis).
+ */
+int mbedtls_mpi_safe_cond_swap( mbedtls_mpi *X, mbedtls_mpi *Y, unsigned char assign );
+
+/**
+ * \brief          Set value from integer
+ *
+ * \param X        MPI to set
+ * \param z        Value to use
+ *
+ * \return         0 if successful,
+ *                 MBEDTLS_ERR_MPI_ALLOC_FAILED if memory allocation failed
+ */
+int mbedtls_mpi_lset( mbedtls_mpi *X, mbedtls_mpi_sint z );
+
+/**
+ * \brief          Get a specific bit from X
+ *
+ * \param X        MPI to use
+ * \param pos      Zero-based index of the bit in X
+ *
+ * \return         Either a 0 or a 1
+ */
+int mbedtls_mpi_get_bit( const mbedtls_mpi *X, size_t pos );
+
+/**
+ * \brief          Set a bit of X to a specific value of 0 or 1
+ *
+ * \note           Will grow X if necessary to set a bit to 1 in a not yet
+ *                 existing limb. Will not grow if bit should be set to 0
+ *
+ * \param X        MPI to use
+ * \param pos      Zero-based index of the bit in X
+ * \param val      The value to set the bit to (0 or 1)
+ *
+ * \return         0 if successful,
+ *                 MBEDTLS_ERR_MPI_ALLOC_FAILED if memory allocation failed,
+ *                 MBEDTLS_ERR_MPI_BAD_INPUT_DATA if val is not 0 or 1
+ */
+int mbedtls_mpi_set_bit( mbedtls_mpi *X, size_t pos, unsigned char val );
+
+/**
+ * \brief          Return the number of zero-bits before the least significant
+ *                 '1' bit
+ *
+ * Note: Thus also the zero-based index of the least significant '1' bit
+ *
+ * \param X        MPI to use
+ */
+size_t mbedtls_mpi_lsb( const mbedtls_mpi *X );
+
+/**
+ * \brief          Return the number of bits up to and including the most
+ *                 significant '1' bit
+ *
+ * Note: Thus also the one-based index of the most significant '1' bit
+ *
+ * \param X        MPI to use
+ */
+size_t mbedtls_mpi_bitlen( const mbedtls_mpi *X );
+
+/**
+ * \brief          Return the total size in bytes
+ *
+ * \param X        MPI to use
+ */
+size_t mbedtls_mpi_size( const mbedtls_mpi *X );
+
+/**
+ * \brief          Import from an ASCII string
+ *
+ * \param X        Destination MPI
+ * \param radix    Input numeric base
+ * \param s        Null-terminated string buffer
+ *
+ * \return         0 if successful, or a MBEDTLS_ERR_MPI_XXX error code
+ */
+int mbedtls_mpi_read_string( mbedtls_mpi *X, int radix, const char *s );
+
+/**
+ * \brief          Export into an ASCII string
+ *
+ * \param X        Source MPI
+ * \param radix    Output numeric base
+ * \param buf      Buffer to write the string to
+ * \param buflen   Length of buf
+ * \param olen     Length of the string written, including final NUL byte
+ *
+ * \return         0 if successful, or a MBEDTLS_ERR_MPI_XXX error code.
+ *                 *olen is always updated to reflect the amount
+ *                 of data that has (or would have) been written.
+ *
+ * \note           Call this function with buflen = 0 to obtain the
+ *                 minimum required buffer size in *olen.
+ */
+int mbedtls_mpi_write_string( const mbedtls_mpi *X, int radix,
+                              char *buf, size_t buflen, size_t *olen );
+
+#if defined(MBEDTLS_FS_IO)
+/**
+ * \brief          Read MPI from a line in an opened file
+ *
+ * \param X        Destination MPI
+ * \param radix    Input numeric base
+ * \param fin      Input file handle
+ *
+ * \return         0 if successful, MBEDTLS_ERR_MPI_BUFFER_TOO_SMALL if
+ *                 the file read buffer is too small or a
+ *                 MBEDTLS_ERR_MPI_XXX error code
+ *
+ * \note           On success, this function advances the file stream
+ *                 to the end of the current line or to EOF.
+ *
+ *                 The function returns 0 on an empty line.
+ *
+ *                 Leading whitespaces are ignored, as is a
+ *                 '0x' prefix for radix 16.
+ *
+ */
+int mbedtls_mpi_read_file( mbedtls_mpi *X, int radix, FILE *fin );
+
+/**
+ * \brief          Write X into an opened file, or stdout if fout is NULL
+ *
+ * \param p        Prefix, can be NULL
+ * \param X        Source MPI
+ * \param radix    Output numeric base
+ * \param fout     Output file handle (can be NULL)
+ *
+ * \return         0 if successful, or a MBEDTLS_ERR_MPI_XXX error code
+ *
+ * \note           Set fout == NULL to print X on the console.
+ */
+int mbedtls_mpi_write_file( const char *p, const mbedtls_mpi *X, int radix, FILE *fout );
+#endif /* MBEDTLS_FS_IO */
+
+/**
+ * \brief          Import X from unsigned binary data, big endian
+ *
+ * \param X        Destination MPI
+ * \param buf      Input buffer
+ * \param buflen   Input buffer size
+ *
+ * \return         0 if successful,
+ *                 MBEDTLS_ERR_MPI_ALLOC_FAILED if memory allocation failed
+ */
+int mbedtls_mpi_read_binary( mbedtls_mpi *X, const unsigned char *buf, size_t buflen );
+
+/**
+ * \brief          Export X into unsigned binary data, big endian.
+ *                 Always fills the whole buffer, which will start with zeros
+ *                 if the number is smaller.
+ *
+ * \param X        Source MPI
+ * \param buf      Output buffer
+ * \param buflen   Output buffer size
+ *
+ * \return         0 if successful,
+ *                 MBEDTLS_ERR_MPI_BUFFER_TOO_SMALL if buf isn't large enough
+ */
+int mbedtls_mpi_write_binary( const mbedtls_mpi *X, unsigned char *buf, size_t buflen );
+
+/**
+ * \brief          Left-shift: X <<= count
+ *
+ * \param X        MPI to shift
+ * \param count    Amount to shift
+ *
+ * \return         0 if successful,
+ *                 MBEDTLS_ERR_MPI_ALLOC_FAILED if memory allocation failed
+ */
+int mbedtls_mpi_shift_l( mbedtls_mpi *X, size_t count );
+
+/**
+ * \brief          Right-shift: X >>= count
+ *
+ * \param X        MPI to shift
+ * \param count    Amount to shift
+ *
+ * \return         0 if successful,
+ *                 MBEDTLS_ERR_MPI_ALLOC_FAILED if memory allocation failed
+ */
+int mbedtls_mpi_shift_r( mbedtls_mpi *X, size_t count );
+
+/**
+ * \brief          Compare unsigned values
+ *
+ * \param X        Left-hand MPI
+ * \param Y        Right-hand MPI
+ *
+ * \return         1 if |X| is greater than |Y|,
+ *                -1 if |X| is lesser  than |Y| or
+ *                 0 if |X| is equal to |Y|
+ */
+int mbedtls_mpi_cmp_abs( const mbedtls_mpi *X, const mbedtls_mpi *Y );
+
+/**
+ * \brief          Compare signed values
+ *
+ * \param X        Left-hand MPI
+ * \param Y        Right-hand MPI
+ *
+ * \return         1 if X is greater than Y,
+ *                -1 if X is lesser  than Y or
+ *                 0 if X is equal to Y
+ */
+int mbedtls_mpi_cmp_mpi( const mbedtls_mpi *X, const mbedtls_mpi *Y );
+
+/**
+ * \brief          Compare signed values
+ *
+ * \param X        Left-hand MPI
+ * \param z        The integer value to compare to
+ *
+ * \return         1 if X is greater than z,
+ *                -1 if X is lesser  than z or
+ *                 0 if X is equal to z
+ */
+int mbedtls_mpi_cmp_int( const mbedtls_mpi *X, mbedtls_mpi_sint z );
+
+/**
+ * \brief          Unsigned addition: X = |A| + |B|
+ *
+ * \param X        Destination MPI
+ * \param A        Left-hand MPI
+ * \param B        Right-hand MPI
+ *
+ * \return         0 if successful,
+ *                 MBEDTLS_ERR_MPI_ALLOC_FAILED if memory allocation failed
+ */
+int mbedtls_mpi_add_abs( mbedtls_mpi *X, const mbedtls_mpi *A, const mbedtls_mpi *B );
+
+/**
+ * \brief          Unsigned subtraction: X = |A| - |B|
+ *
+ * \param X        Destination MPI
+ * \param A        Left-hand MPI
+ * \param B        Right-hand MPI
+ *
+ * \return         0 if successful,
+ *                 MBEDTLS_ERR_MPI_NEGATIVE_VALUE if B is greater than A
+ */
+int mbedtls_mpi_sub_abs( mbedtls_mpi *X, const mbedtls_mpi *A, const mbedtls_mpi *B );
+
+/**
+ * \brief          Signed addition: X = A + B
+ *
+ * \param X        Destination MPI
+ * \param A        Left-hand MPI
+ * \param B        Right-hand MPI
+ *
+ * \return         0 if successful,
+ *                 MBEDTLS_ERR_MPI_ALLOC_FAILED if memory allocation failed
+ */
+int mbedtls_mpi_add_mpi( mbedtls_mpi *X, const mbedtls_mpi *A, const mbedtls_mpi *B );
+
+/**
+ * \brief          Signed subtraction: X = A - B
+ *
+ * \param X        Destination MPI
+ * \param A        Left-hand MPI
+ * \param B        Right-hand MPI
+ *
+ * \return         0 if successful,
+ *                 MBEDTLS_ERR_MPI_ALLOC_FAILED if memory allocation failed
+ */
+int mbedtls_mpi_sub_mpi( mbedtls_mpi *X, const mbedtls_mpi *A, const mbedtls_mpi *B );
+
+/**
+ * \brief          Signed addition: X = A + b
+ *
+ * \param X        Destination MPI
+ * \param A        Left-hand MPI
+ * \param b        The integer value to add
+ *
+ * \return         0 if successful,
+ *                 MBEDTLS_ERR_MPI_ALLOC_FAILED if memory allocation failed
+ */
+int mbedtls_mpi_add_int( mbedtls_mpi *X, const mbedtls_mpi *A, mbedtls_mpi_sint b );
+
+/**
+ * \brief          Signed subtraction: X = A - b
+ *
+ * \param X        Destination MPI
+ * \param A        Left-hand MPI
+ * \param b        The integer value to subtract
+ *
+ * \return         0 if successful,
+ *                 MBEDTLS_ERR_MPI_ALLOC_FAILED if memory allocation failed
+ */
+int mbedtls_mpi_sub_int( mbedtls_mpi *X, const mbedtls_mpi *A, mbedtls_mpi_sint b );
+
+/**
+ * \brief          Baseline multiplication: X = A * B
+ *
+ * \param X        Destination MPI
+ * \param A        Left-hand MPI
+ * \param B        Right-hand MPI
+ *
+ * \return         0 if successful,
+ *                 MBEDTLS_ERR_MPI_ALLOC_FAILED if memory allocation failed
+ */
+int mbedtls_mpi_mul_mpi( mbedtls_mpi *X, const mbedtls_mpi *A, const mbedtls_mpi *B );
+
+/**
+ * \brief          Baseline multiplication: X = A * b
+ *
+ * \param X        Destination MPI
+ * \param A        Left-hand MPI
+ * \param b        The unsigned integer value to multiply with
+ *
+ * \note           b is unsigned
+ *
+ * \return         0 if successful,
+ *                 MBEDTLS_ERR_MPI_ALLOC_FAILED if memory allocation failed
+ */
+int mbedtls_mpi_mul_int( mbedtls_mpi *X, const mbedtls_mpi *A, mbedtls_mpi_uint b );
+
+/**
+ * \brief          Division by mbedtls_mpi: A = Q * B + R
+ *
+ * \param Q        Destination MPI for the quotient
+ * \param R        Destination MPI for the rest value
+ * \param A        Left-hand MPI
+ * \param B        Right-hand MPI
+ *
+ * \return         0 if successful,
+ *                 MBEDTLS_ERR_MPI_ALLOC_FAILED if memory allocation failed,
+ *                 MBEDTLS_ERR_MPI_DIVISION_BY_ZERO if B == 0
+ *
+ * \note           Either Q or R can be NULL.
+ */
+int mbedtls_mpi_div_mpi( mbedtls_mpi *Q, mbedtls_mpi *R, const mbedtls_mpi *A, const mbedtls_mpi *B );
+
+/**
+ * \brief          Division by int: A = Q * b + R
+ *
+ * \param Q        Destination MPI for the quotient
+ * \param R        Destination MPI for the remainder
+ * \param A        Left-hand MPI
+ * \param b        Integer to divide by
+ *
+ * \return         0 if successful,
+ *                 MBEDTLS_ERR_MPI_ALLOC_FAILED if memory allocation failed,
+ *                 MBEDTLS_ERR_MPI_DIVISION_BY_ZERO if b == 0
+ *
+ * \note           Either Q or R can be NULL.
+ */
+int mbedtls_mpi_div_int( mbedtls_mpi *Q, mbedtls_mpi *R, const mbedtls_mpi *A, mbedtls_mpi_sint b );
+
+/**
+ * \brief          Modulo: R = A mod B
+ *
+ * \param R        Destination MPI for the remainder
+ * \param A        Left-hand MPI
+ * \param B        Right-hand MPI
+ *
+ * \return         0 if successful,
+ *                 MBEDTLS_ERR_MPI_ALLOC_FAILED if memory allocation failed,
+ *                 MBEDTLS_ERR_MPI_DIVISION_BY_ZERO if B == 0,
+ *                 MBEDTLS_ERR_MPI_NEGATIVE_VALUE if B < 0
+ */
+int mbedtls_mpi_mod_mpi( mbedtls_mpi *R, const mbedtls_mpi *A, const mbedtls_mpi *B );
+
+/**
+ * \brief          Modulo: r = A mod b
+ *
+ * \param r        Destination mbedtls_mpi_uint
+ * \param A        Left-hand MPI
+ * \param b        Integer to divide by
+ *
+ * \return         0 if successful,
+ *                 MBEDTLS_ERR_MPI_ALLOC_FAILED if memory allocation failed,
+ *                 MBEDTLS_ERR_MPI_DIVISION_BY_ZERO if b == 0,
+ *                 MBEDTLS_ERR_MPI_NEGATIVE_VALUE if b < 0
+ */
+int mbedtls_mpi_mod_int( mbedtls_mpi_uint *r, const mbedtls_mpi *A, mbedtls_mpi_sint b );
+
+/**
+ * \brief          Sliding-window exponentiation: X = A^E mod N
+ *
+ * \param X        Destination MPI
+ * \param A        Left-hand MPI
+ * \param E        Exponent MPI
+ * \param N        Modular MPI
+ * \param _RR      Speed-up MPI used for recalculations
+ *
+ * \return         0 if successful,
+ *                 MBEDTLS_ERR_MPI_ALLOC_FAILED if memory allocation failed,
+ *                 MBEDTLS_ERR_MPI_BAD_INPUT_DATA if N is negative or even or
+ *                 if E is negative
+ *
+ * \note           _RR is used to avoid re-computing R*R mod N across
+ *                 multiple calls, which speeds up things a bit. It can
+ *                 be set to NULL if the extra performance is unneeded.
+ */
+int mbedtls_mpi_exp_mod( mbedtls_mpi *X, const mbedtls_mpi *A, const mbedtls_mpi *E, const mbedtls_mpi *N, mbedtls_mpi *_RR );
+
+/**
+ * \brief          Fill an MPI X with size bytes of random data
+ *
+ * \param X        Destination MPI
+ * \param size     Size in bytes
+ * \param f_rng    RNG function
+ * \param p_rng    RNG parameter
+ *
+ * \return         0 if successful,
+ *                 MBEDTLS_ERR_MPI_ALLOC_FAILED if memory allocation failed
+ */
+int mbedtls_mpi_fill_random( mbedtls_mpi *X, size_t size,
+                     int (*f_rng)(void *, unsigned char *, size_t),
+                     void *p_rng );
+
+/**
+ * \brief          Greatest common divisor: G = gcd(A, B)
+ *
+ * \param G        Destination MPI
+ * \param A        Left-hand MPI
+ * \param B        Right-hand MPI
+ *
+ * \return         0 if successful,
+ *                 MBEDTLS_ERR_MPI_ALLOC_FAILED if memory allocation failed
+ */
+int mbedtls_mpi_gcd( mbedtls_mpi *G, const mbedtls_mpi *A, const mbedtls_mpi *B );
+
+/**
+ * \brief          Modular inverse: X = A^-1 mod N
+ *
+ * \param X        Destination MPI
+ * \param A        Left-hand MPI
+ * \param N        Right-hand MPI
+ *
+ * \return         0 if successful,
+ *                 MBEDTLS_ERR_MPI_ALLOC_FAILED if memory allocation failed,
+ *                 MBEDTLS_ERR_MPI_BAD_INPUT_DATA if N is <= 1,
+ *                 MBEDTLS_ERR_MPI_NOT_ACCEPTABLE if A has no inverse mod N.
+ */
+int mbedtls_mpi_inv_mod( mbedtls_mpi *X, const mbedtls_mpi *A, const mbedtls_mpi *N );
+
+/**
+ * \brief          Miller-Rabin primality test
+ *
+ * \param X        MPI to check
+ * \param f_rng    RNG function
+ * \param p_rng    RNG parameter
+ *
+ * \return         0 if successful (probably prime),
+ *                 MBEDTLS_ERR_MPI_ALLOC_FAILED if memory allocation failed,
+ *                 MBEDTLS_ERR_MPI_NOT_ACCEPTABLE if X is not prime
+ */
+int mbedtls_mpi_is_prime( const mbedtls_mpi *X,
+                  int (*f_rng)(void *, unsigned char *, size_t),
+                  void *p_rng );
+
+/**
+ * \brief          Prime number generation
+ *
+ * \param X        Destination MPI
+ * \param nbits    Required size of X in bits
+ *                 ( 3 <= nbits <= MBEDTLS_MPI_MAX_BITS )
+ * \param dh_flag  If 1, then (X-1)/2 will be prime too
+ * \param f_rng    RNG function
+ * \param p_rng    RNG parameter
+ *
+ * \return         0 if successful (probably prime),
+ *                 MBEDTLS_ERR_MPI_ALLOC_FAILED if memory allocation failed,
+ *                 MBEDTLS_ERR_MPI_BAD_INPUT_DATA if nbits is < 3
+ */
+int mbedtls_mpi_gen_prime( mbedtls_mpi *X, size_t nbits, int dh_flag,
+                   int (*f_rng)(void *, unsigned char *, size_t),
+                   void *p_rng );
+
+/**
+ * \brief          Checkup routine
+ *
+ * \return         0 if successful, or 1 if the test failed
+ */
+int mbedtls_mpi_self_test( int verbose );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* bignum.h */
--- a/arm9/mbedtls/bn_mul.h
+++ b/arm9/mbedtls/bn_mul.h
@ -0,0 +1,887 @@
+/**
+ * \file bn_mul.h
+ *
+ * \brief  Multi-precision integer library
+ *
+ *  Copyright (C) 2006-2015, ARM Limited, All Rights Reserved
+ *  SPDX-License-Identifier: Apache-2.0
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License"); you may
+ *  not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ *  WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ *  This file is part of mbed TLS (https://tls.mbed.org)
+ */
+/*
+ *      Multiply source vector [s] with b, add result
+ *       to destination vector [d] and set carry c.
+ *
+ *      Currently supports:
+ *
+ *         . IA-32 (386+)         . AMD64 / EM64T
+ *         . IA-32 (SSE2)         . Motorola 68000
+ *         . PowerPC, 32-bit      . MicroBlaze
+ *         . PowerPC, 64-bit      . TriCore
+ *         . SPARC v8             . ARM v3+
+ *         . Alpha                . MIPS32
+ *         . C, longlong          . C, generic
+ */
+#ifndef MBEDTLS_BN_MUL_H
+#define MBEDTLS_BN_MUL_H
+
+#include "bignum.h"
+
+#if defined(MBEDTLS_HAVE_ASM)
+
+#ifndef asm
+#define asm __asm
+#endif
+
+/* armcc5 --gnu defines __GNUC__ but doesn't support GNU's extended asm */
+#if defined(__GNUC__) && \
+    ( !defined(__ARMCC_VERSION) || __ARMCC_VERSION >= 6000000 )
+#if defined(__i386__)
+
+#define MULADDC_INIT                        \
+    asm(                                    \
+        "movl   %%ebx, %0           \n\t"   \
+        "movl   %5, %%esi           \n\t"   \
+        "movl   %6, %%edi           \n\t"   \
+        "movl   %7, %%ecx           \n\t"   \
+        "movl   %8, %%ebx           \n\t"
+
+#define MULADDC_CORE                        \
+        "lodsl                      \n\t"   \
+        "mull   %%ebx               \n\t"   \
+        "addl   %%ecx,   %%eax      \n\t"   \
+        "adcl   $0,      %%edx      \n\t"   \
+        "addl   (%%edi), %%eax      \n\t"   \
+        "adcl   $0,      %%edx      \n\t"   \
+        "movl   %%edx,   %%ecx      \n\t"   \
+        "stosl                      \n\t"
+
+#if defined(MBEDTLS_HAVE_SSE2)
+
+#define MULADDC_HUIT                            \
+        "movd     %%ecx,     %%mm1      \n\t"   \
+        "movd     %%ebx,     %%mm0      \n\t"   \
+        "movd     (%%edi),   %%mm3      \n\t"   \
+        "paddq    %%mm3,     %%mm1      \n\t"   \
+        "movd     (%%esi),   %%mm2      \n\t"   \
+        "pmuludq  %%mm0,     %%mm2      \n\t"   \
+        "movd     4(%%esi),  %%mm4      \n\t"   \
+        "pmuludq  %%mm0,     %%mm4      \n\t"   \
+        "movd     8(%%esi),  %%mm6      \n\t"   \
+        "pmuludq  %%mm0,     %%mm6      \n\t"   \
+        "movd     12(%%esi), %%mm7      \n\t"   \
+        "pmuludq  %%mm0,     %%mm7      \n\t"   \
+        "paddq    %%mm2,     %%mm1      \n\t"   \
+        "movd     4(%%edi),  %%mm3      \n\t"   \
+        "paddq    %%mm4,     %%mm3      \n\t"   \
+        "movd     8(%%edi),  %%mm5      \n\t"   \
+        "paddq    %%mm6,     %%mm5      \n\t"   \
+        "movd     12(%%edi), %%mm4      \n\t"   \
+        "paddq    %%mm4,     %%mm7      \n\t"   \
+        "movd     %%mm1,     (%%edi)    \n\t"   \
+        "movd     16(%%esi), %%mm2      \n\t"   \
+        "pmuludq  %%mm0,     %%mm2      \n\t"   \
+        "psrlq    $32,       %%mm1      \n\t"   \
+        "movd     20(%%esi), %%mm4      \n\t"   \
+        "pmuludq  %%mm0,     %%mm4      \n\t"   \
+        "paddq    %%mm3,     %%mm1      \n\t"   \
+        "movd     24(%%esi), %%mm6      \n\t"   \
+        "pmuludq  %%mm0,     %%mm6      \n\t"   \
+        "movd     %%mm1,     4(%%edi)   \n\t"   \
+        "psrlq    $32,       %%mm1      \n\t"   \
+        "movd     28(%%esi), %%mm3      \n\t"   \
+        "pmuludq  %%mm0,     %%mm3      \n\t"   \
+        "paddq    %%mm5,     %%mm1      \n\t"   \
+        "movd     16(%%edi), %%mm5      \n\t"   \
+        "paddq    %%mm5,     %%mm2      \n\t"   \
+        "movd     %%mm1,     8(%%edi)   \n\t"   \
+        "psrlq    $32,       %%mm1      \n\t"   \
+        "paddq    %%mm7,     %%mm1      \n\t"   \
+        "movd     20(%%edi), %%mm5      \n\t"   \
+        "paddq    %%mm5,     %%mm4      \n\t"   \
+        "movd     %%mm1,     12(%%edi)  \n\t"   \
+        "psrlq    $32,       %%mm1      \n\t"   \
+        "paddq    %%mm2,     %%mm1      \n\t"   \
+        "movd     24(%%edi), %%mm5      \n\t"   \
+        "paddq    %%mm5,     %%mm6      \n\t"   \
+        "movd     %%mm1,     16(%%edi)  \n\t"   \
+        "psrlq    $32,       %%mm1      \n\t"   \
+        "paddq    %%mm4,     %%mm1      \n\t"   \
+        "movd     28(%%edi), %%mm5      \n\t"   \
+        "paddq    %%mm5,     %%mm3      \n\t"   \
+        "movd     %%mm1,     20(%%edi)  \n\t"   \
+        "psrlq    $32,       %%mm1      \n\t"   \
+        "paddq    %%mm6,     %%mm1      \n\t"   \
+        "movd     %%mm1,     24(%%edi)  \n\t"   \
+        "psrlq    $32,       %%mm1      \n\t"   \
+        "paddq    %%mm3,     %%mm1      \n\t"   \
+        "movd     %%mm1,     28(%%edi)  \n\t"   \
+        "addl     $32,       %%edi      \n\t"   \
+        "addl     $32,       %%esi      \n\t"   \
+        "psrlq    $32,       %%mm1      \n\t"   \
+        "movd     %%mm1,     %%ecx      \n\t"
+
+#define MULADDC_STOP                    \
+        "emms                   \n\t"   \
+        "movl   %4, %%ebx       \n\t"   \
+        "movl   %%ecx, %1       \n\t"   \
+        "movl   %%edi, %2       \n\t"   \
+        "movl   %%esi, %3       \n\t"   \
+        : "=m" (t), "=m" (c), "=m" (d), "=m" (s)        \
+        : "m" (t), "m" (s), "m" (d), "m" (c), "m" (b)   \
+        : "eax", "ecx", "edx", "esi", "edi"             \
+    );
+
+#else
+
+#define MULADDC_STOP                    \
+        "movl   %4, %%ebx       \n\t"   \
+        "movl   %%ecx, %1       \n\t"   \
+        "movl   %%edi, %2       \n\t"   \
+        "movl   %%esi, %3       \n\t"   \
+        : "=m" (t), "=m" (c), "=m" (d), "=m" (s)        \
+        : "m" (t), "m" (s), "m" (d), "m" (c), "m" (b)   \
+        : "eax", "ecx", "edx", "esi", "edi"             \
+    );
+#endif /* SSE2 */
+#endif /* i386 */
+
+#if defined(__amd64__) || defined (__x86_64__)
+
+/*
+ * x86-64 variant. The constraints in MULADDC_STOP bind the operands to fixed
+ * registers: c in rcx, d in rdi, s in rsi (all read-write) and b in rbx.
+ *
+ * MULADDC_INIT opens the asm statement and zeroes r8, which MULADDC_CORE
+ * uses as a constant-zero source.
+ */
+#define MULADDC_INIT                        \
+    asm(                                    \
+        "xorq   %%r8, %%r8          \n\t"
+
+/*
+ * One limb step: multiply *s by b into rdx:rax, advance s, add the incoming
+ * carry (rcx) to the low half, add that low half into *d, collect the high
+ * half plus both carry-outs in rcx as the next carry, and advance d.
+ */
+#define MULADDC_CORE                        \
+        "movq   (%%rsi), %%rax      \n\t"   \
+        "mulq   %%rbx               \n\t"   \
+        "addq   $8,      %%rsi      \n\t"   \
+        "addq   %%rcx,   %%rax      \n\t"   \
+        "movq   %%r8,    %%rcx      \n\t"   \
+        "adcq   $0,      %%rdx      \n\t"   \
+        "nop                        \n\t"   \
+        "addq   %%rax,   (%%rdi)    \n\t"   \
+        "adcq   %%rdx,   %%rcx      \n\t"   \
+        "addq   $8,      %%rdi      \n\t"
+
+/*
+ * Close the asm statement. No trailing instructions are needed because c, d
+ * and s already live in their bound registers; only the operand constraints
+ * and the clobber list (rax, rdx, r8) follow.
+ */
+#define MULADDC_STOP                        \
+        : "+c" (c), "+D" (d), "+S" (s)      \
+        : "b" (b)                           \
+        : "rax", "rdx", "r8"                \
+    );
+
+#endif /* AMD64 */
+
+#if defined(__mc68020__) || defined(__mcpu32__)
+
+#define MULADDC_INIT                    \
+    asm(                                \
+        "movl   %3, %%a2        \n\t"   \
+        "movl   %4, %%a3        \n\t"   \
+        "movl   %5, %%d3        \n\t"   \
+        "movl   %6, %%d2        \n\t"   \
+        "moveq  #0, %%d0        \n\t"
+
+#define MULADDC_CORE                    \
+        "movel  %%a2@+, %%d1    \n\t"   \
+        "mulul  %%d2, %%d4:%%d1 \n\t"   \
+        "addl   %%d3, %%d1      \n\t"   \
+        "addxl  %%d0, %%d4      \n\t"   \
+        "moveq  #0,   %%d3      \n\t"   \
+        "addl   %%d1, %%a3@+    \n\t"   \
+        "addxl  %%d4, %%d3      \n\t"
+
+#define MULADDC_STOP                    \
+        "movl   %%d3, %0        \n\t"   \
+        "movl   %%a3, %1        \n\t"   \
+        "movl   %%a2, %2        \n\t"   \
+        : "=m" (c), "=m" (d), "=m" (s)              \
+        : "m" (s), "m" (d), "m" (c), "m" (b)        \
+        : "d0", "d1", "d2", "d3", "d4", "a2", "a3"  \
+    );
+
+#define MULADDC_HUIT                        \
+        "movel  %%a2@+,  %%d1       \n\t"   \
+        "mulul  %%d2,    %%d4:%%d1  \n\t"   \
+        "addxl  %%d3,    %%d1       \n\t"   \
+        "addxl  %%d0,    %%d4       \n\t"   \
+        "addl   %%d1,    %%a3@+     \n\t"   \
+        "movel  %%a2@+,  %%d1       \n\t"   \
+        "mulul  %%d2,    %%d3:%%d1  \n\t"   \
+        "addxl  %%d4,    %%d1       \n\t"   \
+        "addxl  %%d0,    %%d3       \n\t"   \
+        "addl   %%d1,    %%a3@+     \n\t"   \
+        "movel  %%a2@+,  %%d1       \n\t"   \
+        "mulul  %%d2,    %%d4:%%d1  \n\t"   \
+        "addxl  %%d3,    %%d1       \n\t"   \
+        "addxl  %%d0,    %%d4       \n\t"   \
+        "addl   %%d1,    %%a3@+     \n\t"   \
+        "movel  %%a2@+,  %%d1       \n\t"   \
+        "mulul  %%d2,    %%d3:%%d1  \n\t"   \
+        "addxl  %%d4,    %%d1       \n\t"   \
+        "addxl  %%d0,    %%d3       \n\t"   \
+        "addl   %%d1,    %%a3@+     \n\t"   \
+        "movel  %%a2@+,  %%d1       \n\t"   \
+        "mulul  %%d2,    %%d4:%%d1  \n\t"   \
+        "addxl  %%d3,    %%d1       \n\t"   \
+        "addxl  %%d0,    %%d4       \n\t"   \
+        "addl   %%d1,    %%a3@+     \n\t"   \
+        "movel  %%a2@+,  %%d1       \n\t"   \
+        "mulul  %%d2,    %%d3:%%d1  \n\t"   \
+        "addxl  %%d4,    %%d1       \n\t"   \
+        "addxl  %%d0,    %%d3       \n\t"   \
+        "addl   %%d1,    %%a3@+     \n\t"   \
+        "movel  %%a2@+,  %%d1       \n\t"   \
+        "mulul  %%d2,    %%d4:%%d1  \n\t"   \
+        "addxl  %%d3,    %%d1       \n\t"   \
+        "addxl  %%d0,    %%d4       \n\t"   \
+        "addl   %%d1,    %%a3@+     \n\t"   \
+        "movel  %%a2@+,  %%d1       \n\t"   \
+        "mulul  %%d2,    %%d3:%%d1  \n\t"   \
+        "addxl  %%d4,    %%d1       \n\t"   \
+        "addxl  %%d0,    %%d3       \n\t"   \
+        "addl   %%d1,    %%a3@+     \n\t"   \
+        "addxl  %%d0,    %%d3       \n\t"
+
+#endif /* MC68000 */
+
+#if defined(__powerpc64__) || defined(__ppc64__)
+
+#if defined(__MACH__) && defined(__APPLE__)
+
+#define MULADDC_INIT                        \
+    asm(                                    \
+        "ld     r3, %3              \n\t"   \
+        "ld     r4, %4              \n\t"   \
+        "ld     r5, %5              \n\t"   \
+        "ld     r6, %6              \n\t"   \
+        "addi   r3, r3, -8          \n\t"   \
+        "addi   r4, r4, -8          \n\t"   \
+        "addic  r5, r5,  0          \n\t"
+
+#define MULADDC_CORE                        \
+        "ldu    r7, 8(r3)           \n\t"   \
+        "mulld  r8, r7, r6          \n\t"   \
+        "mulhdu r9, r7, r6          \n\t"   \
+        "adde   r8, r8, r5          \n\t"   \
+        "ld     r7, 8(r4)           \n\t"   \
+        "addze  r5, r9              \n\t"   \
+        "addc   r8, r8, r7          \n\t"   \
+        "stdu   r8, 8(r4)           \n\t"
+
+#define MULADDC_STOP                        \
+        "addze  r5, r5              \n\t"   \
+        "addi   r4, r4, 8           \n\t"   \
+        "addi   r3, r3, 8           \n\t"   \
+        "std    r5, %0              \n\t"   \
+        "std    r4, %1              \n\t"   \
+        "std    r3, %2              \n\t"   \
+        : "=m" (c), "=m" (d), "=m" (s)              \
+        : "m" (s), "m" (d), "m" (c), "m" (b)        \
+        : "r3", "r4", "r5", "r6", "r7", "r8", "r9"  \
+    );
+
+
+#else /* __MACH__ && __APPLE__ */
+
+#define MULADDC_INIT                        \
+    asm(                                    \
+        "ld     %%r3, %3            \n\t"   \
+        "ld     %%r4, %4            \n\t"   \
+        "ld     %%r5, %5            \n\t"   \
+        "ld     %%r6, %6            \n\t"   \
+        "addi   %%r3, %%r3, -8      \n\t"   \
+        "addi   %%r4, %%r4, -8      \n\t"   \
+        "addic  %%r5, %%r5,  0      \n\t"
+
+#define MULADDC_CORE                        \
+        "ldu    %%r7, 8(%%r3)       \n\t"   \
+        "mulld  %%r8, %%r7, %%r6    \n\t"   \
+        "mulhdu %%r9, %%r7, %%r6    \n\t"   \
+        "adde   %%r8, %%r8, %%r5    \n\t"   \
+        "ld     %%r7, 8(%%r4)       \n\t"   \
+        "addze  %%r5, %%r9          \n\t"   \
+        "addc   %%r8, %%r8, %%r7    \n\t"   \
+        "stdu   %%r8, 8(%%r4)       \n\t"
+
+#define MULADDC_STOP                        \
+        "addze  %%r5, %%r5          \n\t"   \
+        "addi   %%r4, %%r4, 8       \n\t"   \
+        "addi   %%r3, %%r3, 8       \n\t"   \
+        "std    %%r5, %0            \n\t"   \
+        "std    %%r4, %1            \n\t"   \
+        "std    %%r3, %2            \n\t"   \
+        : "=m" (c), "=m" (d), "=m" (s)              \
+        : "m" (s), "m" (d), "m" (c), "m" (b)        \
+        : "r3", "r4", "r5", "r6", "r7", "r8", "r9"  \
+    );
+
+#endif /* __MACH__ && __APPLE__ */
+
+#elif defined(__powerpc__) || defined(__ppc__) /* end PPC64/begin PPC32  */
+
+#if defined(__MACH__) && defined(__APPLE__)
+
+#define MULADDC_INIT                    \
+    asm(                                \
+        "lwz    r3, %3          \n\t"   \
+        "lwz    r4, %4          \n\t"   \
+        "lwz    r5, %5          \n\t"   \
+        "lwz    r6, %6          \n\t"   \
+        "addi   r3, r3, -4      \n\t"   \
+        "addi   r4, r4, -4      \n\t"   \
+        "addic  r5, r5,  0      \n\t"
+
+#define MULADDC_CORE                    \
+        "lwzu   r7, 4(r3)       \n\t"   \
+        "mullw  r8, r7, r6      \n\t"   \
+        "mulhwu r9, r7, r6      \n\t"   \
+        "adde   r8, r8, r5      \n\t"   \
+        "lwz    r7, 4(r4)       \n\t"   \
+        "addze  r5, r9          \n\t"   \
+        "addc   r8, r8, r7      \n\t"   \
+        "stwu   r8, 4(r4)       \n\t"
+
+#define MULADDC_STOP                    \
+        "addze  r5, r5          \n\t"   \
+        "addi   r4, r4, 4       \n\t"   \
+        "addi   r3, r3, 4       \n\t"   \
+        "stw    r5, %0          \n\t"   \
+        "stw    r4, %1          \n\t"   \
+        "stw    r3, %2          \n\t"   \
+        : "=m" (c), "=m" (d), "=m" (s)              \
+        : "m" (s), "m" (d), "m" (c), "m" (b)        \
+        : "r3", "r4", "r5", "r6", "r7", "r8", "r9"  \
+    );
+
+#else /* __MACH__ && __APPLE__ */
+
+#define MULADDC_INIT                        \
+    asm(                                    \
+        "lwz    %%r3, %3            \n\t"   \
+        "lwz    %%r4, %4            \n\t"   \
+        "lwz    %%r5, %5            \n\t"   \
+        "lwz    %%r6, %6            \n\t"   \
+        "addi   %%r3, %%r3, -4      \n\t"   \
+        "addi   %%r4, %%r4, -4      \n\t"   \
+        "addic  %%r5, %%r5,  0      \n\t"
+
+#define MULADDC_CORE                        \
+        "lwzu   %%r7, 4(%%r3)       \n\t"   \
+        "mullw  %%r8, %%r7, %%r6    \n\t"   \
+        "mulhwu %%r9, %%r7, %%r6    \n\t"   \
+        "adde   %%r8, %%r8, %%r5    \n\t"   \
+        "lwz    %%r7, 4(%%r4)       \n\t"   \
+        "addze  %%r5, %%r9          \n\t"   \
+        "addc   %%r8, %%r8, %%r7    \n\t"   \
+        "stwu   %%r8, 4(%%r4)       \n\t"
+
+#define MULADDC_STOP                        \
+        "addze  %%r5, %%r5          \n\t"   \
+        "addi   %%r4, %%r4, 4       \n\t"   \
+        "addi   %%r3, %%r3, 4       \n\t"   \
+        "stw    %%r5, %0            \n\t"   \
+        "stw    %%r4, %1            \n\t"   \
+        "stw    %%r3, %2            \n\t"   \
+        : "=m" (c), "=m" (d), "=m" (s)              \
+        : "m" (s), "m" (d), "m" (c), "m" (b)        \
+        : "r3", "r4", "r5", "r6", "r7", "r8", "r9"  \
+    );
+
+#endif /* __MACH__ && __APPLE__ */
+
+#endif /* PPC32 */
+
+/*
+ * The Sparc(64) assembly is reported to be broken.
+ * Disable it for now, until we're able to fix it.
+ */
+#if 0 && defined(__sparc__)
+#if defined(__sparc64__)
+
+#define MULADDC_INIT                                    \
+    asm(                                                \
+                "ldx     %3, %%o0               \n\t"   \
+                "ldx     %4, %%o1               \n\t"   \
+                "ld      %5, %%o2               \n\t"   \
+                "ld      %6, %%o3               \n\t"
+
+#define MULADDC_CORE                                    \
+                "ld      [%%o0], %%o4           \n\t"   \
+                "inc     4, %%o0                \n\t"   \
+                "ld      [%%o1], %%o5           \n\t"   \
+                "umul    %%o3, %%o4, %%o4       \n\t"   \
+                "addcc   %%o4, %%o2, %%o4       \n\t"   \
+                "rd      %%y, %%g1              \n\t"   \
+                "addx    %%g1, 0, %%g1          \n\t"   \
+                "addcc   %%o4, %%o5, %%o4       \n\t"   \
+                "st      %%o4, [%%o1]           \n\t"   \
+                "addx    %%g1, 0, %%o2          \n\t"   \
+                "inc     4, %%o1                \n\t"
+
+        #define MULADDC_STOP                            \
+                "st      %%o2, %0               \n\t"   \
+                "stx     %%o1, %1               \n\t"   \
+                "stx     %%o0, %2               \n\t"   \
+        : "=m" (c), "=m" (d), "=m" (s)          \
+        : "m" (s), "m" (d), "m" (c), "m" (b)    \
+        : "g1", "o0", "o1", "o2", "o3", "o4",   \
+          "o5"                                  \
+        );
+
+#else /* __sparc64__ */
+
+#define MULADDC_INIT                                    \
+    asm(                                                \
+                "ld      %3, %%o0               \n\t"   \
+                "ld      %4, %%o1               \n\t"   \
+                "ld      %5, %%o2               \n\t"   \
+                "ld      %6, %%o3               \n\t"
+
+#define MULADDC_CORE                                    \
+                "ld      [%%o0], %%o4           \n\t"   \
+                "inc     4, %%o0                \n\t"   \
+                "ld      [%%o1], %%o5           \n\t"   \
+                "umul    %%o3, %%o4, %%o4       \n\t"   \
+                "addcc   %%o4, %%o2, %%o4       \n\t"   \
+                "rd      %%y, %%g1              \n\t"   \
+                "addx    %%g1, 0, %%g1          \n\t"   \
+                "addcc   %%o4, %%o5, %%o4       \n\t"   \
+                "st      %%o4, [%%o1]           \n\t"   \
+                "addx    %%g1, 0, %%o2          \n\t"   \
+                "inc     4, %%o1                \n\t"
+
+#define MULADDC_STOP                                    \
+                "st      %%o2, %0               \n\t"   \
+                "st      %%o1, %1               \n\t"   \
+                "st      %%o0, %2               \n\t"   \
+        : "=m" (c), "=m" (d), "=m" (s)          \
+        : "m" (s), "m" (d), "m" (c), "m" (b)    \
+        : "g1", "o0", "o1", "o2", "o3", "o4",   \
+          "o5"                                  \
+        );
+
+#endif /* __sparc64__ */
+#endif /* __sparc__ */
+
+#if defined(__microblaze__) || defined(microblaze)
+
+#define MULADDC_INIT                    \
+    asm(                                \
+        "lwi   r3,   %3         \n\t"   \
+        "lwi   r4,   %4         \n\t"   \
+        "lwi   r5,   %5         \n\t"   \
+        "lwi   r6,   %6         \n\t"   \
+        "andi  r7,   r6, 0xffff \n\t"   \
+        "bsrli r6,   r6, 16     \n\t"
+
+#define MULADDC_CORE                    \
+        "lhui  r8,   r3,   0    \n\t"   \
+        "addi  r3,   r3,   2    \n\t"   \
+        "lhui  r9,   r3,   0    \n\t"   \
+        "addi  r3,   r3,   2    \n\t"   \
+        "mul   r10,  r9,  r6    \n\t"   \
+        "mul   r11,  r8,  r7    \n\t"   \
+        "mul   r12,  r9,  r7    \n\t"   \
+        "mul   r13,  r8,  r6    \n\t"   \
+        "bsrli  r8, r10,  16    \n\t"   \
+        "bsrli  r9, r11,  16    \n\t"   \
+        "add   r13, r13,  r8    \n\t"   \
+        "add   r13, r13,  r9    \n\t"   \
+        "bslli r10, r10,  16    \n\t"   \
+        "bslli r11, r11,  16    \n\t"   \
+        "add   r12, r12, r10    \n\t"   \
+        "addc  r13, r13,  r0    \n\t"   \
+        "add   r12, r12, r11    \n\t"   \
+        "addc  r13, r13,  r0    \n\t"   \
+        "lwi   r10,  r4,   0    \n\t"   \
+        "add   r12, r12, r10    \n\t"   \
+        "addc  r13, r13,  r0    \n\t"   \
+        "add   r12, r12,  r5    \n\t"   \
+        "addc   r5, r13,  r0    \n\t"   \
+        "swi   r12,  r4,   0    \n\t"   \
+        "addi   r4,  r4,   4    \n\t"
+
+/*
+ * Store the final carry (r5) and the advanced d (r4) and s (r3) pointers back
+ * to c, d and s, then close the asm statement. Every register the MicroBlaze
+ * sequence writes (r3-r13) is listed as clobbered; each name must be its own
+ * comma-separated string, otherwise adjacent literals are concatenated into
+ * a single invalid register name.
+ */
+#define MULADDC_STOP                    \
+        "swi   r5,   %0         \n\t"   \
+        "swi   r4,   %1         \n\t"   \
+        "swi   r3,   %2         \n\t"   \
+        : "=m" (c), "=m" (d), "=m" (s)              \
+        : "m" (s), "m" (d), "m" (c), "m" (b)        \
+        : "r3", "r4", "r5", "r6", "r7", "r8",       \
+          "r9", "r10", "r11", "r12", "r13"          \
+    );
+
+#endif /* MicroBlaze */
+
+#if defined(__tricore__)
+
+#define MULADDC_INIT                            \
+    asm(                                        \
+        "ld.a   %%a2, %3                \n\t"   \
+        "ld.a   %%a3, %4                \n\t"   \
+        "ld.w   %%d4, %5                \n\t"   \
+        "ld.w   %%d1, %6                \n\t"   \
+        "xor    %%d5, %%d5              \n\t"
+
+#define MULADDC_CORE                            \
+        "ld.w   %%d0,   [%%a2+]         \n\t"   \
+        "madd.u %%e2, %%e4, %%d0, %%d1  \n\t"   \
+        "ld.w   %%d0,   [%%a3]          \n\t"   \
+        "addx   %%d2,    %%d2,  %%d0    \n\t"   \
+        "addc   %%d3,    %%d3,    0     \n\t"   \
+        "mov    %%d4,    %%d3           \n\t"   \
+        "st.w  [%%a3+],  %%d2           \n\t"
+
+#define MULADDC_STOP                            \
+        "st.w   %0, %%d4                \n\t"   \
+        "st.a   %1, %%a3                \n\t"   \
+        "st.a   %2, %%a2                \n\t"   \
+        : "=m" (c), "=m" (d), "=m" (s)          \
+        : "m" (s), "m" (d), "m" (c), "m" (b)    \
+        : "d0", "d1", "e2", "d4", "a2", "a3"    \
+    );
+
+#endif /* TriCore */
+
+/*
+ * gcc -O0 by default uses r7 for the frame pointer, so it complains about our
+ * use of r7 below, unless -fomit-frame-pointer is passed. Unfortunately,
+ * passing that option is not easy when building with yotta.
+ *
+ * On the other hand, -fomit-frame-pointer is implied by any -Ox options with
+ * x !=0, which we can detect using __OPTIMIZE__ (which is also defined by
+ * clang and armcc5 under the same conditions).
+ *
+ * So, only use the optimized assembly below for optimized build, which avoids
+ * the build error and is pretty reasonable anyway.
+ */
+#if defined(__GNUC__) && !defined(__OPTIMIZE__)
+#define MULADDC_CANNOT_USE_R7
+#endif
+
+#if defined(__arm__) && !defined(MULADDC_CANNOT_USE_R7)
+
+#if defined(__thumb__) && !defined(__thumb2__)
+
+#pragma message "using ARM THUMB MULADDC"
+
+#define MULADDC_INIT                                    \
+    asm(                                                \
+            "ldr    r0, %3                      \n\t"   \
+            "ldr    r1, %4                      \n\t"   \
+            "ldr    r2, %5                      \n\t"   \
+            "ldr    r3, %6                      \n\t"   \
+            "lsr    r7, r3, #16                 \n\t"   \
+            "mov    r9, r7                      \n\t"   \
+            "lsl    r7, r3, #16                 \n\t"   \
+            "lsr    r7, r7, #16                 \n\t"   \
+            "mov    r8, r7                      \n\t"
+
+#define MULADDC_CORE                                    \
+            "ldmia  r0!, {r6}                   \n\t"   \
+            "lsr    r7, r6, #16                 \n\t"   \
+            "lsl    r6, r6, #16                 \n\t"   \
+            "lsr    r6, r6, #16                 \n\t"   \
+            "mov    r4, r8                      \n\t"   \
+            "mul    r4, r6                      \n\t"   \
+            "mov    r3, r9                      \n\t"   \
+            "mul    r6, r3                      \n\t"   \
+            "mov    r5, r9                      \n\t"   \
+            "mul    r5, r7                      \n\t"   \
+            "mov    r3, r8                      \n\t"   \
+            "mul    r7, r3                      \n\t"   \
+            "lsr    r3, r6, #16                 \n\t"   \
+            "add    r5, r5, r3                  \n\t"   \
+            "lsr    r3, r7, #16                 \n\t"   \
+            "add    r5, r5, r3                  \n\t"   \
+            "add    r4, r4, r2                  \n\t"   \
+            "mov    r2, #0                      \n\t"   \
+            "adc    r5, r2                      \n\t"   \
+            "lsl    r3, r6, #16                 \n\t"   \
+            "add    r4, r4, r3                  \n\t"   \
+            "adc    r5, r2                      \n\t"   \
+            "lsl    r3, r7, #16                 \n\t"   \
+            "add    r4, r4, r3                  \n\t"   \
+            "adc    r5, r2                      \n\t"   \
+            "ldr    r3, [r1]                    \n\t"   \
+            "add    r4, r4, r3                  \n\t"   \
+            "adc    r2, r5                      \n\t"   \
+            "stmia  r1!, {r4}                   \n\t"
+
+#define MULADDC_STOP                                    \
+            "str    r2, %0                      \n\t"   \
+            "str    r1, %1                      \n\t"   \
+            "str    r0, %2                      \n\t"   \
+         : "=m" (c),  "=m" (d), "=m" (s)        \
+         : "m" (s), "m" (d), "m" (c), "m" (b)   \
+         : "r0", "r1", "r2", "r3", "r4", "r5",  \
+           "r6", "r7", "r8", "r9", "cc"         \
+         );
+
+#else
+
+/*
+ * ARM variant used when the Thumb-1-only path above is not selected.
+ * Operand numbering comes from the constraint lists in MULADDC_STOP:
+ * %0-%2 are the outputs c, d, s and %3-%6 are the inputs s, d, c, b.
+ *
+ * MULADDC_INIT opens the asm statement and loads s, d, c and b into
+ * r0, r1, r2 and r3 respectively.
+ */
+#define MULADDC_INIT                                    \
+    asm(                                                \
+            "ldr    r0, %3                      \n\t"   \
+            "ldr    r1, %4                      \n\t"   \
+            "ldr    r2, %5                      \n\t"   \
+            "ldr    r3, %6                      \n\t"
+
+/*
+ * One limb step: load *s into r4 (post-incrementing s), accumulate b * r4
+ * onto the incoming carry as a 64-bit value in r5:r2, add the low word to
+ * *d and store it back (post-incrementing d), and leave the high word plus
+ * the carry-out of that addition in r2 as the next carry.
+ */
+#define MULADDC_CORE                                    \
+            "ldr    r4, [r0], #4                \n\t"   \
+            "mov    r5, #0                      \n\t"   \
+            "ldr    r6, [r1]                    \n\t"   \
+            "umlal  r2, r5, r3, r4              \n\t"   \
+            "adds   r7, r6, r2                  \n\t"   \
+            "adc    r2, r5, #0                  \n\t"   \
+            "str    r7, [r1], #4                \n\t"
+
+/*
+ * Write r2, r1 and r0 back to c, d and s, then close the asm statement with
+ * its output, input and clobber lists.
+ */
+#define MULADDC_STOP                                    \
+            "str    r2, %0                      \n\t"   \
+            "str    r1, %1                      \n\t"   \
+            "str    r0, %2                      \n\t"   \
+         : "=m" (c),  "=m" (d), "=m" (s)        \
+         : "m" (s), "m" (d), "m" (c), "m" (b)   \
+         : "r0", "r1", "r2", "r3", "r4", "r5",  \
+           "r6", "r7", "cc"                     \
+         );
+
+#endif /* Thumb */
+
+#endif /* ARMv3 */
+
+#if defined(__alpha__)
+/*
+ * Alpha variant. MULADDC_INIT opens the asm statement and loads s, d, c and b
+ * into $1, $2, $3 and $4.
+ */
+#define MULADDC_INIT                    \
+    asm(                                \
+        "ldq    $1, %3          \n\t"   \
+        "ldq    $2, %4          \n\t"   \
+        "ldq    $3, %5          \n\t"   \
+        "ldq    $4, %6          \n\t"
+/*
+ * MULADDC_CORE handles one 8-byte limb: $6 is loaded through $1 (then $1 += 8),
+ * mulq/umulh give the low ($7) and high ($6) halves of $6*$4, the carry-in $3
+ * and the destination limb $5 are added to $7 with the carries captured by
+ * cmpult, the sum is stored through $2 (then $2 += 8) and the new carry is
+ * accumulated in $3.
+ */
+#define MULADDC_CORE                    \
+        "ldq    $6,  0($1)      \n\t"   \
+        "addq   $1,  8, $1      \n\t"   \
+        "mulq   $6, $4, $7      \n\t"   \
+        "umulh  $6, $4, $6      \n\t"   \
+        "addq   $7, $3, $7      \n\t"   \
+        "cmpult $7, $3, $3      \n\t"   \
+        "ldq    $5,  0($2)      \n\t"   \
+        "addq   $7, $5, $7      \n\t"   \
+        "cmpult $7, $5, $5      \n\t"   \
+        "stq    $7,  0($2)      \n\t"   \
+        "addq   $2,  8, $2      \n\t"   \
+        "addq   $6, $3, $3      \n\t"   \
+        "addq   $5, $3, $3      \n\t"
+/*
+ * MULADDC_STOP writes $3, $2, $1 back to c, d, s and closes the asm statement
+ * (clobbers $1-$7).
+ */
+#define MULADDC_STOP                                    \
+        "stq    $3, %0          \n\t"   \
+        "stq    $2, %1          \n\t"   \
+        "stq    $1, %2          \n\t"   \
+        : "=m" (c), "=m" (d), "=m" (s)              \
+        : "m" (s), "m" (d), "m" (c), "m" (b)        \
+        : "$1", "$2", "$3", "$4", "$5", "$6", "$7"  \
+    );
+#endif /* Alpha */
+
+#if defined(__mips__) && !defined(__mips64)
+/*
+ * 32-bit MIPS variant. MULADDC_INIT opens the asm statement and loads s, d, c
+ * and b into $10, $11, $12 and $13.
+ */
+#define MULADDC_INIT                    \
+    asm(                                \
+        "lw     $10, %3         \n\t"   \
+        "lw     $11, %4         \n\t"   \
+        "lw     $12, %5         \n\t"   \
+        "lw     $13, %6         \n\t"
+/*
+ * MULADDC_CORE handles one 4-byte limb: $14 is loaded through $10 (then
+ * $10 += 4), multu forms $13*$14 with the halves read by mflo ($14) and
+ * mfhi ($9); the carry-in $12 and the destination limb $15 are added with the
+ * carries captured by sltu; the sum is stored through $11 (then $11 += 4) and
+ * the new carry is accumulated in $12.
+ */
+#define MULADDC_CORE                    \
+        "lw     $14, 0($10)     \n\t"   \
+        "multu  $13, $14        \n\t"   \
+        "addi   $10, $10, 4     \n\t"   \
+        "mflo   $14             \n\t"   \
+        "mfhi   $9              \n\t"   \
+        "addu   $14, $12, $14   \n\t"   \
+        "lw     $15, 0($11)     \n\t"   \
+        "sltu   $12, $14, $12   \n\t"   \
+        "addu   $15, $14, $15   \n\t"   \
+        "sltu   $14, $15, $14   \n\t"   \
+        "addu   $12, $12, $9    \n\t"   \
+        "sw     $15, 0($11)     \n\t"   \
+        "addu   $12, $12, $14   \n\t"   \
+        "addi   $11, $11, 4     \n\t"
+/*
+ * MULADDC_STOP writes $12, $11, $10 back to c, d, s and closes the asm
+ * statement (clobbers $9-$15).
+ */
+#define MULADDC_STOP                    \
+        "sw     $12, %0         \n\t"   \
+        "sw     $11, %1         \n\t"   \
+        "sw     $10, %2         \n\t"   \
+        : "=m" (c), "=m" (d), "=m" (s)                      \
+        : "m" (s), "m" (d), "m" (c), "m" (b)                \
+        : "$9", "$10", "$11", "$12", "$13", "$14", "$15"    \
+    );
+
+#endif /* MIPS */
+#endif /* GNUC */
+
+#if (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__)
+/*
+ * MSVC (x86) / Watcom inline-assembly variant.
+ * MULADDC_INIT loads s, d, c and b into esi, edi, ecx and ebx.
+ */
+#define MULADDC_INIT                            \
+    __asm   mov     esi, s                      \
+    __asm   mov     edi, d                      \
+    __asm   mov     ecx, c                      \
+    __asm   mov     ebx, b
+/*
+ * MULADDC_CORE handles one limb: lodsd fetches the source limb into eax,
+ * mul ebx forms edx:eax, the two add/adc pairs add the carry-in (ecx) and the
+ * destination limb ([edi]) with carries into edx, edx becomes the new carry in
+ * ecx and stosd writes eax to the destination.
+ */
+#define MULADDC_CORE                            \
+    __asm   lodsd                               \
+    __asm   mul     ebx                         \
+    __asm   add     eax, ecx                    \
+    __asm   adc     edx, 0                      \
+    __asm   add     eax, [edi]                  \
+    __asm   adc     edx, 0                      \
+    __asm   mov     ecx, edx                    \
+    __asm   stosd
+
+#if defined(MBEDTLS_HAVE_SSE2)
+/* EMIT places one raw opcode byte into the instruction stream via _emit. */
+#define EMIT __asm _emit
+/*
+ * MULADDC_HUIT is a hand-encoded instruction sequence, emitted byte by byte.
+ * NOTE(review): presumably the unrolled eight-limb multiply-accumulate from
+ * upstream mbed TLS; the bytes are not decoded here - confirm against upstream
+ * before changing any of them.
+ */
+#define MULADDC_HUIT                            \
+    EMIT 0x0F  EMIT 0x6E  EMIT 0xC9             \
+    EMIT 0x0F  EMIT 0x6E  EMIT 0xC3             \
+    EMIT 0x0F  EMIT 0x6E  EMIT 0x1F             \
+    EMIT 0x0F  EMIT 0xD4  EMIT 0xCB             \
+    EMIT 0x0F  EMIT 0x6E  EMIT 0x16             \
+    EMIT 0x0F  EMIT 0xF4  EMIT 0xD0             \
+    EMIT 0x0F  EMIT 0x6E  EMIT 0x66  EMIT 0x04  \
+    EMIT 0x0F  EMIT 0xF4  EMIT 0xE0             \
+    EMIT 0x0F  EMIT 0x6E  EMIT 0x76  EMIT 0x08  \
+    EMIT 0x0F  EMIT 0xF4  EMIT 0xF0             \
+    EMIT 0x0F  EMIT 0x6E  EMIT 0x7E  EMIT 0x0C  \
+    EMIT 0x0F  EMIT 0xF4  EMIT 0xF8             \
+    EMIT 0x0F  EMIT 0xD4  EMIT 0xCA             \
+    EMIT 0x0F  EMIT 0x6E  EMIT 0x5F  EMIT 0x04  \
+    EMIT 0x0F  EMIT 0xD4  EMIT 0xDC             \
+    EMIT 0x0F  EMIT 0x6E  EMIT 0x6F  EMIT 0x08  \
+    EMIT 0x0F  EMIT 0xD4  EMIT 0xEE             \
+    EMIT 0x0F  EMIT 0x6E  EMIT 0x67  EMIT 0x0C  \
+    EMIT 0x0F  EMIT 0xD4  EMIT 0xFC             \
+    EMIT 0x0F  EMIT 0x7E  EMIT 0x0F             \
+    EMIT 0x0F  EMIT 0x6E  EMIT 0x56  EMIT 0x10  \
+    EMIT 0x0F  EMIT 0xF4  EMIT 0xD0             \
+    EMIT 0x0F  EMIT 0x73  EMIT 0xD1  EMIT 0x20  \
+    EMIT 0x0F  EMIT 0x6E  EMIT 0x66  EMIT 0x14  \
+    EMIT 0x0F  EMIT 0xF4  EMIT 0xE0             \
+    EMIT 0x0F  EMIT 0xD4  EMIT 0xCB             \
+    EMIT 0x0F  EMIT 0x6E  EMIT 0x76  EMIT 0x18  \
+    EMIT 0x0F  EMIT 0xF4  EMIT 0xF0             \
+    EMIT 0x0F  EMIT 0x7E  EMIT 0x4F  EMIT 0x04  \
+    EMIT 0x0F  EMIT 0x73  EMIT 0xD1  EMIT 0x20  \
+    EMIT 0x0F  EMIT 0x6E  EMIT 0x5E  EMIT 0x1C  \
+    EMIT 0x0F  EMIT 0xF4  EMIT 0xD8             \
+    EMIT 0x0F  EMIT 0xD4  EMIT 0xCD             \
+    EMIT 0x0F  EMIT 0x6E  EMIT 0x6F  EMIT 0x10  \
+    EMIT 0x0F  EMIT 0xD4  EMIT 0xD5             \
+    EMIT 0x0F  EMIT 0x7E  EMIT 0x4F  EMIT 0x08  \
+    EMIT 0x0F  EMIT 0x73  EMIT 0xD1  EMIT 0x20  \
+    EMIT 0x0F  EMIT 0xD4  EMIT 0xCF             \
+    EMIT 0x0F  EMIT 0x6E  EMIT 0x6F  EMIT 0x14  \
+    EMIT 0x0F  EMIT 0xD4  EMIT 0xE5             \
+    EMIT 0x0F  EMIT 0x7E  EMIT 0x4F  EMIT 0x0C  \
+    EMIT 0x0F  EMIT 0x73  EMIT 0xD1  EMIT 0x20  \
+    EMIT 0x0F  EMIT 0xD4  EMIT 0xCA             \
+    EMIT 0x0F  EMIT 0x6E  EMIT 0x6F  EMIT 0x18  \
+    EMIT 0x0F  EMIT 0xD4  EMIT 0xF5             \
+    EMIT 0x0F  EMIT 0x7E  EMIT 0x4F  EMIT 0x10  \
+    EMIT 0x0F  EMIT 0x73  EMIT 0xD1  EMIT 0x20  \
+    EMIT 0x0F  EMIT 0xD4  EMIT 0xCC             \
+    EMIT 0x0F  EMIT 0x6E  EMIT 0x6F  EMIT 0x1C  \
+    EMIT 0x0F  EMIT 0xD4  EMIT 0xDD             \
+    EMIT 0x0F  EMIT 0x7E  EMIT 0x4F  EMIT 0x14  \
+    EMIT 0x0F  EMIT 0x73  EMIT 0xD1  EMIT 0x20  \
+    EMIT 0x0F  EMIT 0xD4  EMIT 0xCE             \
+    EMIT 0x0F  EMIT 0x7E  EMIT 0x4F  EMIT 0x18  \
+    EMIT 0x0F  EMIT 0x73  EMIT 0xD1  EMIT 0x20  \
+    EMIT 0x0F  EMIT 0xD4  EMIT 0xCB             \
+    EMIT 0x0F  EMIT 0x7E  EMIT 0x4F  EMIT 0x1C  \
+    EMIT 0x83  EMIT 0xC7  EMIT 0x20             \
+    EMIT 0x83  EMIT 0xC6  EMIT 0x20             \
+    EMIT 0x0F  EMIT 0x73  EMIT 0xD1  EMIT 0x20  \
+    EMIT 0x0F  EMIT 0x7E  EMIT 0xC9
+/*
+ * MULADDC_STOP (SSE2 build): emits the two bytes 0x0F 0x77 (presumably EMMS -
+ * confirm) and then copies ecx, edi, esi back to c, d, s. The trailing
+ * backslash on the last line relies on the empty line that follows it.
+ */
+#define MULADDC_STOP                            \
+    EMIT 0x0F  EMIT 0x77                        \
+    __asm   mov     c, ecx                      \
+    __asm   mov     d, edi                      \
+    __asm   mov     s, esi                      \
+
+#else
+/*
+ * MULADDC_STOP (no SSE2): copies ecx, edi, esi back to c, d, s. The trailing
+ * backslash on the last line relies on the empty line that follows it.
+ */
+#define MULADDC_STOP                            \
+    __asm   mov     c, ecx                      \
+    __asm   mov     d, edi                      \
+    __asm   mov     s, esi                      \
+
+#endif /* SSE2 */
+#endif /* MSVC */
+
+#endif /* MBEDTLS_HAVE_ASM */
+
+#if !defined(MULADDC_CORE)
+#if defined(MBEDTLS_HAVE_UDBL)
+/*
+ * Portable C fallback used when no assembly variant defined MULADDC_CORE and a
+ * double-width type (mbedtls_t_udbl) is available.
+ * MULADDC_INIT opens a scope and declares the temporaries.
+ */
+#define MULADDC_INIT                    \
+{                                       \
+    mbedtls_t_udbl r;                           \
+    mbedtls_mpi_uint r0, r1;
+/*
+ * MULADDC_CORE handles one limb: r = *s++ * b in double width, split into the
+ * low half r0 and the half above bit biL in r1; the carry-in c and the
+ * destination limb *d are added to r0 with each wrap-around counted in r1;
+ * r1 becomes the new carry and r0 is stored through d++.
+ * (biL is defined outside this excerpt.)
+ */
+#define MULADDC_CORE                    \
+    r   = *(s++) * (mbedtls_t_udbl) b;          \
+    r0  = (mbedtls_mpi_uint) r;                   \
+    r1  = (mbedtls_mpi_uint)( r >> biL );         \
+    r0 += c;  r1 += (r0 <  c);          \
+    r0 += *d; r1 += (r0 < *d);          \
+    c = r1; *(d++) = r0;
+/* MULADDC_STOP closes the scope opened by MULADDC_INIT. */
+#define MULADDC_STOP                    \
+}
+/*
+ * Portable C fallback without a double-width type: every limb is split into
+ * two halves of biH bits (biH is defined outside this excerpt).
+ * MULADDC_INIT opens a scope, declares the temporaries and splits b into
+ * b0 (low half) and b1 (high half).
+ */
+#else
+#define MULADDC_INIT                    \
+{                                       \
+    mbedtls_mpi_uint s0, s1, b0, b1;              \
+    mbedtls_mpi_uint r0, r1, rx, ry;              \
+    b0 = ( b << biH ) >> biH;           \
+    b1 = ( b >> biH );
+/*
+ * MULADDC_CORE handles one limb: *s is split into s0/s1, the four partial
+ * products are formed, the cross products rx and ry contribute their upper
+ * halves to r1 and their lower halves (shifted up) to r0, then the carry-in c
+ * and *d are added; every wrap-around of r0 increments r1. r1 becomes the new
+ * carry and r0 is stored through d++.
+ */
+#define MULADDC_CORE                    \
+    s0 = ( *s << biH ) >> biH;          \
+    s1 = ( *s >> biH ); s++;            \
+    rx = s0 * b1; r0 = s0 * b0;         \
+    ry = s1 * b0; r1 = s1 * b1;         \
+    r1 += ( rx >> biH );                \
+    r1 += ( ry >> biH );                \
+    rx <<= biH; ry <<= biH;             \
+    r0 += rx; r1 += (r0 < rx);          \
+    r0 += ry; r1 += (r0 < ry);          \
+    r0 +=  c; r1 += (r0 <  c);          \
+    r0 += *d; r1 += (r0 < *d);          \
+    c = r1; *(d++) = r0;
+/* MULADDC_STOP closes the scope opened by MULADDC_INIT. */
+#define MULADDC_STOP                    \
+}
+
+#endif /* C (generic)  */
+#endif /* C (longlong) */
+
+#endif /* bn_mul.h */
--- a/arm9/mbedtls/config.h
+++ b/arm9/mbedtls/config.h
@ -0,0 +1,4 @@
+/* Minimal mbedtls configuration for this project: bignum support only. */
+#define MBEDTLS_BIGNUM_C
+/* Lets bn_mul.h pick one of its inline-assembly MULADDC_* variants. */
+#define MBEDTLS_HAVE_ASM
--- a/arm9/mbedtls/readme.txt
+++ b/arm9/mbedtls/readme.txt
@ -0,0 +1,7 @@
+aes.c/.h and rsa.c/.h are heavily modified and reduced.
+
+bignum.c/.h bn_mul.h only had some minor modifications:
+	headers location moved from mbedtls/ to .
+	disabled some unused functions by "#if 0 // unused"
+		ASCII I/O
+		everything below mbedtls_mpi_exp_mod
--- a/arm9/mbedtls/rsa.c
+++ b/arm9/mbedtls/rsa.c
@ -0,0 +1,61 @@
+
+// mbedtls RSA public
+// only the pubkey function for signatures verifying
+// original rsa.c had too many extra functions not used and too many dependencies
+
+#include <string.h>
+#include "bignum.h"
+#include "rsa.h"
+
+// Put an RSA context into its empty state by zero-filling every field.
+void rsa_init(rsa_context_t *ctx) {
+	memset(ctx, 0, sizeof *ctx);
+}
+
+// mbedtls offers no helper for this; its own callers assign N/E/len by hand
+// (see mbedtls_rsa_self_test in rsa.c, main in dh_client.c and rsa_verify.c).
+//
+// Loads a public key into ctx.
+//   n_buf/n_len: modulus bytes, parsed by mbedtls_mpi_read_binary()
+//   e_buf/e_len: public exponent bytes, parsed the same way
+// On success ctx->len is set to the modulus size in bytes and 0 is returned.
+// On failure the error code of the mbedtls_mpi_read_binary() call that failed
+// is returned unchanged (always non-zero), and ctx->len is left untouched.
+int rsa_set_pubkey(rsa_context_t *ctx, const unsigned char * n_buf, size_t n_len,
+	const unsigned char * e_buf, size_t e_len)
+{
+	int ret = mbedtls_mpi_read_binary(&ctx->N, n_buf, n_len);
+	if (ret == 0) {
+		ret = mbedtls_mpi_read_binary(&ctx->E, e_buf, e_len);
+	}
+	if (ret != 0) {
+		// propagate the real error instead of collapsing it to 1
+		return ret;
+	}
+
+	// round the bit length of N up to whole bytes
+	ctx->len = (mbedtls_mpi_bitlen(&ctx->N) + 7) >> 3;
+	// The key itself is not validated here: callers load known-good keys.
+	return 0;
+}
+
+// RSA public-key operation, modelled on mbedtls_rsa_public():
+// output = input ^ E mod N, with both buffers ctx->len bytes long.
+// Returns 0 on success, or MBEDTLS_ERR_RSA_PUBLIC_FAILED plus the failing
+// bignum error code.
+int rsa_public(rsa_context_t *ctx, const unsigned char *input, unsigned char *output) {
+	int ret;
+	mbedtls_mpi T;
+
+	mbedtls_mpi_init(&T);
+
+	MBEDTLS_MPI_CHK(mbedtls_mpi_read_binary(&T, input, ctx->len));
+
+	// the input must be strictly smaller than the modulus
+	if (mbedtls_mpi_cmp_mpi(&T, &ctx->N) >= 0) {
+		ret = MBEDTLS_ERR_MPI_BAD_INPUT_DATA;
+		goto cleanup;
+	}
+
+	MBEDTLS_MPI_CHK(mbedtls_mpi_exp_mod(&T, &T, &ctx->E, &ctx->N, &ctx->RN));
+	MBEDTLS_MPI_CHK(mbedtls_mpi_write_binary(&T, output, ctx->len));
+
+cleanup:
+
+	mbedtls_mpi_free(&T);
+
+	return (ret != 0) ? (MBEDTLS_ERR_RSA_PUBLIC_FAILED + ret) : 0;
+}
+
--- a/arm9/mbedtls/rsa.h
+++ b/arm9/mbedtls/rsa.h
@ -0,0 +1,18 @@
+
+#pragma once
+
+#define MBEDTLS_ERR_RSA_PUBLIC_FAILED                     -0x4280  /**< The public key operation failed. */
+
+#include "bignum.h"
+
+// Reduced RSA context: public key only.
+typedef struct {
+	size_t len;		// size of N in bytes, set by rsa_set_pubkey()
+	mbedtls_mpi N;		// modulus
+	mbedtls_mpi E;		// public exponent
+	mbedtls_mpi RN;		// scratch value handed to mbedtls_mpi_exp_mod() by rsa_public()
+} rsa_context_t;
+
+// Zero-fill the context.
+void rsa_init(rsa_context_t *rsa);
+
+// Load modulus and exponent from byte buffers; returns 0 on success.
+int rsa_set_pubkey(rsa_context_t *rsa, const unsigned char * n_buf, size_t n_len,
+	const unsigned char * e_buf, size_t e_len);
+
+// output = input ^ E mod N; both buffers are rsa->len bytes. Returns 0 on success.
+int rsa_public(rsa_context_t *rsa, const unsigned char *input, unsigned char *output);
--- a/arm9/source/crypto.c
+++ b/arm9/source/crypto.c
@ -0,0 +1,323 @@
+#include <stdint.h>
+#include "../mbedtls/aes.h"
+#include "crypto.h"
+//#include "ticket0.h"
+#include "utils.h"
+
+// more info:
+//		https://github.com/Jimmy-Z/TWLbf/blob/master/dsi.c
+//		https://github.com/Jimmy-Z/bfCL/blob/master/dsi.h
+// ported back to 32 bit for ARM9
+/* Key_Y XORed into the key by dsi_aes_set_key() for every mode except ES. */
+static const uint32_t DSi_NAND_KEY_Y[4] =
+	{0x0ab9dc76u, 0xbd4dc4d3u, 0x202ddd1du, 0xe1a00005u};
+/* Key_Y XORed into the key by dsi_aes_set_key() when the mode is ES. */
+static const uint32_t DSi_ES_KEY_Y[4] =
+	{0x8b5acce5u, 0x72c9d056u, 0xdce8179cu, 0xa9361239u};
+/* Used directly as the AES key for boot2_rk in dsi_crypt_init(). */
+static const uint32_t DSi_BOOT2_KEY[4] =
+	{0x8080ee98u, 0xf6b46c00u, 0x626ec23au, 0xad34ecf9u};
+/*
+ * Constant added during key derivation: FFFEFB4E295902582A680F5F1A4F3E79h,
+ * stored least-significant 32-bit word first.
+ */
+static const uint32_t DSi_KEY_MAGIC[4] =
+	{0x1a4f3e79u, 0x2a680f5fu, 0x29590258u, 0xfffefb4eu};
+
+// x = a ^ b across four 32-bit words (128 bits); x may be the same array as a.
+static inline void xor_128(uint32_t *x, const uint32_t *a, const uint32_t *b){
+	for (int w = 0; w < 4; ++w) {
+		x[w] = a[w] ^ b[w];
+	}
+}
+
+// 128-bit addition a += b. Both operands are four 32-bit words with the least
+// significant word first; a carry out of the top word is discarded.
+// Each word is summed in 64 bits so the carry comes straight from the sum;
+// unlike comparing a[i] with b[i] afterwards, this stays correct even when
+// a and b are the same array.
+static inline void add_128(uint32_t *a, const uint32_t *b){
+	uint32_t carry = 0;
+	for (int w = 0; w < 4; ++w) {
+		const uint64_t sum = (uint64_t)a[w] + b[w] + carry;
+		a[w] = (uint32_t)sum;
+		carry = (uint32_t)(sum >> 32);
+	}
+}
+
+// Add a 32-bit value to a 128-bit number held as four words, least
+// significant first.
+static inline void add_128_32(uint32_t *a, uint32_t b){
+	a[0] += b;
+	if (a[0] >= b) {
+		return;	// the low word did not wrap, nothing to carry
+	}
+	// ripple the carry upwards until a word does not wrap round to zero
+	for (int w = 1; w < 4; ++w) {
+		a[w] += 1;
+		if (a[w] != 0) {
+			break;
+		}
+	}
+}
+
+// Rotate a 128-bit number (four words, least significant first) left by 42
+// bits: a whole-word move (32) combined with a 10-bit shift across words.
+static inline void rol42_128(uint32_t *a){
+	const uint32_t w0 = a[0], w1 = a[1], w2 = a[2], w3 = a[3];
+	a[0] = (w3 << 10) | (w2 >> 22);
+	a[1] = (w0 << 10) | (w3 >> 22);
+	a[2] = (w1 << 10) | (w0 >> 22);
+	a[3] = (w2 << 10) | (w1 >> 22);
+}
+
+// Derive an AES-128 key for the given mode from the console ID and expand it
+// into the round-key buffer rk.
+//   rk:         receives the expanded key (via aes_set_key_enc_128_be)
+//   console_id: two 32-bit words of the console ID
+//   mode:       NAND, NAND_3DS or ES; any other value derives from an
+//               all-zero Key_X instead of reading uninitialised memory
+static void dsi_aes_set_key(uint32_t *rk, const uint32_t *console_id, key_mode_t mode) {
+	// zero-initialised so an unexpected mode never reads indeterminate values
+	uint32_t key[4] = {0, 0, 0, 0};
+	switch (mode) {
+	case NAND:
+		key[0] = console_id[0];
+		key[1] = console_id[0] ^ 0x24ee6906;
+		key[2] = console_id[1] ^ 0xe65b601d;
+		key[3] = console_id[1];
+		break;
+	case NAND_3DS:
+		key[0] = (console_id[0] ^ 0xb358a6af) | 0x80000000;
+		key[1] = 0x544e494e;
+		key[2] = 0x4f444e45;
+		key[3] = console_id[1] ^ 0x08c267b7;
+		break;
+	case ES:
+		key[0] = 0x4e00004a;
+		key[1] = 0x4a00004e;
+		key[2] = console_id[1] ^ 0xc80c4b72;
+		key[3] = console_id[0];
+		break;
+	default:
+		break;
+	}
+	// Key = ((Key_X XOR Key_Y) + FFFEFB4E295902582A680F5F1A4F3E79h) ROL 42
+	// equivalent to F_XY in twltool/f_xy.c
+	xor_128(key, key, mode == ES ? DSi_ES_KEY_Y : DSi_NAND_KEY_Y);
+	add_128(key, DSi_KEY_MAGIC);
+	rol42_128(key);
+	aes_set_key_enc_128_be(rk, (uint8_t*)key);
+}
+
+// Hash `len` bytes of `data` with SHA-1 and compare against `digest_verify`.
+// Returns 0 on a match; otherwise prints the expected and the computed digest
+// and returns the non-zero memcmp() result.
+int dsi_sha1_verify(const void *digest_verify, const void *data, unsigned len) {
+	uint8_t computed[SHA1_LEN];
+	swiSHA1Calc(computed, data, len);
+	// return type of swiSHA1Verify() is declared void, so how exactly should we use it?
+	const int diff = memcmp(computed, digest_verify, SHA1_LEN);
+	if (diff == 0) {
+		return 0;
+	}
+	print_bytes(digest_verify, SHA1_LEN);
+	print_bytes(computed, SHA1_LEN);
+	return diff;
+}
+/*
+ * Module state filled in by dsi_crypt_init(): expanded AES keys for NAND, ES
+ * and boot2, plus the NAND counter IV taken from the SHA-1 of the eMMC CID.
+ */
+static uint32_t nand_rk[RK_LEN];
+static uint32_t nand_ctr_iv[4];
+static uint32_t es_rk[RK_LEN];
+static uint32_t boot2_rk[RK_LEN];
+/* Set to 1 once aes_gen_tables() has run, so it is only called once. */
+static int tables_generated = 0;
+
+// Prepare all crypto state: AES tables (once), the NAND/ES/boot2 round keys
+// and the NAND counter IV.
+//   console_id_be: 8-byte console ID, read as two big-endian words
+//   emmc_cid:      16-byte eMMC CID; the first 16 bytes of its SHA-1 become the IV
+//   is3DS:         non-zero selects the NAND_3DS key derivation
+void dsi_crypt_init(const uint8_t *console_id_be, const uint8_t *emmc_cid, int is3DS) {
+	if (!tables_generated) {
+		aes_gen_tables();
+		tables_generated = 1;
+	}
+
+	uint32_t console_id[2];
+	GET_UINT32_BE(console_id[0], console_id_be, 4);
+	GET_UINT32_BE(console_id[1], console_id_be, 0);
+
+	const key_mode_t nand_mode = is3DS ? NAND_3DS : NAND;
+	dsi_aes_set_key(nand_rk, console_id, nand_mode);
+	dsi_aes_set_key(es_rk, console_id, ES);
+
+	aes_set_key_enc_128_be(boot2_rk, (uint8_t*)DSi_BOOT2_KEY);
+
+	uint32_t sha1[SHA1_LEN / sizeof(uint32_t)];
+	swiSHA1Calc(sha1, emmc_cid, 16);
+	for (int w = 0; w < 4; ++w) {
+		nand_ctr_iv[w] = sha1[w];
+	}
+}
+
+// One AES-CTR block: encrypt the counter with rk and XOR the result onto `in`,
+// writing 16 bytes to `out` (in and out may be the same buffer).
+static inline void aes_ctr(const uint32_t *rk, const uint32_t *ctr, uint32_t *in, uint32_t *out) {
+	uint32_t keystream[4];
+	aes_encrypt_128_be(rk, (uint8_t*)ctr, (uint8_t*)keystream);
+	xor_128(out, in, keystream);
+}
+
+// Crypt a single AES block of NAND data. in/out must be 32-bit aligned (a
+// restriction inherited from xor_128). `offset` counts AES blocks, not bytes.
+void dsi_nand_crypt_1(uint8_t* out, const uint8_t* in, uint32_t offset) {
+	uint32_t ctr[4];
+	for (int w = 0; w < 4; ++w) {
+		ctr[w] = nand_ctr_iv[w];
+	}
+	add_128_32(ctr, offset);
+	aes_ctr(nand_rk, ctr, (uint32_t*)in, (uint32_t*)out);
+}
+
+// Crypt `count` consecutive AES blocks of NAND data. `offset` is the index,
+// in AES blocks, of the first one; the counter advances by one per block.
+void dsi_nand_crypt(uint8_t* out, const uint8_t* in, uint32_t offset, unsigned count) {
+	uint32_t ctr[4];
+	for (int w = 0; w < 4; ++w) {
+		ctr[w] = nand_ctr_iv[w];
+	}
+	add_128_32(ctr, offset);
+	for (unsigned left = count; left != 0; --left) {
+		aes_ctr(nand_rk, ctr, (uint32_t*)in, (uint32_t*)out);
+		add_128_32(ctr, 1);
+		in += AES_BLOCK_SIZE;
+		out += AES_BLOCK_SIZE;
+	}
+}
+	
+// Running counter used by dsi_boot2_crypt().
+static uint32_t boot2_ctr[4];
+
+// Seed the boot2 counter from size_r: the value itself, its two's-complement
+// negation, its bitwise complement, and zero in the top word.
+void dsi_boot2_crypt_set_ctr(uint32_t size_r) {
+	const uint32_t negated = (uint32_t)(0u - size_r);
+	const uint32_t inverted = (uint32_t)~size_r;
+	boot2_ctr[3] = 0;
+	boot2_ctr[2] = inverted;
+	boot2_ctr[1] = negated;
+	boot2_ctr[0] = size_r;
+}
+
+// Crypt `count` AES blocks with the boot2 key, continuing from (and advancing)
+// the counter set up by dsi_boot2_crypt_set_ctr().
+void dsi_boot2_crypt(uint8_t* out, const uint8_t* in, unsigned count) {
+	for (unsigned left = count; left != 0; --left) {
+		aes_ctr(boot2_rk, boot2_ctr, (uint32_t*)in, (uint32_t*)out);
+		add_128_32(boot2_ctr, 1);
+		in += AES_BLOCK_SIZE;
+		out += AES_BLOCK_SIZE;
+	}
+}
+
+// http://problemkaputt.de/gbatek.htm#dsiesblockencryption
+// works in place, also must be aligned to 32 bit
+// why is it called ES?
+/*int dsi_es_block_crypt(uint8_t *buf, unsigned buf_len, crypt_mode_t mode) {
+	es_block_footer_t *footer;
+	footer = (es_block_footer_t*)(buf + buf_len - sizeof(es_block_footer_t));
+	// backup mac since it might be overwritten by padding
+	// and also nonce, it becomes garbage after decryption
+	uint8_t ccm_mac[AES_CCM_MAC_LEN];
+	uint8_t nonce[AES_CCM_NONCE_LEN];
+	memcpy(ccm_mac, footer->ccm_mac, AES_CCM_MAC_LEN);
+	memcpy(nonce, footer->nonce, AES_CCM_NONCE_LEN);
+
+	uint32_t ctr32[4], pad32[4], mac32[4];
+// I'm too paranoid to use more stack variables
+#define ctr ((uint8_t*)ctr32)
+#define pad ((uint8_t*)pad32)
+#define mac ((uint8_t*)mac32)
+#define zero(a) static_assert(sizeof(a[0]) == 4, "invalid operand"); \
+	a[0] = 0; a[1] = 0; a[2] = 0; a[3] = 0
+	if (mode == DECRYPT) {
+		// decrypt footer
+		zero(ctr32);
+		memcpy(ctr + 1, nonce, AES_CCM_NONCE_LEN);
+		// footer might not be 32 bit aligned after all, so we copy it out to decrypt
+		memcpy(pad, footer->encrypted, AES_BLOCK_SIZE);
+		aes_ctr(es_rk, ctr32, pad32, pad32);
+		memcpy(footer->encrypted, pad, AES_BLOCK_SIZE);
+	}
+	// check decrypted footer
+	if (footer->fixed_3a != 0x3a) {
+		i//printff("ES block footer offset 0x10 should be 0x3a, got 0x%02x\n", footer->fixed_3a);
+		return 1;
+	}
+	uint32_t block_size;
+	GET_UINT32_BE(block_size, footer->len32be, 0);
+	block_size &= 0xffffff;
+	if (block_size + sizeof(es_block_footer_t) != buf_len) {
+		i//printff("block size in footer doesn't match, %06x != %06x\n",
+			(unsigned)block_size, (unsigned)(buf_len - sizeof(es_block_footer_t)));
+		return 1;
+	}
+	// padding to multiple of 16
+	uint32_t remainder = block_size & 0xf;
+	if (remainder != 0) {
+		zero(pad32);
+		if (mode == DECRYPT) {
+			ctr32[0] = (block_size >> 4) + 1;
+			memcpy(ctr + 3, nonce, AES_CCM_NONCE_LEN);
+			ctr[0xf] = 2;
+			aes_ctr(es_rk, ctr32, pad32, pad32);
+		}
+		memcpy(buf + block_size, pad + remainder, 16 - remainder);
+		block_size += 16 - remainder;
+	}
+	// AES-CCM MAC
+	mac32[0] = block_size;
+	memcpy(mac + 3, nonce, AES_CCM_NONCE_LEN);
+	mac[0xf] = 0x3a;
+	aes_encrypt_128_be(es_rk, mac, mac);
+	// AES-CCM CTR
+	ctr32[0] = 0;
+	memcpy(ctr + 3, nonce, AES_CCM_NONCE_LEN);
+	ctr[0xf] = 2;
+	// AES-CCM start
+	zero(pad32);
+	aes_ctr(es_rk, ctr32, pad32, pad32);
+	add_128_32(ctr32, 1);
+	// AES-CCM loop
+	if (mode == DECRYPT) {
+		for (unsigned i = 0; i < block_size; i += 16) {
+			aes_ctr(es_rk, ctr32, (uint32_t*)(buf + i), (uint32_t*)(buf + i));
+			add_128_32(ctr32, 1);
+			xor_128(mac32, mac32, (uint32_t*)(buf + i));
+			aes_encrypt_128_be(es_rk, mac, mac);
+		}
+	} else {
+		for (unsigned i = 0; i < block_size; i += 16) {
+			xor_128(mac32, mac32, (uint32_t*)(buf + i));
+			aes_encrypt_128_be(es_rk, mac, mac);
+			aes_ctr(es_rk, ctr32, (uint32_t*)(buf + i), (uint32_t*)(buf + i));
+			add_128_32(ctr32, 1);
+		}
+	}
+	// AES-CCM MAC final
+	xor_128(mac32, mac32, pad32);
+	if (mode == DECRYPT) {
+		if (memcmp(mac, ccm_mac, 16) == 0) {
+			if (remainder != 0) {
+				// restore mac
+				memcpy(footer->ccm_mac, ccm_mac, AES_CCM_MAC_LEN);
+			}
+			// restore nonce
+			memcpy(footer->nonce, nonce, AES_CCM_NONCE_LEN);
+			return 0;
+		} else {
+			//printf("MAC verification failed\n");
+			return 1;
+		}
+	} else {
+		memcpy(footer->ccm_mac, mac, AES_CCM_MAC_LEN);
+		// AES-CTR crypt later half of footer
+		zero(ctr32);
+		memcpy(ctr + 1, nonce, AES_CCM_NONCE_LEN);
+		memcpy(pad, footer->encrypted, AES_BLOCK_SIZE);
+		aes_ctr(es_rk, ctr32, pad32, pad32);
+		memcpy(footer->encrypted, pad, AES_BLOCK_SIZE);
+		// restore nonce
+		memcpy(footer->nonce, nonce, AES_CCM_NONCE_LEN);
+		return 0;
+	}
+#undef ctr
+#undef pad
+#undef mac
+#undef zero
+}*/
--- a/arm9/source/crypto.h
+++ b/arm9/source/crypto.h
@ -0,0 +1,35 @@
+#pragma once
+/* DSi NAND / boot2 crypto helpers (AES-CTR keys derived from the console ID). */
+#include <nds.h>
+
+#define SHA1_LEN 20
+
+#define AES_BLOCK_SIZE 16
+/* Direction argument of dsi_es_block_crypt(). */
+typedef enum {
+	ENCRYPT,
+	DECRYPT
+} crypt_mode_t;
+/* Selects which key derivation dsi_aes_set_key() in crypto.c applies. */
+typedef enum {
+	NAND,
+	NAND_3DS,
+	ES
+} key_mode_t;
+
+// NOTE(review): <nds.h> is already included above, so this local prototype may be redundant - confirm
+void swiSHA1Calc(void *digest, const void *buf, size_t len);
+/* Returns 0 when the SHA-1 of data[0..len) equals digest_verify. */
+int dsi_sha1_verify(const void *digest_verify, const void *data, unsigned len);
+/* Must be called before any of the crypt functions below. */
+void dsi_crypt_init(const uint8_t *console_id_be, const uint8_t *emmc_cid, int is3DS);
+/* Crypt one AES block; offset is counted in AES blocks. */
+void dsi_nand_crypt_1(uint8_t *out, const uint8_t* in, u32 offset);
+/* Crypt count AES blocks starting at AES-block index offset. */
+void dsi_nand_crypt(uint8_t *out, const uint8_t* in, u32 offset, unsigned count);
+/*
+ * NOTE(review): the only definition of this function in crypto.c is commented
+ * out in this commit, so calling it would presumably fail to link - confirm.
+ */
+int dsi_es_block_crypt(uint8_t *buf, unsigned buf_len, crypt_mode_t mode);
+/* Seed the boot2 counter from size_r. */
+void dsi_boot2_crypt_set_ctr(uint32_t size_r);
+/* Crypt count AES blocks with the boot2 key, advancing the boot2 counter. */
+void dsi_boot2_crypt(uint8_t* out, const uint8_t* in, unsigned count);
--- a/arm9/source/driveMenu.cpp
+++ b/arm9/source/driveMenu.cpp
@ -222,10 +222,14 @@ void driveMenu (void) {
 			dmAssignedOp[i] = -1;
 		}
 		dmMaxCursors = -1;
-		if (isDSiMode() && sdMounted){
+		if (sdMounted){
 			dmMaxCursors++;
 			dmAssignedOp[dmMaxCursors] = 0;
 		}
+		if (nandMounted) {
+			dmMaxCursors++;
+			dmAssignedOp[dmMaxCursors] = 7;
+		}
 		if (flashcardMounted) {
 			dmMaxCursors++;
 			dmAssignedOp[dmMaxCursors] = 1;
@ -238,10 +242,6 @@ void driveMenu (void) {
 			dmMaxCursors++;
 			dmAssignedOp[dmMaxCursors] = 6;
 		}
-		if (nandMounted) {
-			dmMaxCursors++;
-			dmAssignedOp[dmMaxCursors] = 7;
-		}
 		if (expansionPakFound
 		|| (io_dldi_data->ioInterface.features & FEATURE_SLOT_GBA)
 		|| (isDSiMode() && !(REG_SCFG_MC & BIT(0)))) {
--- a/arm9/source/driveOperations.cpp
+++ b/arm9/source/driveOperations.cpp
@ -10,7 +10,7 @@
 #include "lzss.h"
 #include "ramd.h"
 #include "ramdrive-include.h"
-#include "nand.h"
+#include "nandio.h"
 #include "tonccpy.h"

 static sNDSHeader nds;
@ -134,7 +134,7 @@ bool bothSDandFlashcard(void) {
 }

 TWL_CODE bool nandMount(void) {
-	fatMountSimple("nand", &io_nand);
+	fatMountSimple("nand", &io_dsi_nand);
 	if (nandFound()) {
 		nandMountedDone = true;
 		struct statvfs st;
--- a/arm9/source/f_xy.c
+++ b/arm9/source/f_xy.c
@ -0,0 +1,199 @@
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include "types.h"
+#include "utils.h"
+
+//#define DEBUG
+
+// Read the four 32-bit words of `in` through getbe32() and pack them pairwise
+// into two u64 values: out[0] = word0:word1, out[1] = word2:word3.
+void aes_flip_to_64(u32 *in, u64* out)
+{
+    const u32 w0 = getbe32((u8*)&in[0]);
+    const u32 w1 = getbe32((u8*)&in[1]);
+    const u32 w2 = getbe32((u8*)&in[2]);
+    const u32 w3 = getbe32((u8*)&in[3]);
+
+    out[0] = ((u64)w0 << 32) | (u64)w1;
+    out[1] = ((u64)w2 << 32) | (u64)w3;
+}
+
+// Counterpart of aes_flip_to_64(): split two u64 values back into four 32-bit
+// words, reading each 4-byte half through getbe32().
+void aes_unflip_to_32(u64* in, u32* out)
+{
+    u8 *bytes = (u8*)in;
+
+    out[0] = getbe32(bytes + 4);
+    out[1] = getbe32(bytes);
+    out[2] = getbe32(bytes + 12);
+    out[3] = getbe32(bytes + 8);
+}
+
+// Rotate the 128-bit value in num[0..3] left by `shift` bits, working on the
+// two 64-bit halves produced by aes_flip_to_64().
+// The shift is reduced modulo 128 and a shift of 64 or more is handled by
+// swapping the halves first, so no C shift ever uses a count of 64 (which
+// would be undefined behaviour, e.g. for shift == 0).
+void n128_lrot_3ds_internal(u32 *num, u32 shift)
+{
+    u64 half[2];
+    u64 rotated[2];
+
+    aes_flip_to_64(num, half);
+
+    shift &= 127;
+    if (shift >= 64)
+    {
+        const u64 swap = half[0];
+        half[0] = half[1];
+        half[1] = swap;
+        shift -= 64;
+    }
+
+    if (shift == 0)
+    {
+        rotated[0] = half[0];
+        rotated[1] = half[1];
+    }
+    else
+    {
+        rotated[0] = (half[0] << shift) | (half[1] >> (64 - shift));
+        rotated[1] = (half[1] << shift) | (half[0] >> (64 - shift));
+    }
+
+    aes_unflip_to_32(rotated, num);
+}
+
+// Rotate the 128-bit value in num[0..3] right by `shift` bits, working on the
+// two 64-bit halves produced by aes_flip_to_64().
+// The shift is reduced modulo 128 and a shift of 64 or more is handled by
+// swapping the halves first, so no C shift ever uses a count of 64 (which
+// would be undefined behaviour, e.g. for shift == 0).
+void n128_rrot_3ds_internal(u32 *num, u32 shift)
+{
+    u64 half[2];
+    u64 rotated[2];
+
+    aes_flip_to_64(num, half);
+
+    shift &= 127;
+    if (shift >= 64)
+    {
+        const u64 swap = half[0];
+        half[0] = half[1];
+        half[1] = swap;
+        shift -= 64;
+    }
+
+    if (shift == 0)
+    {
+        rotated[0] = half[0];
+        rotated[1] = half[1];
+    }
+    else
+    {
+        rotated[0] = (half[0] >> shift) | (half[1] << (64 - shift));
+        rotated[1] = (half[1] >> shift) | (half[0] << (64 - shift));
+    }
+
+    aes_unflip_to_32(rotated, num);
+}
+
+// Rotate num left by `shift` bits, applied in steps of at most 32 bits.
+void n128_lrot_3ds(u32 *num, u32 shift)
+{
+    while(shift != 0)
+    {
+        const u32 step = (shift >= 32) ? 32 : shift;
+        shift -= step;
+        n128_lrot_3ds_internal(num, step);
+    }
+}
+
+// Rotate num right by `shift` bits, applied in steps of at most 32 bits.
+void n128_rrot_3ds(u32 *num, u32 shift)
+{
+    while(shift != 0)
+    {
+        const u32 step = (shift >= 32) ? 32 : shift;
+        shift -= step;
+        n128_rrot_3ds_internal(num, step);
+    }
+}
+
+// a += b on the flipped 64-bit representation: the [0] halves are added
+// first and their carry is propagated into the [1] halves; the result is
+// written back to a.
+void n128_add_3ds(u32 *a, u32 *b)
+{
+    u64 x[2];
+    u64 y[2];
+    aes_flip_to_64(a, x);
+    aes_flip_to_64(b, y);
+
+    const u64 first = x[0] + y[0];
+    const u64 carry = (first < x[0]) ? 1 : 0;	// wrapped round => carry out
+
+    x[1] = x[1] + y[1] + carry;
+    x[0] = first;
+    aes_unflip_to_32(x, a);
+}
+
+// Rotate the 128-bit value {num[0] = low half, num[1] = high half} left by
+// `shift` bits. Any shift is accepted: it is reduced modulo 128, and a shift
+// of 64 or more is handled by swapping the halves first, so no C shift ever
+// uses a count of 64 (undefined behaviour, previously hit for shift == 0).
+void n128_lrot(uint64_t *num, uint32_t shift)
+{
+	uint64_t lo = num[0];
+	uint64_t hi = num[1];
+
+	shift &= 127;
+	if (shift >= 64) {
+		const uint64_t swap = lo;
+		lo = hi;
+		hi = swap;
+		shift -= 64;
+	}
+
+	if (shift != 0) {
+		const uint64_t new_lo = (lo << shift) | (hi >> (64 - shift));
+		const uint64_t new_hi = (hi << shift) | (lo >> (64 - shift));
+		lo = new_lo;
+		hi = new_hi;
+	}
+
+	num[0] = lo;
+	num[1] = hi;
+}
+
+// Rotate the 128-bit value {num[0] = low half, num[1] = high half} right by
+// `shift` bits. Any shift is accepted: it is reduced modulo 128, and a shift
+// of 64 or more is handled by swapping the halves first, so no C shift ever
+// uses a count of 64 (undefined behaviour, previously hit for shift == 0).
+void n128_rrot(uint64_t *num, uint32_t shift)
+{
+	uint64_t lo = num[0];
+	uint64_t hi = num[1];
+
+	shift &= 127;
+	if (shift >= 64) {
+		const uint64_t swap = lo;
+		lo = hi;
+		hi = swap;
+		shift -= 64;
+	}
+
+	if (shift != 0) {
+		const uint64_t new_lo = (lo >> shift) | (hi << (64 - shift));
+		const uint64_t new_hi = (hi >> shift) | (lo << (64 - shift));
+		lo = new_lo;
+		hi = new_hi;
+	}
+
+	num[0] = lo;
+	num[1] = hi;
+}
+
+// 128-bit addition a += b with a[0]/b[0] as the low halves; the carry out of
+// the low halves is added into the high halves.
+void n128_add(uint64_t *a, uint64_t *b)
+{
+	const uint64_t low = a[0] + b[0];
+	const uint64_t carry = (low < a[0]) ? 1 : 0;	// wrapped round => carry out
+
+	a[1] = a[1] + b[1] + carry;
+	a[0] = low;
+}
+
+// 128-bit subtraction a -= b with a[0]/b[0] as the low halves.
+// A borrow is taken from the high halves exactly when the low half of a is
+// smaller than the low half of b. (The previous shifted-halves formula missed
+// borrows such as 0 - 1 and produced spurious ones when both low halves had
+// their top bit set.)
+void n128_sub(uint64_t *a, uint64_t *b)
+{
+	const uint64_t borrow = (a[0] < b[0]) ? 1 : 0;
+
+	a[1] = a[1] - b[1] - borrow;
+	a[0] = a[0] - b[0];
+}
+
+// Key scrambler: key = ((key_x XOR key_y) + magic constant) rotated left by 42.
+// All three arguments point at 16 bytes. The inputs are copied into local
+// 64-bit arrays before `key` is written, so `key` may alias `key_x` or
+// `key_y`, and no pointer is ever cast to uint64_t* (which would require
+// 8-byte alignment the callers do not guarantee).
+void F_XY(uint32_t *key, uint32_t *key_x, uint32_t *key_y)
+{
+	static const uint32_t magic[4] =
+		{0x1a4f3e79, 0x2a680f5f, 0x29590258, 0xfffefb4e};
+	uint64_t x[2];
+	uint64_t y[2];
+	uint64_t acc[2];
+
+	memcpy(x, key_x, sizeof x);
+	memcpy(y, key_y, sizeof y);
+	memcpy(acc, magic, sizeof acc);
+
+	// byte-wise XOR of the two inputs, done 64 bits at a time
+	x[0] ^= y[0];
+	x[1] ^= y[1];
+
+	n128_add(acc, x);
+	n128_lrot(acc, 42);
+
+	memcpy(key, acc, sizeof acc);
+}
+
+// F_XY_reverse undoes F(X^Y): given the (normal) key it rotates right by 42
+// and subtracts the magic constant, yielding the original X^Y in key_xy.
+// Both arguments point at 16 bytes. `key` is copied before key_xy is written,
+// so the two may alias, and the arithmetic runs on properly aligned local
+// 64-bit arrays rather than on casted uint32_t pointers.
+void F_XY_reverse(uint32_t *key, uint32_t *key_xy)
+{
+	static const uint32_t magic[4] =
+		{0x1a4f3e79, 0x2a680f5f, 0x29590258, 0xfffefb4e};
+	uint64_t acc[2];
+	uint64_t magic64[2];
+
+	memcpy(acc, key, sizeof acc);
+	memcpy(magic64, magic, sizeof magic64);
+
+	n128_rrot(acc, 42);
+	n128_sub(acc, magic64);
+
+	memcpy(key_xy, acc, sizeof acc);
+}
+
--- a/arm9/source/f_xy.h
+++ b/arm9/source/f_xy.h
@ -0,0 +1,20 @@
+#ifndef _H_F_XY
+#define _H_F_XY
+/*
+ * Key-scrambler and 128-bit rotate/add helpers implemented in f_xy.c.
+ * NOTE(review): this header uses uint32_t and u32 without including anything,
+ * so it presumably relies on the includer providing them - confirm.
+ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* key = ((key_x ^ key_y) + constant) rol 42, and its inverse. */
+void F_XY(uint32_t *key, uint32_t *key_x, uint32_t *key_y);
+void F_XY_reverse(uint32_t *key, uint32_t *key_xy);
+/* Rotate / add on four 32-bit words that are read through getbe32(). */
+void n128_lrot_3ds(u32 *num, u32 shift);
+void n128_rrot_3ds(u32 *num, u32 shift);
+void n128_add_3ds(u32 *a, u32 *b);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
--- a/arm9/source/main.cpp
+++ b/arm9/source/main.cpp
@ -207,6 +207,9 @@ int main(int argc, char **argv) {
 		}
 		nandMounted = nandMount();
 		is3DS = ((access("sd:/Nintendo 3DS", F_OK) == 0) && (*(vu32*)(0x0DFFFE0C) == 0x474D3969));
+		/*FILE* cidFile = fopen("sd:/gm9i/CID.bin", "wb");
+		fwrite((void*)0x2FFD7BC, 1, 16, cidFile);
+		fclose(cidFile);*/
 	} /*else if (isRegularDS) {
 		*(vu32*)(0x08240000) = 1;
 		expansionPakFound = ((*(vu32*)(0x08240000) == 1) && (io_dldi_data->ioInterface.features & FEATURE_SLOT_NDS));
--- a/arm9/source/nand.c
+++ b/arm9/source/nand.c
@ -1,55 +0,0 @@
-
-#include <nds.h>
-#include <nds/disc_io.h>
-#include <stdio.h>
-
-#define SECTOR_SIZE 512
-
-static FILE* nandFile;
-
-bool nand_startup() {
-	nandFile = fopen("sd:/nand.bin", "rb");
-	if (nandFile) {
-		return true;
-	}
-	return false;
-}
-
-bool nand_is_inserted() {
-	if (nandFile) {
-		return true;
-	}
-	return false;
-}
-
-bool nand_read_sectors(sec_t sector, sec_t numSectors, void *buffer) {
-	if (!nandFile) return false;
-
-	fseek(nandFile, (sector << 9), SEEK_SET);
-	fread(buffer, 1, (numSectors << 9), nandFile);
-	return true;
-}
-
-bool nand_write_sectors(sec_t sector, sec_t numSectors, const void *buffer) {
-	return false;
-}
-
-bool nand_clear_status() {
-	return true;
-}
-
-bool nand_shutdown() {
-	fclose(nandFile);
-	return true;
-}
-
-const DISC_INTERFACE io_nand = {
-	('N' << 24) | ('A' << 16) | ('N' << 8) | 'D',
-	FEATURE_MEDIUM_CANREAD,
-	nand_startup,
-	nand_is_inserted,
-	nand_read_sectors,
-	nand_write_sectors,
-	nand_clear_status,
-	nand_shutdown
-};
--- a/arm9/source/nand.h
+++ b/arm9/source/nand.h
@ -1,6 +0,0 @@
-#pragma once
-
-#include <nds.h>
-#include <nds/disc_io.h>
-
-extern const DISC_INTERFACE io_nand;
--- a/arm9/source/nandio.c
+++ b/arm9/source/nandio.c
@ -0,0 +1,117 @@
+
+#include <nds.h>
+#include <nds/disc_io.h>
+#include <malloc.h>
+#include "crypto.h"
+#include "sector0.h"
+
+//#define SECTOR_SIZE 512
+#define CRYPT_BUF_LEN 64
+
+extern bool nand_Startup();
+
+static u8* crypt_buf = 0;
+
+static u32 fat_sig_fix_offset = 0;
+
+static u32 sector_buf32[SECTOR_SIZE/sizeof(u32)];
+static u8 *sector_buf = (u8*)sector_buf32;
+
+// Record the sector number whose contents should get the "FAT" signature
+// patch applied by read_sectors(). Passing 0 disables the patch, because
+// read_sectors() only patches when the stored offset is non-zero.
+void nandio_set_fat_sig_fix(u32 offset) {
+	fat_sig_fix_offset = offset;
+}
+
+// DISC_INTERFACE startup hook: prepares the DSi NAND for decrypted reads.
+//
+// Returns false when the underlying NAND driver fails to start, when sector 0
+// cannot be read, when sector 0 carries a valid NCSD header (a 3DS NAND,
+// which is not handled here), or when the bounce buffer used for decryption
+// cannot be allocated. Returns true otherwise.
+bool nandio_startup() {
+	if (!nand_Startup()) return false;
+
+	// Without a good sector 0 the NCSD check below would inspect stale
+	// data, so a failed read is treated as a startup failure.
+	if (!nand_ReadSectors(0, 1, sector_buf)) return false;
+	int is3DS = parse_ncsd(sector_buf, 0) == 0;
+	if (is3DS) return false;
+
+	// Both words are filled in by code outside this function, so they must
+	// be accessed through volatile pointers; with plain pointers the
+	// compiler may load the request word once and spin forever.
+	// NOTE(review): if this region is data-cached on the ARM9, a cache
+	// invalidate or an uncached mirror may be needed as well - confirm.
+	volatile u32 *emmc_cid = (volatile u32*)0x2FFD7BC;
+	volatile u32 *cid_request = (volatile u32*)0x2FFFD0C;
+	if (*emmc_cid == 0) {
+		// Get eMMC CID: post the request word (ASCII "EMMC") and wait
+		// until it is cleared again.
+		*cid_request = 0x454D4D43;
+		while (*cid_request != 0);
+	}
+
+	// Key setup from the ConsoleID (0x2FFFD00) and the eMMC CID (0x2FFD7BC).
+	dsi_crypt_init((const u8*)0x2FFFD00, (const u8*)0x2FFD7BC, is3DS);
+
+	// The bounce buffer survives repeated startups; allocate it only once.
+	if (crypt_buf == 0) {
+		crypt_buf = (u8*)memalign(32, SECTOR_SIZE * CRYPT_BUF_LEN);
+	}
+	return crypt_buf != 0;
+}
+
+// DISC_INTERFACE presence hook: always reports the medium as present.
+bool nandio_is_inserted() {
+	return true;
+}
+
+// Read `len` raw sectors starting at sector `start` into the bounce buffer
+// and run them through dsi_nand_crypt() into `buffer`.
+// len is guaranteed <= CRYPT_BUF_LEN by nandio_read_sectors(), so the raw
+// data always fits in crypt_buf.
+// Returns false when the bounce buffer is missing or the raw read fails.
+static bool read_sectors(sec_t start, sec_t len, void *buffer) {
+	// crypt_buf is NULL before a successful nandio_startup() and after
+	// nandio_shutdown(); reading into it would write through a null pointer.
+	if (crypt_buf == 0) {
+		return false;
+	}
+	if (!nand_ReadSectors(start, len, crypt_buf)) {
+		//printf("NANDIO: read error\n");
+		return false;
+	}
+	// Byte offset and byte length are passed in units of AES_BLOCK_SIZE.
+	dsi_nand_crypt(buffer, crypt_buf, start * SECTOR_SIZE / AES_BLOCK_SIZE, len * SECTOR_SIZE / AES_BLOCK_SIZE);
+
+	// Optional fix-up: when this read starts at the sector registered via
+	// nandio_set_fat_sig_fix() and bytes 0x36..0x38 are all zero, fill in
+	// "FAT" there.
+	u8 *out = (u8*)buffer;
+	if (fat_sig_fix_offset &&
+		start == fat_sig_fix_offset
+		&& out[0x36] == 0
+		&& out[0x37] == 0
+		&& out[0x38] == 0)
+	{
+		out[0x36] = 'F';
+		out[0x37] = 'A';
+		out[0x38] = 'T';
+	}
+	return true;
+}
+
+// DISC_INTERFACE read hook: read `len` sectors starting at sector `offset`
+// into `buffer`. The request is served in pieces of at most CRYPT_BUF_LEN
+// sectors, the capacity of the bounce buffer used by read_sectors().
+// Returns false as soon as one piece fails; a zero-length request succeeds.
+bool nandio_read_sectors(sec_t offset, sec_t len, void *buffer) {
+	u8 *dst = (u8*)buffer;
+	while (len > 0) {
+		sec_t chunk = (len >= CRYPT_BUF_LEN) ? CRYPT_BUF_LEN : len;
+		if (!read_sectors(offset, chunk, dst)) {
+			return false;
+		}
+		offset += chunk;
+		len -= chunk;
+		dst += SECTOR_SIZE * chunk;
+	}
+	return true;
+}
+
+// DISC_INTERFACE write hook: writing is deliberately unsupported, so every
+// request is rejected without touching the NAND.
+bool nandio_write_sectors(sec_t offset, sec_t len, const void *buffer) {
+	// lol, nope
+	return false;
+}
+
+// DISC_INTERFACE status hook: there is no state to clear, so this always
+// succeeds.
+bool nandio_clear_status() {
+	return true;
+}
+
+// DISC_INTERFACE shutdown hook: releases the bounce buffer allocated by
+// nandio_startup() and always reports success.
+bool nandio_shutdown() {
+	free(crypt_buf);
+	// reset so a later nandio_startup() allocates a fresh buffer
+	crypt_buf = 0;
+	return true;
+}
+
+// Disc interface descriptor for the DSi NAND. It is tagged with the
+// four-character ID 'NAND', advertises read support only
+// (FEATURE_MEDIUM_CANREAD), and routes every operation to the nandio_*
+// functions defined in this file.
+const DISC_INTERFACE io_dsi_nand = {
+	('N' << 24) | ('A' << 16) | ('N' << 8) | 'D',
+	FEATURE_MEDIUM_CANREAD,
+	nandio_startup,
+	nandio_is_inserted,
+	nandio_read_sectors,
+	nandio_write_sectors,
+	nandio_clear_status,
+	nandio_shutdown
+};
--- a/arm9/source/nandio.h
+++ b/arm9/source/nandio.h
@ -0,0 +1,8 @@
+#pragma once
+
+#include <nds.h>
+#include <nds/disc_io.h>
+
+// Register the sector at which reads get the "FAT" signature fix-up
+// (0 disables it).
+void nandio_set_fat_sig_fix(u32 offset);
+
+// Read-only disc interface for the DSi NAND, defined in nandio.c.
+extern const DISC_INTERFACE io_dsi_nand;
--- a/arm9/source/sector0.c
+++ b/arm9/source/sector0.c
@ -0,0 +1,100 @@
+
+#include <stdio.h>
+#include <string.h>
+#include <inttypes.h>
+#include "utils.h"
+#include "sector0.h"
+
+// Check whether sector 0 holds an NCSD header.
+// Returns 0 for a valid NCSD header, -1 when the magic does not match and
+// -2 when a partition entry has a file-system type other than 1, 3 or 4.
+// `verbose` currently has no visible effect: every print it used to guard
+// is commented out.
+int parse_ncsd(const uint8_t sector0[SECTOR_SIZE], int verbose) {
+	const ncsd_header_t * h = (ncsd_header_t *)sector0;
+	// 0x4453434e is "NCSD" when the four bytes are read as a little-endian u32
+	if (h->magic == 0x4453434e) {
+		if (verbose) {
+			//printf("NCSD magic found\n");
+		}
+	} else {
+		if (verbose) {
+			//printf("NCSD magic not found\n");
+		}
+		return -1;
+	}
+	if (verbose) {
+		//iprintf("size: %" PRIu32 " sectors, %s MB\n", h->size, to_mebi(h->size * SECTOR_SIZE));
+		//iprintf("media ID: %08" PRIx32 "%08" PRIx32 "\n", h->media_id_h, h->media_id_l);
+	}
+
+	// Walk the partition slots; the first slot with type 0 ends the list.
+	for (unsigned i = 0; i < NCSD_PARTITIONS; ++i) {
+		unsigned fs_type = h->fs_types[i];
+		if (fs_type == 0) {
+			break;
+		}
+		// s_fs_type is only consumed by the commented-out print below; the
+		// switch still matters because unknown types reject the header.
+		const char *s_fs_type;
+		switch (fs_type) {
+			case 1:
+				s_fs_type = "Normal";
+				break;
+			case 3:
+				s_fs_type = "FIRM";
+				break;
+			case 4:
+				s_fs_type = "AGB_FIRM save";
+				break;
+			default:
+				if (verbose) {
+					//iprintf("invalid partition type %d\n", fs_type);
+				}
+				return -2;
+		}
+		if (verbose) {
+			// yes I use MB for "MiB", bite me
+			//iprintf("partition %u, %s, crypt: %" PRIu8 ", offset: 0x%08" PRIx32 ", length: 0x%08" PRIx32 "(%s MB)\n",
+				//i, s_fs_type, h->crypt_types[i],
+				//h->partitions[i].offset, h->partitions[i].length, to_mebi(h->partitions[i].length * SECTOR_SIZE));
+		}
+	}
+	return 0;
+}
+
+// Reference MBR partition table for a DSi NAND, used by parse_mbr().
+// Each row is {status, {head, sector, cylinder} of the first sector, type,
+// {head, sector, cylinder} of the last sector, offset, length}.
+// parse_mbr() compares only the first row against the sector being checked.
+const mbr_partition_t ptable_DSi[MBR_PARTITIONS] = {
+	{0, {3, 24, 4}, 6, {15, 224, 59}, 0x00000877, 0x00066f89},
+	{0, {2, 206, 60}, 6, {15, 224, 190}, 0x0006784d, 0x000105b3},
+	{0, {2, 222, 191}, 1, {15, 224, 191}, 0x00077e5d, 0x000001a3},
+	{0, {0, 0, 0}, 0, {0, 0, 0}, 0, 0}
+};
+
+// Reference MBR partition table for a 3DS NAND, used by parse_mbr() when
+// is3DS is non-zero. Same row layout as ptable_DSi; only two of the four
+// slots are populated.
+const mbr_partition_t ptable_3DS[MBR_PARTITIONS] = {
+	{0, {4, 24, 0}, 6, {1, 160, 63}, 0x00000097, 0x00047da9},
+	{0, {4, 142, 64}, 6, {1, 160, 195}, 0x0004808d, 0x000105b3},
+	{0, {0, 0, 0}, 0, {0, 0, 0}, 0, 0},
+	{0, {0, 0, 0}, 0, {0, 0, 0}, 0, 0}
+};
+
+// Validate an MBR sector against the layout expected for the console type.
+//
+// sector0: SECTOR_SIZE bytes holding the MBR to check.
+// is3DS:   non-zero selects the 3DS reference table, zero the DSi one.
+// verbose: currently unused; all diagnostics are disabled.
+//
+// Returns 0 for a valid MBR, -1 when the 0x55/0xaa boot signature is wrong
+// and -2 when the first partition entry differs from the reference table
+// (-2 wins when both checks fail).
+int parse_mbr(const uint8_t sector0[SECTOR_SIZE], int is3DS, int verbose) {
+	const mbr_t *m = (mbr_t*)sector0;
+	// reference partition table
+	const mbr_partition_t *ref_ptable = is3DS ? ptable_3DS : ptable_DSi;
+	int ret = 0;
+	(void)verbose;
+
+	if (m->boot_signature_0 != 0x55 || m->boot_signature_1 != 0xaa) {
+		//printf("invalid boot signature(0x55, 0xaa)\n");
+		ret = -1;
+	}
+	// A non-zero bootstrap area on DSi is unusual but tolerated. The old
+	// scan for it assigned ret = 0, which silently erased the signature
+	// error recorded above; the area is therefore no longer inspected.
+
+	// only test the 1st partition now, we've seen variations on the 3rd partition
+	// and after all we only care about the 1st partition
+	if (memcmp(ref_ptable, m->partitions, sizeof(mbr_partition_t))) {
+		//printf("invalid partition table\n");
+		ret = -2;
+	}
+	return ret;
+}
--- a/arm9/source/sector0.h
+++ b/arm9/source/sector0.h
@ -0,0 +1,72 @@
+#pragma once
+
+#include <stdint.h>
+#include <assert.h>
+
+// https://3dbrew.org/wiki/NCSD#NCSD_header
+
+#define SECTOR_SIZE 0x200
+
+#define NCSD_PARTITIONS 8
+
+#ifdef _MSC_VER
+#pragma pack(push, 1)
+#define __PACKED
+#elif defined __GNUC__
+#define __PACKED __attribute__ ((__packed__))
+#endif
+
+// Location of one NCSD partition as stored in the header.
+// NOTE(review): offset/length are presumably counted in sectors (the
+// disabled diagnostics multiply length by SECTOR_SIZE) - confirm.
+typedef struct {
+	uint32_t offset;
+	uint32_t length;
+} __PACKED ncsd_partition_t;
+
+// On-disk NCSD header (0x160 bytes, enforced by the static_assert below).
+typedef struct {
+	uint8_t signature[0x100];
+	uint32_t magic;		// compared against 0x4453434e ("NCSD") by parse_ncsd()
+	uint32_t size;
+	uint32_t media_id_l;
+	uint32_t media_id_h;
+	uint8_t fs_types[NCSD_PARTITIONS];	// per partition; 0 ends the list in parse_ncsd()
+	uint8_t crypt_types[NCSD_PARTITIONS];
+	ncsd_partition_t partitions[NCSD_PARTITIONS];
+} __PACKED ncsd_header_t;
+
+// Three-byte cylinder/head/sector address as it appears in an MBR
+// partition entry (stored in head, sector, cylinder order).
+typedef struct {
+	uint8_t head;
+	uint8_t sector;
+	uint8_t cylinder;
+} __PACKED chs_t;
+
+// One 16-byte entry of the MBR partition table.
+// NOTE(review): offset/length are presumably sector counts, as in a
+// standard MBR entry - confirm.
+typedef struct {
+	uint8_t status;
+	chs_t chs_first;
+	uint8_t type;
+	chs_t chs_last;
+	uint32_t offset;
+	uint32_t length;
+} __PACKED mbr_partition_t;
+
+#define MBR_PARTITIONS 4
+// or 446 in decimal, all zero on DSi in all my samples
+#define MBR_BOOTSTRAP_SIZE 0x1be
+
+// Full MBR sector: bootstrap area, four partition entries and the two
+// boot signature bytes (checked for 0x55, 0xaa by parse_mbr()). The
+// static_assert below pins the size to SECTOR_SIZE.
+typedef struct {
+	uint8_t bootstrap[MBR_BOOTSTRAP_SIZE];
+	mbr_partition_t partitions[MBR_PARTITIONS];
+	uint8_t boot_signature_0;
+	uint8_t boot_signature_1;
+} __PACKED mbr_t;
+
+#ifdef _MSC_VER
+#pragma pack(pop)
+#endif
+#undef __PACKED
+
+
+static_assert(sizeof(ncsd_header_t) == 0x160, "sizeof(ncsd_header_t) should equal 0x160");
+static_assert(sizeof(mbr_t) == SECTOR_SIZE, "sizeof(mbr_t) should equal 0x200");
+
+int parse_ncsd(const uint8_t sector0[SECTOR_SIZE], int verbose);
+
+int parse_mbr(const uint8_t sector0[SECTOR_SIZE], int is3DS, int verbose);
--- a/arm9/source/sha1.c
+++ b/arm9/source/sha1.c
@ -0,0 +1,242 @@
+/*
+ ---------------------------------------------------------------------------
+ Copyright (c) 2002, Dr Brian Gladman, Worcester, UK.   All rights reserved.
+
+ LICENSE TERMS
+
+ The free distribution and use of this software in both source and binary
+ form is allowed (with or without changes) provided that:
+
+   1. distributions of this source code include the above copyright
+      notice, this list of conditions and the following disclaimer;
+
+   2. distributions in binary form include the above copyright
+      notice, this list of conditions and the following disclaimer
+      in the documentation and/or other associated materials;
+
+   3. the copyright holder's name is not used to endorse products
+      built using this software without specific written permission.
+
+ ALTERNATIVELY, provided that this notice is retained in full, this product
+ may be distributed under the terms of the GNU General Public License (GPL),
+ in which case the provisions of the GPL apply INSTEAD OF those given above.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
+ Issue Date: 01/08/2005
+
+ This is a byte oriented version of SHA1 that operates on arrays of bytes
+ stored in memory.
+*/
+
+#include <string.h>     /* for memcpy() etc.        */
+
+#include "sha1.h"
+
+#if defined(__cplusplus)
+extern "C"
+{
+#endif
+
+#define SHA1_BLOCK_SIZE  64
+
+#define rotl32(x,n)   (((x) << n) | ((x) >> (32 - n)))
+#define rotr32(x,n)   (((x) >> n) | ((x) << (32 - n)))
+
+#define bswap_32(x) ((rotr32((x), 24) & 0x00ff00ff) | (rotr32((x), 8) & 0xff00ff00))
+
+/* bsw_32(p,n): byte-swap the first n 32-bit words at p in place.           */
+/* NOTE(review): neither PLATFORM_BYTE_ORDER nor IS_LITTLE_ENDIAN is        */
+/* defined in this file or in sha1.h. Unless the build supplies them, both  */
+/* evaluate to 0 inside #if, so the swapping variant is always selected.    */
+/* That is what a little-endian target needs, but only by coincidence -     */
+/* confirm before building for a big-endian target.                         */
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+#define bsw_32(p,n) \
+    { int _i = (n); while(_i--) ((uint32_t*)p)[_i] = bswap_32(((uint32_t*)p)[_i]); }
+#else
+#define bsw_32(p,n)
+#endif
+
+#define SHA1_MASK   (SHA1_BLOCK_SIZE - 1)
+
+#if 0
+
+#define ch(x,y,z)       (((x) & (y)) ^ (~(x) & (z)))
+#define parity(x,y,z)   ((x) ^ (y) ^ (z))
+#define maj(x,y,z)      (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
+
+#else   /* Discovered by Rich Schroeppel and Colin Plumb   */
+
+#define ch(x,y,z)       ((z) ^ ((x) & ((y) ^ (z))))
+#define parity(x,y,z)   ((x) ^ (y) ^ (z))
+#define maj(x,y,z)      (((x) & (y)) | ((z) & ((x) ^ (y))))
+
+#endif
+
+/* Compile 64 bytes of hash data into SHA1 context. Note    */
+/* that this routine assumes that the byte order in the     */
+/* ctx->wbuf[] at this point is in such an order that low   */
+/* address bytes in the ORIGINAL byte stream will go in     */
+/* this buffer to the high end of 32-bit words on BOTH big  */
+/* and little endian systems                                */
+
+#ifdef ARRAY
+#define q(v,n)  v[n]
+#else
+#define q(v,n)  v##n
+#endif
+
+#define one_cycle(v,a,b,c,d,e,f,k,h)            \
+    q(v,e) += rotr32(q(v,a),27) +               \
+              f(q(v,b),q(v,c),q(v,d)) + k + h;  \
+    q(v,b)  = rotr32(q(v,b), 2)
+
+#define five_cycle(v,f,k,i)                 \
+    one_cycle(v, 0,1,2,3,4, f,k,hf(i  ));   \
+    one_cycle(v, 4,0,1,2,3, f,k,hf(i+1));   \
+    one_cycle(v, 3,4,0,1,2, f,k,hf(i+2));   \
+    one_cycle(v, 2,3,4,0,1, f,k,hf(i+3));   \
+    one_cycle(v, 1,2,3,4,0, f,k,hf(i+4))
+
+/* Run the 80 SHA1 rounds over the 16 words in ctx->wbuf and add the        */
+/* resulting working variables into ctx->hash. ctx->wbuf is overwritten:    */
+/* from round 16 on, hf() expands the message schedule in place in a        */
+/* 16-word ring. The trailing backslash after the hf(15) round only         */
+/* splices the following empty line and has no effect.                      */
+static void sha1_compile(sha1_ctx ctx[1])
+{   uint32_t    *w = ctx->wbuf;
+
+    /* working copy of the five state words, as an array or as scalars */
+#ifdef ARRAY
+    uint32_t    v[5];
+    memcpy(v, ctx->hash, 5 * sizeof(uint32_t));
+#else
+    uint32_t    v0, v1, v2, v3, v4;
+    v0 = ctx->hash[0]; v1 = ctx->hash[1];
+    v2 = ctx->hash[2]; v3 = ctx->hash[3];
+    v4 = ctx->hash[4];
+#endif
+
+    /* rounds 0..15 take the message words as they are */
+#define hf(i)   w[i]
+
+    five_cycle(v, ch, 0x5a827999,  0);
+    five_cycle(v, ch, 0x5a827999,  5);
+    five_cycle(v, ch, 0x5a827999, 10);
+    one_cycle(v,0,1,2,3,4, ch, 0x5a827999, hf(15)); \
+
+    /* rounds 16..79 compute each schedule word on the fly and store it back */
+#undef  hf
+#define hf(i) (w[(i) & 15] = rotl32(                    \
+                 w[((i) + 13) & 15] ^ w[((i) + 8) & 15] \
+               ^ w[((i) +  2) & 15] ^ w[(i) & 15], 1))
+
+    one_cycle(v,4,0,1,2,3, ch, 0x5a827999, hf(16));
+    one_cycle(v,3,4,0,1,2, ch, 0x5a827999, hf(17));
+    one_cycle(v,2,3,4,0,1, ch, 0x5a827999, hf(18));
+    one_cycle(v,1,2,3,4,0, ch, 0x5a827999, hf(19));
+
+    five_cycle(v, parity, 0x6ed9eba1,  20);
+    five_cycle(v, parity, 0x6ed9eba1,  25);
+    five_cycle(v, parity, 0x6ed9eba1,  30);
+    five_cycle(v, parity, 0x6ed9eba1,  35);
+
+    five_cycle(v, maj, 0x8f1bbcdc,  40);
+    five_cycle(v, maj, 0x8f1bbcdc,  45);
+    five_cycle(v, maj, 0x8f1bbcdc,  50);
+    five_cycle(v, maj, 0x8f1bbcdc,  55);
+
+    five_cycle(v, parity, 0xca62c1d6,  60);
+    five_cycle(v, parity, 0xca62c1d6,  65);
+    five_cycle(v, parity, 0xca62c1d6,  70);
+    five_cycle(v, parity, 0xca62c1d6,  75);
+
+    /* fold the working variables back into the running hash */
+#ifdef ARRAY
+    ctx->hash[0] += v[0]; ctx->hash[1] += v[1];
+    ctx->hash[2] += v[2]; ctx->hash[3] += v[3];
+    ctx->hash[4] += v[4];
+#else
+    ctx->hash[0] += v0; ctx->hash[1] += v1;
+    ctx->hash[2] += v2; ctx->hash[3] += v3;
+    ctx->hash[4] += v4;
+#endif
+}
+
+/* Reset a SHA1 context: zero the byte counter and load the five initial   */
+/* hash words.                                                              */
+void sha1_begin(sha1_ctx ctx[1])
+{
+    static const uint32_t initial_hash[5] = {
+        0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476, 0xc3d2e1f0
+    };
+    int k;
+
+    ctx->count[0] = 0;
+    ctx->count[1] = 0;
+    for(k = 0; k < 5; ++k)
+        ctx->hash[k] = initial_hash[k];
+}
+
+/* SHA1 hash data in an array of bytes into hash buffer and */
+/* call the hash_compile function as required.              */
+/* data/len: bytes to absorb; ctx: context set up by        */
+/* sha1_begin(). May be called repeatedly; a partial block  */
+/* is kept in ctx->wbuf until enough data arrives.          */
+
+void sha1_hash(const unsigned char data[], unsigned long len, sha1_ctx ctx[1])
+{   uint32_t pos = (uint32_t)(ctx->count[0] & SHA1_MASK),
+            space = SHA1_BLOCK_SIZE - pos;
+    const unsigned char *sp = data;
+
+    /* 64-bit byte count kept in two words; carry into the high word on wrap */
+    if((ctx->count[0] += len) < len)
+        ++(ctx->count[1]);
+
+    while(len >= space)     /* transfer whole blocks if possible  */
+    {
+        memcpy(((unsigned char*)ctx->wbuf) + pos, sp, space);
+        sp += space; len -= space; space = SHA1_BLOCK_SIZE; pos = 0;
+        bsw_32(ctx->wbuf, SHA1_BLOCK_SIZE >> 2);
+        sha1_compile(ctx);
+    }
+
+    /* stash the remaining bytes (fewer than one block) for the next call */
+    memcpy(((unsigned char*)ctx->wbuf) + pos, sp, len);
+}
+
+/* SHA1 final padding and digest calculation  */
+/* hval receives SHA1_DIGEST_SIZE bytes; ctx holds the data  */
+/* absorbed so far and is modified by the final compression. */
+
+void sha1_end(unsigned char hval[], sha1_ctx ctx[1])
+{   uint32_t    i = (uint32_t)(ctx->count[0] & SHA1_MASK);
+
+    /* put bytes in the buffer in an order in which references to   */
+    /* 32-bit words will put bytes with lower addresses into the    */
+    /* top of 32 bit words on BOTH big and little endian machines   */
+    bsw_32(ctx->wbuf, (i + 3) >> 2);
+
+    /* we now need to mask valid bytes and add the padding which is */
+    /* a single 1 bit and as many zero bits as necessary. Note that */
+    /* we can always add the first padding byte here because the    */
+    /* buffer always has at least one empty slot                    */
+    ctx->wbuf[i >> 2] &= 0xffffff80 << 8 * (~i & 3);
+    ctx->wbuf[i >> 2] |= 0x00000080 << 8 * (~i & 3);
+
+    /* we need 9 or more empty positions, one for the padding byte  */
+    /* (above) and eight for the length count. If there is not      */
+    /* enough space, pad and empty the buffer                       */
+    if(i > SHA1_BLOCK_SIZE - 9)
+    {
+        if(i < 60) ctx->wbuf[15] = 0;
+        sha1_compile(ctx);
+        i = 0;
+    }
+    else    /* compute a word index for the empty buffer positions  */
+        i = (i >> 2) + 1;
+
+    while(i < 14) /* and zero pad all but last two positions        */
+        ctx->wbuf[i++] = 0;
+
+    /* the following 32-bit length fields are assembled in the      */
+    /* wrong byte order on little endian machines but this is       */
+    /* corrected later since they are only ever used as 32-bit      */
+    /* word values.                                                 */
+    ctx->wbuf[14] = (ctx->count[1] << 3) | (ctx->count[0] >> 29);
+    ctx->wbuf[15] = ctx->count[0] << 3;
+    sha1_compile(ctx);
+
+    /* extract the hash value as bytes in case the hash buffer is   */
+    /* misaligned for 32-bit words                                  */
+    for(i = 0; i < SHA1_DIGEST_SIZE; ++i)
+        hval[i] = (unsigned char)(ctx->hash[i >> 2] >> (8 * (~i & 3)));
+}
+
+/* One-shot helper: hash len bytes of data and store the                */
+/* SHA1_DIGEST_SIZE-byte digest in hval.                                */
+void sha1(unsigned char hval[], const unsigned char data[], unsigned long len)
+{
+    sha1_ctx    context[1];
+
+    sha1_begin(context);
+    sha1_hash(data, len, context);
+    sha1_end(hval, context);
+}
+
+#if defined(__cplusplus)
+}
+#endif
--- a/arm9/source/sha1.h
+++ b/arm9/source/sha1.h
@ -0,0 +1,67 @@
+/*
+ ---------------------------------------------------------------------------
+ Copyright (c) 2002, Dr Brian Gladman, Worcester, UK.   All rights reserved.
+
+ LICENSE TERMS
+
+ The free distribution and use of this software in both source and binary
+ form is allowed (with or without changes) provided that:
+
+   1. distributions of this source code include the above copyright
+      notice, this list of conditions and the following disclaimer;
+
+   2. distributions in binary form include the above copyright
+      notice, this list of conditions and the following disclaimer
+      in the documentation and/or other associated materials;
+
+   3. the copyright holder's name is not used to endorse products
+      built using this software without specific written permission.
+
+ ALTERNATIVELY, provided that this notice is retained in full, this product
+ may be distributed under the terms of the GNU General Public License (GPL),
+ in which case the provisions of the GPL apply INSTEAD OF those given above.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
+ Issue Date: 01/08/2005
+*/
+
+#ifndef _SHA1_H
+#define _SHA1_H
+
+#if defined(__cplusplus)
+extern "C"
+{
+#endif
+#if 0
+} /* Appeasing Emacs */
+#endif
+
+#include <stdint.h>
+
+/* Size of SHA1 digest */
+
+#define SHA1_DIGEST_SIZE 20
+
+/* type to hold the SHA1 context  */
+
+typedef struct
+{   uint32_t count[2];  /* bytes hashed so far: [0] low word, [1] high word */
+    uint32_t hash[5];   /* running hash state                               */
+    uint32_t wbuf[16];  /* one 64-byte block of pending input               */
+} sha1_ctx;
+
+/* Incremental interface: sha1_begin() resets the context, sha1_hash()  */
+/* absorbs len bytes (repeatable), sha1_end() pads and writes the       */
+/* SHA1_DIGEST_SIZE-byte digest to hval. sha1() does all three at once. */
+void sha1_begin(sha1_ctx ctx[1]);
+void sha1_hash(const unsigned char data[], unsigned long len, sha1_ctx ctx[1]);
+void sha1_end(unsigned char hval[], sha1_ctx ctx[1]);
+void sha1(unsigned char hval[], const unsigned char data[], unsigned long len);
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
--- a/arm9/source/types.h
+++ b/arm9/source/types.h
@ -0,0 +1,43 @@
+#ifndef __TYPES_H__
+#define __TYPES_H__
+
+#include <stdint.h>
+#include <stdbool.h>
+
+typedef uint8_t   u8;
+typedef uint16_t u16;
+typedef uint32_t u32;
+typedef uint64_t u64;
+
+typedef int8_t    s8;
+typedef int16_t  s16;
+typedef int32_t  s32;
+typedef int64_t  s64;
+
+
+// Option bits; every enumerator occupies its own bit, so values can be
+// combined with bitwise OR.
+// NOTE(review): no user of these flags is visible in this part of the
+// change - confirm they are still needed.
+enum flags
+{
+	ExtractFlag = (1<<0),
+	InfoFlag = (1<<1),
+	PlainFlag = (1<<2),
+	VerboseFlag = (1<<3),
+	VerifyFlag = (1<<4),
+	RawFlag = (1<<5),
+	ShowKeysFlag = (1<<6),
+	DecompressCodeFlag = (1<<7)
+};
+
+// Three-state result of a check: not performed yet, passed, or failed.
+enum validstate
+{
+	Unchecked = 0,
+	Good = 1,
+	Fail = 2,
+};
+
+// Byte counts of the binary size units: 0x400 = 1024 bytes,
+// 0x100000 = 1024 * 1024 bytes.
+enum sizeunits
+{
+	sizeKB = 0x400,
+	sizeMB = 0x100000,
+};
+
+#endif
--- a/arm9/source/utils.c
+++ b/arm9/source/utils.c
@ -0,0 +1,194 @@
+
+#include <stdio.h>
+#include <sys/statvfs.h>
+#include <nds.h>
+#include "utils.h"
+
+swiSHA1context_t sha1ctx;
+
+// Convert one hexadecimal digit (either case) to its value 0..15.
+// Returns -1 for any other character.
+static inline int htoi(char a){
+	if('0' <= a && a <= '9') return a - '0';
+	if('a' <= a && a <= 'f') return a - 'a' + 0xa;
+	if('A' <= a && a <= 'F') return a - 'A' + 0xa;
+	return -1;
+}
+
+// Decode the first 2 * byte_len hex characters of `in` into byte_len bytes
+// at `out`.
+// Returns 0 on success, -1 when `in` is shorter than 2 * byte_len characters
+// and -2 when a character pair contains a non-hex character; both failures
+// print a message. Bytes decoded before a bad pair remain written.
+int hex2bytes(uint8_t *out, unsigned byte_len, const char *in){
+	const unsigned needed = (unsigned)byte_len << 1;
+	if (strlen(in) < needed){
+		iprintf("%s: invalid input length, expecting %u, got %u.\n",
+			__FUNCTION__, needed, (unsigned)strlen(in));
+		return -1;
+	}
+	for(unsigned i = 0; i < byte_len; ++i){
+		const char hi_ch = in[2 * i];
+		const char lo_ch = in[2 * i + 1];
+		const int hi = htoi(hi_ch);
+		const int lo = htoi(lo_ch);
+		if(hi == -1 || lo == -1){
+			iprintf("%s: invalid input \"%c%c\"\n",
+				__FUNCTION__, hi_ch, lo_ch);
+			return -2;
+		}
+		out[i] = (hi << 4) + lo;
+	}
+	return 0;
+}
+
+// Scratch buffer returned by to_mebi(); overwritten on every call.
+static char str_buf[0x10];
+
+// Format a byte count as a number of MiB. Whole multiples of 1 MiB are
+// printed as an integer, anything else with two decimals.
+// Returns a pointer to a static buffer, so the result is only valid until
+// the next call.
+const char *to_mebi(size_t size) {
+	const int whole_mib = (size % (1024 * 1024)) == 0;
+	if (whole_mib) {
+		siprintf(str_buf, "%u", (unsigned)(size >> 20));
+	} else {
+		sprintf(str_buf, "%.2f", (float)(((double)size) / 1024 / 1024));
+	}
+	return str_buf;
+}
+
+// Write `size` bytes from `buffer` to `filename`, replacing any existing
+// file. When save_sha1 is non-zero, the SHA1 of the buffer is also computed
+// into the global sha1ctx and written next to the file by save_sha1_file().
+//
+// Returns 0 on success, -1 when the file cannot be opened and -2 when the
+// data could not be written completely. A failure to write the .sha1
+// companion file is not reported (unchanged best-effort behaviour).
+int save_file(const char *filename, const void *buffer, size_t size, int save_sha1) {
+	FILE *f = fopen(filename, "wb");
+	if (f == 0) {
+		//iprintf("failed to open %s to write\n", filename);
+		return -1;
+	}
+	size_t written = fwrite(buffer, 1, size, f);
+	// fclose() flushes buffered data, so a failure here means the file is
+	// incomplete even when fwrite() reported the full size.
+	int close_failed = (fclose(f) != 0);
+	if (written != size || close_failed) {
+		//iprintf("error writting %s\n", filename);
+		return -2;
+	}
+	if (save_sha1) {
+		sha1ctx.sha_block = 0;
+		swiSHA1Init(&sha1ctx);
+		swiSHA1Update(&sha1ctx, buffer, size);
+		save_sha1_file(filename);
+	}
+	return 0;
+}
+
+// Load a whole file into a newly allocated buffer.
+//
+// pbuf:        receives the buffer (memalign(align, size) when align is
+//              non-zero, malloc(size) otherwise), or 0 on any failure and
+//              for an empty file.
+// psize:       receives the file size in bytes (0 when it cannot be
+//              determined).
+// verify_sha1: not implemented; ignored.
+//
+// Returns 0 on success, 1 for an empty file, -1 when the file cannot be
+// opened, its size cannot be determined or the allocation fails, and -2
+// when reading fails. *pbuf and *psize are untouched when the open fails.
+int load_file(void **pbuf, size_t *psize, const char *filename, int verify_sha1, int align) {
+	FILE *f = fopen(filename, "rb");
+	if (f == 0) {
+		//iprintf("failed to open %s to read\n", filename);
+		return -1;
+	}
+	(void)verify_sha1; //TODO: verification is not implemented
+
+	int ret;
+	// ftell() reports errors as -1; storing that in a size_t would turn it
+	// into a huge bogus size, so check before converting.
+	long end = -1;
+	if (fseek(f, 0, SEEK_END) == 0) {
+		end = ftell(f);
+	}
+	if (end < 0) {
+		*psize = 0;
+		*pbuf = 0;
+		ret = -1;
+	} else if (end == 0) {
+		*psize = 0;
+		*pbuf = 0;
+		ret = 1;
+	} else {
+		*psize = (size_t)end;
+		if (align) {
+			*pbuf = memalign(align, *psize);
+		} else {
+			*pbuf = malloc(*psize);
+		}
+		if (*pbuf == 0) {
+			//printf("failed to alloc memory\n");
+			ret = -1;
+		} else if (fseek(f, 0, SEEK_SET) != 0
+			|| fread(*pbuf, 1, *psize, f) != *psize) {
+			//iprintf("error reading %s\n", filename);
+			free(*pbuf);
+			*pbuf = 0;
+			ret = -2;
+		} else {
+			ret = 0;
+		}
+	}
+	fclose(f);
+	return ret;
+}
+
+// Read exactly `size` bytes starting at byte `offset` of `filename` into
+// `buf`. No seek is issued when offset is 0.
+// Returns 0 on success and -1 when the file cannot be opened, the seek
+// fails or fewer than `size` bytes could be read.
+int load_block_from_file(void *buf, const char *filename, unsigned offset, unsigned size) {
+	FILE *f = fopen(filename, "rb");
+	if (f == 0) {
+		//iprintf("failed to open %s\n", filename);
+		return -1;
+	}
+	// short-circuit order matters: the read is only attempted after a
+	// successful (or skipped) seek
+	const int failed = (offset != 0 && fseek(f, offset, SEEK_SET) != 0)
+		|| fread(buf, 1, size, f) != size;
+	fclose(f);
+	return failed ? -1 : 0;
+}
+
+// you should have updated the sha1 context before calling save_sha1_file
+// example: save_file() in this file and backup() in nand.c
+//
+// Finalises the global sha1ctx and writes "<filename>.sha1" containing the
+// digest as 40 upper-case hex digits, a space, an asterisk, the file name
+// and a new line.
+// Returns the result of save_file() for the .sha1 file, or -1 when memory
+// for the name or the contents cannot be allocated (in which case sha1ctx
+// is left unfinalised).
+int save_sha1_file(const char *filename) {
+	size_t len_fn = strlen(filename);
+	// 20 bytes each use 2 chars, space, asterisk, filename, new line
+	size_t len_buf = 2 * 20 + 1 + 1 + len_fn + 1;
+	char *sha1_fn = (char *)malloc(len_fn + 6); // ".sha1" plus \0
+	char *sha1_buf = (char *)malloc(len_buf + 1); // extra for \0
+	int ret = -1;
+	if (sha1_fn != 0 && sha1_buf != 0) {
+		// unsigned bytes: with a signed char, "%02X" would print values
+		// above 0x7f as eight digits and overrun sha1_buf
+		u8 digest[20];
+		swiSHA1Final(digest, &sha1ctx);
+		siprintf(sha1_fn, "%s.sha1", filename);
+		char *p = sha1_buf;
+		for (int i = 0; i < 20; ++i) {
+			p += siprintf(p, "%02X", digest[i]);
+		}
+		siprintf(p, " *%s\n", filename);
+		ret = save_file(sha1_fn, sha1_buf, len_buf, false);
+	}
+	free(sha1_fn);
+	free(sha1_buf);
+	return ret;
+}
+
+// Print `len` bytes starting at `buf` as lower-case hex, two digits per
+// byte, with no separators and no trailing new line.
+void print_bytes(const void *buf, size_t len) {
+	const unsigned char *bytes = (const unsigned char *)buf;
+	size_t i = 0;
+	while (i < len) {
+		iprintf("%02x", bytes[i]);
+		++i;
+	}
+}
+
+// Copy the 7-bit code units (below 0x80) of a UTF-16 string to `out`;
+// all other units are dropped. Conversion stops at the first zero unit,
+// which is written to `out` as the terminator, or after `len` units.
+// When no zero unit is found within `len` units, `out` is NOT terminated.
+// out must be big enough
+// can work in place
+void utf16_to_ascii(uint8_t *out, const uint16_t *in, unsigned len) {
+	for (unsigned i = 0; i < len; ++i) {
+		const uint16_t unit = in[i];
+		if (unit == 0) {
+			*out = 0;
+			return;
+		}
+		if (unit < 0x80) {
+			*out++ = (uint8_t)unit;
+		}
+	}
+}
+
+// Report the free space of the file system containing `path`, computed as
+// f_bsize * f_bfree from statvfs().
+// Returns 0 when statvfs() fails. `verbose` is currently unused because
+// the diagnostics are disabled.
+// NOTE(review): the product is computed in size_t and may wrap where
+// size_t is 32 bits and more than 4 GiB is free - confirm callers cope.
+size_t df(const char *path, int verbose) {
+	// it's amazing libfat even got this to work
+	struct statvfs s;
+	// on failure `s` is not filled in, so it must not be read
+	if (statvfs(path, &s) != 0) {
+		return 0;
+	}
+	size_t free_bytes = s.f_bsize * s.f_bfree;
+	if (verbose) {
+		//iprintf("%s", to_mebi(free_bytes));
+		//iprintf("/%s MB (free/total)\n", to_mebi(s.f_bsize * s.f_blocks));
+	}
+	return free_bytes;
+}
+
--- a/arm9/source/utils.h
+++ b/arm9/source/utils.h
@ -0,0 +1,23 @@
+
+#pragma once
+
+#include <nds.h>
+#include <stdint.h>
+
+// Decode 2 * byte_len hex characters from `in` into `out`.
+// Returns 0 on success, -1 when `in` is too short, -2 on a non-hex character.
+int hex2bytes(uint8_t *out, unsigned byte_len, const char *in);
+
+// Format a byte count in MiB. The result lives in a static buffer that the
+// next call overwrites.
+const char * to_mebi(size_t size);
+
+// Write a buffer to a file, optionally followed by a "<filename>.sha1" file.
+// Returns 0 on success, -1 when the file cannot be opened, -2 when the data
+// could not be written completely.
+int save_file(const char *filename, const void *buffer, size_t size, int save_sha1);
+
+// Load a whole file into a buffer allocated with memalign() (align != 0) or
+// malloc(). Returns 0 on success, 1 for an empty file, -1 on open or
+// allocation failure, -2 on read failure. verify_sha1 is not implemented.
+int load_file(void **pbuf, size_t *psize, const char *filename, int verify_sha1, int align);
+
+// Read `size` bytes at byte `offset` of a file into `buf`.
+// Returns 0 on success, -1 on any failure.
+int load_block_from_file(void *buf, const char *filename, unsigned offset, unsigned size);
+
+// Finalise the global SHA1 context and write the digest line to
+// "<filename>.sha1". The context must have been updated beforehand.
+int save_sha1_file(const char *filename);
+
+// Print `len` bytes as lower-case hex without separators.
+void print_bytes(const void *buf, size_t len);
+
+// Copy the code units below 0x80 of a UTF-16 string to `out`, stopping at
+// the first zero unit or after `len` units.
+void utf16_to_ascii(uint8_t *out, const uint16_t *in, unsigned len);
+
+// Free space of the file system containing `path`, from statvfs().
+size_t df(const char *path, int verbose);