bfCL/ocl_brute.c


#include <stdio.h>
#include "utils.h"
#include "crypto.h"
#include "ocl.h"
#include "dsi.h"
#include "ocl_brute.h"

// 123 -> 0x123
static cl_ulong to_bcd(cl_ulong i) {
	cl_ulong o = 0;
	unsigned shift = 0;
	while (i) {
		o |= (i % 10) << (shift++ << 2);
		i /= 10;
	}
	return o;
}

// 0x123 -> 123
static cl_ulong from_bcd(cl_ulong i) {
	cl_ulong o = 0;
	cl_ulong pow = 1;
	while (i) {
		o += (i & 0xf) * pow;
		i >>= 4;
		pow *= 10;
	}
	return o;
}

int ocl_brute_console_id(const cl_uchar *console_id, const cl_uchar *emmc_cid,
	cl_uint offset0, const cl_uchar *src0, const cl_uchar *ver0,
	cl_uint offset1, const cl_uchar *src1, const cl_uchar *ver1,
	ocl_brute_mode mode)
{
	// preparing args
	cl_ulong console_id_template = u64be(console_id);
	cl_ulong xor0[2] = { 0 }, xor1[2] = { 0 };
	dsi_make_xor((u8*)xor0, src0, ver0);
	if (src1 != 0) {
		dsi_make_xor((u8*)xor1, src1, ver1);
	}
	cl_uint ctr[4] = { 0 };
	if (emmc_cid != 0) {
		dsi_make_ctr((u8*)ctr, emmc_cid, offset0);
	}
	cl_ulong out = 0;
#if DEBUG
	{
		printf("XOR     : %s\n", hexdump(xor0, 16, 0));
		u8 aes_key[16];
		dsi_make_key(aes_key, u64be(console_id));
		printf("AES KEY : %s\n", hexdump(aes_key, 16, 0));
		aes_init();
		aes_set_key_enc_128(aes_key);
		printf("CTR     : %s\n", hexdump(ctr, 16, 0));
		aes_encrypt_128((u8*)ctr, (u8*)xor0);
		printf("XOR TRY : %s\n", hexdump(xor0, 16, 0));
		// exit(1);
	}
#endif

	TimeHP t0, t1; long long td = 0;

	cl_int err;
	cl_platform_id platform_id;
	cl_device_id device_id;
	ocl_get_device(&platform_id, &device_id);
	if (platform_id == NULL || device_id == NULL) {
		return -1;
	}

	cl_context context = OCL_ASSERT2(clCreateContext(0, 1, &device_id, NULL, NULL, &err));
	cl_command_queue command_queue = OCL_ASSERT2(clCreateCommandQueue(context, device_id, 0, &err));

	const char *source_names[] = {
		"cl/common.h",
		"cl/aes_tables.cl",
		"cl/aes_128.cl",
		"cl/dsi.h",
		"cl/bcd.h",
		emmc_cid != 0 ? "cl/kernel_console_id.cl" : "cl/kernel_console_id_ds3.cl" };
	const char * options;
	if (mode == NORMAL) {
		options = "-w -Werror";
	} else if (mode == BCD) {
		options = "-w -Werror -DBCD";
	} else if (mode == CTR) {
		options = "-w -Werror -DCTR";
	} else {
		return -1;
	}
	cl_program program = ocl_build_from_sources(sizeof(source_names) / sizeof(char *),
		source_names, context, device_id, options);

	cl_kernel kernel = OCL_ASSERT2(clCreateKernel(program, emmc_cid != 0 ? "test_console_id" : "test_console_id_ds3", &err));

	size_t local;
	OCL_ASSERT(clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL));
	printf("local work size: %u\n", (unsigned)local);

	// there's no option to create it zero initialized
	cl_mem mem_out = OCL_ASSERT2(clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_ulong), NULL, &err));
	OCL_ASSERT(clEnqueueWriteBuffer(command_queue, mem_out, CL_TRUE, 0, sizeof(cl_ulong), &out, 0, NULL, NULL));

	unsigned template_bits;
	cl_ulong total;
	unsigned group_bits;
	size_t num_items;
	unsigned loops;
	if (mode == NORMAL || mode == CTR) {
		// CTR mode is actually quite similar to NORMAL except we brute the lower 31 bits
		// since bit 31 doesn't matter, got ORed with 1 when making key
		template_bits = 32;
		total = 1ull << (64 - template_bits);
		group_bits = 28;
		num_items = 1ull << group_bits; // 268435456
		loops = 1 << (64 - template_bits - group_bits);
		if (mode == CTR) {
			loops >>= 1;
		}
	} else {
		// 0x08a15????????1?? as example, 10 digit to brute, beware the known digit near the end
		// 5 BCD digits, used to calculate template mask
		template_bits = 20;
		// I wish we could use 1e10 in C, counting 0 is not good to your eye
		total = from_bcd(1ull << 40);
		// work items variations on lower bits per enqueue, 8 + 1 digits, including the known digit
		// reduced from 36 to 28 to make nvidia runtime happy
		group_bits = 28;
		// work items per enqueue, don't count the known digit here
		num_items = from_bcd(1ull << (group_bits - 4));
		// between the template bits and group bits, it's the loop bits
		loops = from_bcd(1 << (64 - template_bits - group_bits));
	}
	// it needs to be aligned, a little overhead hurts nobody
	if (num_items % local) {
		// printf("alignment: %"LL"d -> ", num_items);
		num_items = (num_items / local + 1) * local;
		// printf("%"LL"d\n", num_items);
	}
	console_id_template &= ((cl_ulong)-1ll) << (64 - template_bits);
	// printf("total: %I64u, 0x%I64x\n", total, total);
	// printf("num_items: %I64u, 0x%I64x\n", num_items, num_items);
	// printf("steps: %u, 0x%x\n", steps, steps);
	OCL_ASSERT(clSetKernelArg(kernel, 7, sizeof(cl_mem), &mem_out));
	if (emmc_cid != 0) {
		OCL_ASSERT(clSetKernelArg(kernel, 1, sizeof(cl_uint), &ctr[0]));
		OCL_ASSERT(clSetKernelArg(kernel, 2, sizeof(cl_uint), &ctr[1]));
		OCL_ASSERT(clSetKernelArg(kernel, 3, sizeof(cl_uint), &ctr[2]));
		OCL_ASSERT(clSetKernelArg(kernel, 4, sizeof(cl_uint), &ctr[3]));
		OCL_ASSERT(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &xor0[0]));
		OCL_ASSERT(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &xor0[1]));
	} else {
		OCL_ASSERT(clSetKernelArg(kernel, 1, sizeof(cl_uint), &offset0));
		OCL_ASSERT(clSetKernelArg(kernel, 2, sizeof(cl_ulong), &xor0[0]));
		OCL_ASSERT(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &xor0[1]));
		OCL_ASSERT(clSetKernelArg(kernel, 4, sizeof(cl_uint), &offset1));
		OCL_ASSERT(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &xor1[0]));
		OCL_ASSERT(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &xor1[1]));
	}
	get_hp_time(&t0);
	for (unsigned i = 0; i < loops; ++i) {
		cl_ulong console_id = console_id_template;
		if (mode == BCD) {
			console_id |= to_bcd(i) << group_bits;
		} else {
			console_id |= (u64)i << group_bits;
		}
		printf("%016"LL"x\n", console_id);
		OCL_ASSERT(clSetKernelArg(kernel, 0, sizeof(cl_ulong), &console_id));

		OCL_ASSERT(clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &num_items, &local, 0, NULL, NULL));
		clFinish(command_queue);

		OCL_ASSERT(clEnqueueReadBuffer(command_queue, mem_out, CL_TRUE, 0, sizeof(cl_ulong), &out, 0, NULL, NULL));
		if (out) {
			get_hp_time(&t1); td = hp_time_diff(&t0, &t1);
			printf("got a hit: %016"LL"x\n", out);
			// also write to a file
			dump_to_file(emmc_cid ? hexdump(emmc_cid, 16, 0) : hexdump(src0, 16, 0), &out, 8);
			break;
		}
	}

	u64 tested = 0;
	if (!out) {
		tested = total;
		get_hp_time(&t1); td = hp_time_diff(&t0, &t1);
	} else {
		tested = out - console_id_template;
		if (mode == BCD) {
			tested = from_bcd(((tested & ~0xfff) >> 4) | (tested & 0xff));
		}
	}
	printf("%.2f seconds, %.2f M/s\n", td / 1000000.0, tested * 1.0 / td);

	clReleaseKernel(kernel);
	clReleaseMemObject(mem_out);
	clReleaseProgram(program);
	clReleaseCommandQueue(command_queue);
	clReleaseContext(context);
	return !out;
}

int ocl_brute_emmc_cid(const cl_uchar *console_id, cl_uchar *emmc_cid,
	cl_uint offset, const cl_uchar *src, const cl_uchar *ver)
{
	// preparing args
	u8 aes_key[16];
	dsi_make_key(aes_key, u64be(console_id));
	aes_init();
	aes_set_key_dec_128(aes_key);
	cl_ulong xor[2];
	dsi_make_xor((u8*)xor, src, ver);
	cl_ulong ctr[2];
	aes_decrypt_128((u8*)xor , (u8*)ctr);
	cl_ulong emmc_cid_sha1_16[2];
	byte_reverse_16((u8*)emmc_cid_sha1_16, (u8*)ctr);
	sub_128_64(emmc_cid_sha1_16, offset);
	cl_ulong out = 0;
#ifdef DEBUG
	{
		printf("SHA1 A: %s\n", hexdump(emmc_cid_sha1_16, 16, 0));
		u8 sha1_verify[16];
		sha1_16(emmc_cid, sha1_verify);
		printf("SHA1 B: %s\n", hexdump(sha1_verify, 16, 0));
	}
#endif

	TimeHP t0, t1; long long td = 0;

	cl_int err;
	cl_platform_id platform_id;
	cl_device_id device_id;
	ocl_get_device(&platform_id, &device_id);
	if (platform_id == NULL || device_id == NULL) {
		return -1;
	}

	cl_context context = OCL_ASSERT2(clCreateContext(0, 1, &device_id, NULL, NULL, &err));
	cl_command_queue command_queue = OCL_ASSERT2(clCreateCommandQueue(context, device_id, 0, &err));

	const char *source_names[] = {
		"cl/common.h",
		"cl/sha1_16.cl",
		"cl/kernel_emmc_cid.cl" };
	cl_program program = ocl_build_from_sources(sizeof(source_names) / sizeof(char *),
		source_names, context, device_id, "-w -Werror");

	cl_kernel kernel = OCL_ASSERT2(clCreateKernel(program, "test_emmc_cid", &err));

	size_t local;
	OCL_ASSERT(clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL));
	printf("local work size: %u\n", (unsigned)local);

	// there's no option to create it zero initialized
	cl_mem mem_out = OCL_ASSERT2(clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_ulong), NULL, &err));
	OCL_ASSERT(clEnqueueWriteBuffer(command_queue, mem_out, CL_TRUE, 0, sizeof(cl_ulong), &out, 0, NULL, NULL));

	unsigned brute_bits = 32;
	unsigned group_bits = 28;
	unsigned loop_bits = brute_bits - group_bits;
	unsigned loops = 1ull << loop_bits;
	size_t num_items = 1ull << group_bits;
	// it needs to be aligned, a little overhead hurts nobody
	if (num_items % local) {
		num_items = (num_items / local + 1) * local;
	}
	OCL_ASSERT(clSetKernelArg(kernel, 2, sizeof(cl_ulong), emmc_cid_sha1_16));
	OCL_ASSERT(clSetKernelArg(kernel, 3, sizeof(cl_ulong), emmc_cid_sha1_16 + 1));
	OCL_ASSERT(clSetKernelArg(kernel, 4, sizeof(cl_mem), &mem_out));
	get_hp_time(&t0);
	for (unsigned i = 0; i < loops; ++i) {
		*(u32*)(emmc_cid + 1) = i << group_bits;
		puts(hexdump(emmc_cid, 16, 0));
		OCL_ASSERT(clSetKernelArg(kernel, 0, sizeof(cl_ulong), emmc_cid));
		OCL_ASSERT(clSetKernelArg(kernel, 1, sizeof(cl_ulong), emmc_cid + 8));

		OCL_ASSERT(clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &num_items, &local, 0, NULL, NULL));
		clFinish(command_queue);

		OCL_ASSERT(clEnqueueReadBuffer(command_queue, mem_out, CL_TRUE, 0, sizeof(cl_ulong), &out, 0, NULL, NULL));
		if (out) {
			get_hp_time(&t1); td = hp_time_diff(&t0, &t1);
			*(u32*)(emmc_cid + 1) |= out;
			printf("got a hit: %s\n", hexdump(emmc_cid, 16, 0));
			// also write to a file
			dump_to_file(hexdump(console_id, 8, 0), emmc_cid, 16);
			break;
		}
	}

	u64 tested = 0;
	if (!out) {
		tested = 1ull << 32;
		get_hp_time(&t1); td = hp_time_diff(&t0, &t1);
	} else {
		tested = *(u32*)(emmc_cid + 1);
	}
	printf("%.2f seconds, %.2f M/s\n", td / 1000000.0, tested * 1.0 / td);

	clReleaseKernel(kernel);
	clReleaseMemObject(mem_out);
	clReleaseProgram(program);
	clReleaseCommandQueue(command_queue);
	clReleaseContext(context);
	return !out;
}

/* more info:
 * https://zoogie.github.io/web/34%E2%85%95c3/#/22
 * https://gbatemp.net/threads/eol-is-lol-the-34c3-talk-for-3ds-that-never-was.494698/
 * what I'm doing here is simply brute the 3rd u32 of a u128 so that the first half of sha256 matches ver
 */
int ocl_brute_msky(const cl_uint *msky, const cl_uint *ver)
{
	TimeHP t0, t1; long long td = 0;

	cl_int err;
	cl_platform_id platform_id;
	cl_device_id device_id;
	ocl_get_device(&platform_id, &device_id);
	if (platform_id == NULL || device_id == NULL) {
		return -1;
	}

	cl_context context = OCL_ASSERT2(clCreateContext(0, 1, &device_id, NULL, NULL, &err));
	cl_command_queue command_queue = OCL_ASSERT2(clCreateCommandQueue(context, device_id, 0, &err));

	const char *source_names[] = {
		"cl/common.h",
		"cl/sha256_16.cl",
		"cl/kernel_msky.cl" };
	cl_program program = ocl_build_from_sources(sizeof(source_names) / sizeof(char *),
		source_names, context, device_id, "-w -Werror -DSHA256_16");

	cl_kernel kernel = OCL_ASSERT2(clCreateKernel(program, "test_msky", &err));

	size_t local;
	OCL_ASSERT(clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL));
	printf("local work size: %u\n", (unsigned)local);

	// there's no option to create it zero initialized
	cl_uint out = 0;
	cl_mem mem_out = OCL_ASSERT2(clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uint), NULL, &err));
	OCL_ASSERT(clEnqueueWriteBuffer(command_queue, mem_out, CL_TRUE, 0, sizeof(cl_uint), &out, 0, NULL, NULL));

	unsigned brute_bits = 32;
	unsigned group_bits = 20;
	unsigned loop_bits = brute_bits - group_bits;
	unsigned loops = 1ull << loop_bits;
	size_t num_items = 1ull << group_bits;
	// it needs to be aligned, a little overhead hurts nobody
	if (num_items % local) {
		num_items = (num_items / local + 1) * local;
	}
	OCL_ASSERT(clSetKernelArg(kernel, 0, sizeof(cl_uint), &msky[0]));
	OCL_ASSERT(clSetKernelArg(kernel, 1, sizeof(cl_uint), &msky[1]));
	OCL_ASSERT(clSetKernelArg(kernel, 4, sizeof(cl_uint), &ver[0]));
	OCL_ASSERT(clSetKernelArg(kernel, 5, sizeof(cl_uint), &ver[1]));
	OCL_ASSERT(clSetKernelArg(kernel, 6, sizeof(cl_uint), &ver[2]));
	OCL_ASSERT(clSetKernelArg(kernel, 7, sizeof(cl_uint), &ver[3]));
	OCL_ASSERT(clSetKernelArg(kernel, 8, sizeof(cl_mem), &mem_out));
	get_hp_time(&t0);
	int msky3_range = 16384; // "fan out" +/-8192 on msky3
	unsigned i, j;
	for (j = 0; j < msky3_range; ++j) {
		cl_uint msky3 = msky[3] + (j & 1 ? 1 : -1) * ((j + 1) >> 1);
		printf("%08x\r", msky3);
		OCL_ASSERT(clSetKernelArg(kernel, 3, sizeof(cl_uint), &msky3));
		for (i = 0; i < loops; ++i) {
			cl_uint msky2 = i << group_bits;
			OCL_ASSERT(clSetKernelArg(kernel, 2, sizeof(cl_uint), &msky2));

			OCL_ASSERT(clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &num_items, &local, 0, NULL, NULL));
			clFinish(command_queue);

			OCL_ASSERT(clEnqueueReadBuffer(command_queue, mem_out, CL_TRUE, 0, sizeof(cl_uint), &out, 0, NULL, NULL));
			if (out) {
				get_hp_time(&t1); td = hp_time_diff(&t0, &t1);
				cl_uint msky_ret[4] = { msky[0], msky[1], out, msky3 };
				printf("got a hit: %s\n", hexdump(msky_ret, 16, 0));
				break;
			}
		}
		if (out) {
			break;
		}
	}
	u64 tested = 0;
	if (!out) {
		tested = (1ull << brute_bits) * msky3_range;
		get_hp_time(&t1); td = hp_time_diff(&t0, &t1);
	} else {
		tested = out + (1ull << brute_bits) * j;
	}
	printf("%.2f seconds, %.2f M/s\n", td / 1000000.0, tested * 1.0 / td);

	clReleaseKernel(kernel);
	clReleaseMemObject(mem_out);
	clReleaseProgram(program);
	clReleaseCommandQueue(command_queue);
	clReleaseContext(context);
	return !out;
}

// LFCS brute force, https://gist.github.com/zoogie/4046726878dba89eddfa1fc07c8a27da
int ocl_brute_lfcs(cl_uint lfcs_template, cl_ushort newflag, const cl_uint *ver)
{
	TimeHP t0, t1; long long td = 0;

	cl_int err;
	cl_platform_id platform_id;
	cl_device_id device_id;
	ocl_get_device(&platform_id, &device_id);
	if (platform_id == NULL || device_id == NULL) {
		return -1;
	}

	cl_context context = OCL_ASSERT2(clCreateContext(0, 1, &device_id, NULL, NULL, &err));
	cl_command_queue command_queue = OCL_ASSERT2(clCreateCommandQueue(context, device_id, 0, &err));

	const char *source_names[] = {
		"cl/common.h",
		"cl/sha256_16.cl",
		"cl/kernel_lfcs.cl" };
	cl_program program = ocl_build_from_sources(sizeof(source_names) / sizeof(char *),
		source_names, context, device_id, "-w -Werror -DSHA256_12");

	cl_kernel kernel = OCL_ASSERT2(clCreateKernel(program, "test_lfcs", &err));

	size_t local;
	OCL_ASSERT(clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL));
	printf("local work size: %u\n", (unsigned)local);

	// there's no option to create it zero initialized
	cl_uint out = 0;
	cl_mem mem_out = OCL_ASSERT2(clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uint), NULL, &err));
	OCL_ASSERT(clEnqueueWriteBuffer(command_queue, mem_out, CL_TRUE, 0, sizeof(cl_uint), &out, 0, NULL, NULL));

	unsigned brute_bits = 32;
	unsigned group_bits = 20;
	unsigned loop_bits = brute_bits - group_bits;
	unsigned loops = 1ull << loop_bits;
	size_t num_items = 1ull << group_bits;
	// it needs to be aligned, a little overhead hurts nobody
	if (num_items % local) {
		num_items = (num_items / local + 1) * local;
	}
	OCL_ASSERT(clSetKernelArg(kernel, 1, sizeof(cl_ushort), &newflag));
	OCL_ASSERT(clSetKernelArg(kernel, 2, sizeof(cl_uint), &ver[0]));
	OCL_ASSERT(clSetKernelArg(kernel, 3, sizeof(cl_uint), &ver[1]));
	OCL_ASSERT(clSetKernelArg(kernel, 4, sizeof(cl_mem), &mem_out));
	get_hp_time(&t0);
	int fan_range = 0x10000; // "fan out" full 16 bits
	unsigned i, j;
	for (j = 0; j < fan_range; ++j) {
		int fan = (j & 1 ? 1 : -1) * ((j + 1) >> 1);
		printf("%d \r", fan);
		for (i = 0; i < loops; ++i) {
			cl_uint lfcs = lfcs_template + fan * 0x10000 + (i << (group_bits - 16));
			OCL_ASSERT(clSetKernelArg(kernel, 0, sizeof(cl_uint), &lfcs));

			OCL_ASSERT(clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &num_items, &local, 0, NULL, NULL));
			clFinish(command_queue);

			OCL_ASSERT(clEnqueueReadBuffer(command_queue, mem_out, CL_TRUE, 0, sizeof(cl_uint), &out, 0, NULL, NULL));
			if (out) {
				get_hp_time(&t1); td = hp_time_diff(&t0, &t1);
				lfcs += out >> 16;
				printf("got a hit: %s (rand: 0x%04x)\n", hexdump(&lfcs, 4, 0), out & 0xffff);
				break;
			}
		}
		if (out) {
			break;
		}
	}
	u64 tested = 0;
	if (!out) {
		tested = (1ull << brute_bits) * fan_range;
		get_hp_time(&t1); td = hp_time_diff(&t0, &t1);
	} else {
		tested = out + (1ull << brute_bits) * j;
	}
	printf("%.2f seconds, %.2f M/s\n", td / 1000000.0, tested * 1.0 / td);

	clReleaseKernel(kernel);
	clReleaseMemObject(mem_out);
	clReleaseProgram(program);
	clReleaseCommandQueue(command_queue);
	clReleaseContext(context);
	return !out;
}