bfCL/ocl_brute.c
JimmyZ 687acab0c8 reduced work size to make nvidia runtime happy
give nvidia GPU more weight
2018-05-29 10:39:26 +08:00

498 lines
17 KiB
C

#include <stdio.h>
#include "utils.h"
#include "crypto.h"
#include "ocl.h"
#include "dsi.h"
#include "ocl_brute.h"
// 123 -> 0x123
static cl_ulong to_bcd(cl_ulong i) {
cl_ulong o = 0;
unsigned shift = 0;
while (i) {
o |= (i % 10) << (shift++ << 2);
i /= 10;
}
return o;
}
// 0x123 -> 123
static cl_ulong from_bcd(cl_ulong i) {
cl_ulong o = 0;
cl_ulong pow = 1;
while (i) {
o += (i & 0xf) * pow;
i >>= 4;
pow *= 10;
}
return o;
}
int ocl_brute_console_id(const cl_uchar *console_id, const cl_uchar *emmc_cid,
cl_uint offset0, const cl_uchar *src0, const cl_uchar *ver0,
cl_uint offset1, const cl_uchar *src1, const cl_uchar *ver1,
ocl_brute_mode mode)
{
// preparing args
cl_ulong console_id_template = u64be(console_id);
cl_ulong xor0[2] = { 0 }, xor1[2] = { 0 };
dsi_make_xor((u8*)xor0, src0, ver0);
if (src1 != 0) {
dsi_make_xor((u8*)xor1, src1, ver1);
}
cl_uint ctr[4] = { 0 };
if (emmc_cid != 0) {
dsi_make_ctr((u8*)ctr, emmc_cid, offset0);
}
cl_ulong out = 0;
#if DEBUG
{
printf("XOR : %s\n", hexdump(xor0, 16, 0));
u8 aes_key[16];
dsi_make_key(aes_key, u64be(console_id));
printf("AES KEY : %s\n", hexdump(aes_key, 16, 0));
aes_init();
aes_set_key_enc_128(aes_key);
printf("CTR : %s\n", hexdump(ctr, 16, 0));
aes_encrypt_128((u8*)ctr, (u8*)xor0);
printf("XOR TRY : %s\n", hexdump(xor0, 16, 0));
// exit(1);
}
#endif
TimeHP t0, t1; long long td = 0;
cl_int err;
cl_platform_id platform_id;
cl_device_id device_id;
ocl_get_device(&platform_id, &device_id);
if (platform_id == NULL || device_id == NULL) {
return -1;
}
cl_context context = OCL_ASSERT2(clCreateContext(0, 1, &device_id, NULL, NULL, &err));
cl_command_queue command_queue = OCL_ASSERT2(clCreateCommandQueue(context, device_id, 0, &err));
const char *source_names[] = {
"cl/common.h",
"cl/aes_tables.cl",
"cl/aes_128.cl",
"cl/dsi.h",
"cl/bcd.h",
emmc_cid != 0 ? "cl/kernel_console_id.cl" : "cl/kernel_console_id_ds3.cl" };
const char * options;
if (mode == NORMAL) {
options = "-w -Werror";
} else if (mode == BCD) {
options = "-w -Werror -DBCD";
} else if (mode == CTR) {
options = "-w -Werror -DCTR";
} else {
return -1;
}
cl_program program = ocl_build_from_sources(sizeof(source_names) / sizeof(char *),
source_names, context, device_id, options);
cl_kernel kernel = OCL_ASSERT2(clCreateKernel(program, emmc_cid != 0 ? "test_console_id" : "test_console_id_ds3", &err));
size_t local;
OCL_ASSERT(clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL));
printf("local work size: %u\n", (unsigned)local);
// there's no option to create it zero initialized
cl_mem mem_out = OCL_ASSERT2(clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_ulong), NULL, &err));
OCL_ASSERT(clEnqueueWriteBuffer(command_queue, mem_out, CL_TRUE, 0, sizeof(cl_ulong), &out, 0, NULL, NULL));
unsigned template_bits;
cl_ulong total;
unsigned group_bits;
size_t num_items;
unsigned loops;
if (mode == NORMAL || mode == CTR) {
// CTR mode is actually quite similar to NORMAL except we brute the lower 31 bits
// since bit 31 doesn't matter, got ORed with 1 when making key
template_bits = 32;
total = 1ull << (64 - template_bits);
group_bits = 28;
num_items = 1ull << group_bits; // 268435456
loops = 1 << (64 - template_bits - group_bits);
if (mode == CTR) {
loops >>= 1;
}
} else {
// 0x08a15????????1?? as example, 10 digit to brute, beware the known digit near the end
// 5 BCD digits, used to calculate template mask
template_bits = 20;
// I wish we could use 1e10 in C, counting 0 is not good to your eye
total = from_bcd(1ull << 40);
// work items variations on lower bits per enqueue, 8 + 1 digits, including the known digit
// reduced from 36 to 28 to make nvidia runtime happy
group_bits = 28;
// work items per enqueue, don't count the known digit here
num_items = from_bcd(1ull << (group_bits - 4));
// between the template bits and group bits, it's the loop bits
loops = from_bcd(1 << (64 - template_bits - group_bits));
}
// it needs to be aligned, a little overhead hurts nobody
if (num_items % local) {
// printf("alignment: %"LL"d -> ", num_items);
num_items = (num_items / local + 1) * local;
// printf("%"LL"d\n", num_items);
}
console_id_template &= ((cl_ulong)-1ll) << (64 - template_bits);
// printf("total: %I64u, 0x%I64x\n", total, total);
// printf("num_items: %I64u, 0x%I64x\n", num_items, num_items);
// printf("steps: %u, 0x%x\n", steps, steps);
OCL_ASSERT(clSetKernelArg(kernel, 7, sizeof(cl_mem), &mem_out));
if (emmc_cid != 0) {
OCL_ASSERT(clSetKernelArg(kernel, 1, sizeof(cl_uint), &ctr[0]));
OCL_ASSERT(clSetKernelArg(kernel, 2, sizeof(cl_uint), &ctr[1]));
OCL_ASSERT(clSetKernelArg(kernel, 3, sizeof(cl_uint), &ctr[2]));
OCL_ASSERT(clSetKernelArg(kernel, 4, sizeof(cl_uint), &ctr[3]));
OCL_ASSERT(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &xor0[0]));
OCL_ASSERT(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &xor0[1]));
} else {
OCL_ASSERT(clSetKernelArg(kernel, 1, sizeof(cl_uint), &offset0));
OCL_ASSERT(clSetKernelArg(kernel, 2, sizeof(cl_ulong), &xor0[0]));
OCL_ASSERT(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &xor0[1]));
OCL_ASSERT(clSetKernelArg(kernel, 4, sizeof(cl_uint), &offset1));
OCL_ASSERT(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &xor1[0]));
OCL_ASSERT(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &xor1[1]));
}
get_hp_time(&t0);
for (unsigned i = 0; i < loops; ++i) {
cl_ulong console_id = console_id_template;
if (mode == BCD) {
console_id |= to_bcd(i) << group_bits;
} else {
console_id |= (u64)i << group_bits;
}
printf("%016"LL"x\n", console_id);
OCL_ASSERT(clSetKernelArg(kernel, 0, sizeof(cl_ulong), &console_id));
OCL_ASSERT(clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &num_items, &local, 0, NULL, NULL));
clFinish(command_queue);
OCL_ASSERT(clEnqueueReadBuffer(command_queue, mem_out, CL_TRUE, 0, sizeof(cl_ulong), &out, 0, NULL, NULL));
if (out) {
get_hp_time(&t1); td = hp_time_diff(&t0, &t1);
printf("got a hit: %016"LL"x\n", out);
// also write to a file
dump_to_file(emmc_cid ? hexdump(emmc_cid, 16, 0) : hexdump(src0, 16, 0), &out, 8);
break;
}
}
u64 tested = 0;
if (!out) {
tested = total;
get_hp_time(&t1); td = hp_time_diff(&t0, &t1);
} else {
tested = out - console_id_template;
if (mode == BCD) {
tested = from_bcd(((tested & ~0xfff) >> 4) | (tested & 0xff));
}
}
printf("%.2f seconds, %.2f M/s\n", td / 1000000.0, tested * 1.0 / td);
clReleaseKernel(kernel);
clReleaseMemObject(mem_out);
clReleaseProgram(program);
clReleaseCommandQueue(command_queue);
clReleaseContext(context);
return !out;
}
int ocl_brute_emmc_cid(const cl_uchar *console_id, cl_uchar *emmc_cid,
cl_uint offset, const cl_uchar *src, const cl_uchar *ver)
{
// preparing args
u8 aes_key[16];
dsi_make_key(aes_key, u64be(console_id));
aes_init();
aes_set_key_dec_128(aes_key);
cl_ulong xor[2];
dsi_make_xor((u8*)xor, src, ver);
cl_ulong ctr[2];
aes_decrypt_128((u8*)xor , (u8*)ctr);
cl_ulong emmc_cid_sha1_16[2];
byte_reverse_16((u8*)emmc_cid_sha1_16, (u8*)ctr);
sub_128_64(emmc_cid_sha1_16, offset);
cl_ulong out = 0;
#ifdef DEBUG
{
printf("SHA1 A: %s\n", hexdump(emmc_cid_sha1_16, 16, 0));
u8 sha1_verify[16];
sha1_16(emmc_cid, sha1_verify);
printf("SHA1 B: %s\n", hexdump(sha1_verify, 16, 0));
}
#endif
TimeHP t0, t1; long long td = 0;
cl_int err;
cl_platform_id platform_id;
cl_device_id device_id;
ocl_get_device(&platform_id, &device_id);
if (platform_id == NULL || device_id == NULL) {
return -1;
}
cl_context context = OCL_ASSERT2(clCreateContext(0, 1, &device_id, NULL, NULL, &err));
cl_command_queue command_queue = OCL_ASSERT2(clCreateCommandQueue(context, device_id, 0, &err));
const char *source_names[] = {
"cl/common.h",
"cl/sha1_16.cl",
"cl/kernel_emmc_cid.cl" };
cl_program program = ocl_build_from_sources(sizeof(source_names) / sizeof(char *),
source_names, context, device_id, "-w -Werror");
cl_kernel kernel = OCL_ASSERT2(clCreateKernel(program, "test_emmc_cid", &err));
size_t local;
OCL_ASSERT(clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL));
printf("local work size: %u\n", (unsigned)local);
// there's no option to create it zero initialized
cl_mem mem_out = OCL_ASSERT2(clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_ulong), NULL, &err));
OCL_ASSERT(clEnqueueWriteBuffer(command_queue, mem_out, CL_TRUE, 0, sizeof(cl_ulong), &out, 0, NULL, NULL));
unsigned brute_bits = 32;
unsigned group_bits = 28;
unsigned loop_bits = brute_bits - group_bits;
unsigned loops = 1ull << loop_bits;
size_t num_items = 1ull << group_bits;
// it needs to be aligned, a little overhead hurts nobody
if (num_items % local) {
num_items = (num_items / local + 1) * local;
}
OCL_ASSERT(clSetKernelArg(kernel, 2, sizeof(cl_ulong), emmc_cid_sha1_16));
OCL_ASSERT(clSetKernelArg(kernel, 3, sizeof(cl_ulong), emmc_cid_sha1_16 + 1));
OCL_ASSERT(clSetKernelArg(kernel, 4, sizeof(cl_mem), &mem_out));
get_hp_time(&t0);
for (unsigned i = 0; i < loops; ++i) {
*(u32*)(emmc_cid + 1) = i << group_bits;
puts(hexdump(emmc_cid, 16, 0));
OCL_ASSERT(clSetKernelArg(kernel, 0, sizeof(cl_ulong), emmc_cid));
OCL_ASSERT(clSetKernelArg(kernel, 1, sizeof(cl_ulong), emmc_cid + 8));
OCL_ASSERT(clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &num_items, &local, 0, NULL, NULL));
clFinish(command_queue);
OCL_ASSERT(clEnqueueReadBuffer(command_queue, mem_out, CL_TRUE, 0, sizeof(cl_ulong), &out, 0, NULL, NULL));
if (out) {
get_hp_time(&t1); td = hp_time_diff(&t0, &t1);
*(u32*)(emmc_cid + 1) |= out;
printf("got a hit: %s\n", hexdump(emmc_cid, 16, 0));
// also write to a file
dump_to_file(hexdump(console_id, 8, 0), emmc_cid, 16);
break;
}
}
u64 tested = 0;
if (!out) {
tested = 1ull << 32;
get_hp_time(&t1); td = hp_time_diff(&t0, &t1);
} else {
tested = *(u32*)(emmc_cid + 1);
}
printf("%.2f seconds, %.2f M/s\n", td / 1000000.0, tested * 1.0 / td);
clReleaseKernel(kernel);
clReleaseMemObject(mem_out);
clReleaseProgram(program);
clReleaseCommandQueue(command_queue);
clReleaseContext(context);
return !out;
}
/* more info:
* https://zoogie.github.io/web/34%E2%85%95c3/#/22
* https://gbatemp.net/threads/eol-is-lol-the-34c3-talk-for-3ds-that-never-was.494698/
* what I'm doing here is simply brute the 3rd u32 of a u128 so that the first half of sha256 matches ver
*/
int ocl_brute_msky(const cl_uint *msky, const cl_uint *ver)
{
TimeHP t0, t1; long long td = 0;
cl_int err;
cl_platform_id platform_id;
cl_device_id device_id;
ocl_get_device(&platform_id, &device_id);
if (platform_id == NULL || device_id == NULL) {
return -1;
}
cl_context context = OCL_ASSERT2(clCreateContext(0, 1, &device_id, NULL, NULL, &err));
cl_command_queue command_queue = OCL_ASSERT2(clCreateCommandQueue(context, device_id, 0, &err));
const char *source_names[] = {
"cl/common.h",
"cl/sha256_16.cl",
"cl/kernel_msky.cl" };
cl_program program = ocl_build_from_sources(sizeof(source_names) / sizeof(char *),
source_names, context, device_id, "-w -Werror -DSHA256_16");
cl_kernel kernel = OCL_ASSERT2(clCreateKernel(program, "test_msky", &err));
size_t local;
OCL_ASSERT(clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL));
printf("local work size: %u\n", (unsigned)local);
// there's no option to create it zero initialized
cl_uint out = 0;
cl_mem mem_out = OCL_ASSERT2(clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uint), NULL, &err));
OCL_ASSERT(clEnqueueWriteBuffer(command_queue, mem_out, CL_TRUE, 0, sizeof(cl_uint), &out, 0, NULL, NULL));
unsigned brute_bits = 32;
unsigned group_bits = 20;
unsigned loop_bits = brute_bits - group_bits;
unsigned loops = 1ull << loop_bits;
size_t num_items = 1ull << group_bits;
// it needs to be aligned, a little overhead hurts nobody
if (num_items % local) {
num_items = (num_items / local + 1) * local;
}
OCL_ASSERT(clSetKernelArg(kernel, 0, sizeof(cl_uint), &msky[0]));
OCL_ASSERT(clSetKernelArg(kernel, 1, sizeof(cl_uint), &msky[1]));
OCL_ASSERT(clSetKernelArg(kernel, 4, sizeof(cl_uint), &ver[0]));
OCL_ASSERT(clSetKernelArg(kernel, 5, sizeof(cl_uint), &ver[1]));
OCL_ASSERT(clSetKernelArg(kernel, 6, sizeof(cl_uint), &ver[2]));
OCL_ASSERT(clSetKernelArg(kernel, 7, sizeof(cl_uint), &ver[3]));
OCL_ASSERT(clSetKernelArg(kernel, 8, sizeof(cl_mem), &mem_out));
get_hp_time(&t0);
int msky3_range = 16384; // "fan out" +/-8192 on msky3
unsigned i, j;
for (j = 0; j < msky3_range; ++j) {
cl_uint msky3 = msky[3] + (j & 1 ? 1 : -1) * ((j + 1) >> 1);
printf("%08x\r", msky3);
OCL_ASSERT(clSetKernelArg(kernel, 3, sizeof(cl_uint), &msky3));
for (i = 0; i < loops; ++i) {
cl_uint msky2 = i << group_bits;
OCL_ASSERT(clSetKernelArg(kernel, 2, sizeof(cl_uint), &msky2));
OCL_ASSERT(clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &num_items, &local, 0, NULL, NULL));
clFinish(command_queue);
OCL_ASSERT(clEnqueueReadBuffer(command_queue, mem_out, CL_TRUE, 0, sizeof(cl_uint), &out, 0, NULL, NULL));
if (out) {
get_hp_time(&t1); td = hp_time_diff(&t0, &t1);
cl_uint msky_ret[4] = { msky[0], msky[1], out, msky3 };
printf("got a hit: %s\n", hexdump(msky_ret, 16, 0));
break;
}
}
if (out) {
break;
}
}
u64 tested = 0;
if (!out) {
tested = (1ull << brute_bits) * msky3_range;
get_hp_time(&t1); td = hp_time_diff(&t0, &t1);
} else {
tested = out + (1ull << brute_bits) * j;
}
printf("%.2f seconds, %.2f M/s\n", td / 1000000.0, tested * 1.0 / td);
clReleaseKernel(kernel);
clReleaseMemObject(mem_out);
clReleaseProgram(program);
clReleaseCommandQueue(command_queue);
clReleaseContext(context);
return !out;
}
// LFCS brute force, https://gist.github.com/zoogie/4046726878dba89eddfa1fc07c8a27da
int ocl_brute_lfcs(cl_uint lfcs_template, cl_ushort newflag, const cl_uint *ver)
{
TimeHP t0, t1; long long td = 0;
cl_int err;
cl_platform_id platform_id;
cl_device_id device_id;
ocl_get_device(&platform_id, &device_id);
if (platform_id == NULL || device_id == NULL) {
return -1;
}
cl_context context = OCL_ASSERT2(clCreateContext(0, 1, &device_id, NULL, NULL, &err));
cl_command_queue command_queue = OCL_ASSERT2(clCreateCommandQueue(context, device_id, 0, &err));
const char *source_names[] = {
"cl/common.h",
"cl/sha256_16.cl",
"cl/kernel_lfcs.cl" };
cl_program program = ocl_build_from_sources(sizeof(source_names) / sizeof(char *),
source_names, context, device_id, "-w -Werror -DSHA256_12");
cl_kernel kernel = OCL_ASSERT2(clCreateKernel(program, "test_lfcs", &err));
size_t local;
OCL_ASSERT(clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL));
printf("local work size: %u\n", (unsigned)local);
// there's no option to create it zero initialized
cl_uint out = 0;
cl_mem mem_out = OCL_ASSERT2(clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uint), NULL, &err));
OCL_ASSERT(clEnqueueWriteBuffer(command_queue, mem_out, CL_TRUE, 0, sizeof(cl_uint), &out, 0, NULL, NULL));
unsigned brute_bits = 32;
unsigned group_bits = 20;
unsigned loop_bits = brute_bits - group_bits;
unsigned loops = 1ull << loop_bits;
size_t num_items = 1ull << group_bits;
// it needs to be aligned, a little overhead hurts nobody
if (num_items % local) {
num_items = (num_items / local + 1) * local;
}
OCL_ASSERT(clSetKernelArg(kernel, 1, sizeof(cl_ushort), &newflag));
OCL_ASSERT(clSetKernelArg(kernel, 2, sizeof(cl_uint), &ver[0]));
OCL_ASSERT(clSetKernelArg(kernel, 3, sizeof(cl_uint), &ver[1]));
OCL_ASSERT(clSetKernelArg(kernel, 4, sizeof(cl_mem), &mem_out));
get_hp_time(&t0);
int fan_range = 0x10000; // "fan out" full 16 bits
unsigned i, j;
for (j = 0; j < fan_range; ++j) {
int fan = (j & 1 ? 1 : -1) * ((j + 1) >> 1);
printf("%d \r", fan);
for (i = 0; i < loops; ++i) {
cl_uint lfcs = lfcs_template + fan * 0x10000 + (i << (group_bits - 16));
OCL_ASSERT(clSetKernelArg(kernel, 0, sizeof(cl_uint), &lfcs));
OCL_ASSERT(clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &num_items, &local, 0, NULL, NULL));
clFinish(command_queue);
OCL_ASSERT(clEnqueueReadBuffer(command_queue, mem_out, CL_TRUE, 0, sizeof(cl_uint), &out, 0, NULL, NULL));
if (out) {
get_hp_time(&t1); td = hp_time_diff(&t0, &t1);
lfcs += out >> 16;
printf("got a hit: %s (rand: 0x%04x)\n", hexdump(&lfcs, 4, 0), out & 0xffff);
break;
}
}
if (out) {
break;
}
}
u64 tested = 0;
if (!out) {
tested = (1ull << brute_bits) * fan_range;
get_hp_time(&t1); td = hp_time_diff(&t0, &t1);
} else {
tested = out + (1ull << brute_bits) * j;
}
printf("%.2f seconds, %.2f M/s\n", td / 1000000.0, tested * 1.0 / td);
clReleaseKernel(kernel);
clReleaseMemObject(mem_out);
clReleaseProgram(program);
clReleaseCommandQueue(command_queue);
clReleaseContext(context);
return !out;
}