some how it is broken now after some refactoring

This commit is contained in:
JimmyZ 2017-08-28 18:07:43 +08:00
parent e64a85470c
commit ed7e0ccfd9
16 changed files with 559 additions and 322 deletions

2
.gitignore vendored
View File

@ -2,5 +2,5 @@
*.o
*.exe
*.sln
*.vcxproj*
*.vc*
x64/

View File

@ -1,6 +1,6 @@
# only tested in mingw
PNAME = bfcl
OBJS = $(PNAME).o ocl_util.o utils.o sha1_16.o aes_128.o
OBJS = $(PNAME).o ocl_util.o utils.o sha1_16.o aes_128.o ocl_test.o ocl_brute.o
CFLAGS += -std=c11 -Wall -O2 -mrdrnd -I$(INTELOCLSDKROOT)/include
LDFLAGS += -L$(INTELOCLSDKROOT)/lib/x64

276
bfcl.c
View File

@ -1,285 +1,19 @@
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <immintrin.h>
#include "ocl.h"
#include "crypto.h"
#include "utils.h"
#ifdef __GNUC__
#include <cpuid.h>
#elif _MSC_VER
#include <intrin.h>
#endif
void ocl_test();
int cpu_has_rdrand() {
#if __GNUC__
unsigned a = 0, b = 0, c = 0, d = 0;
__get_cpuid(1, &a, &b, &c, &d);
return c & bit_RDRND;
#elif _MSC_VER
int regs[4];
__cpuid(regs, 1);
return regs[2] & (1<<30);
#else
// ICL only?
return _may_i_use_cpu_feature(_FEATURE_RDRND);
#endif
}
// CAUTION: caller is responsible to free the buf
char * read_file(const char *file_name, size_t *p_size) {
FILE * f = fopen(file_name, "rb");
if (f == NULL) {
printf("can't read file: %s", file_name);
exit(-1);
}
fseek(f, 0, SEEK_END);
*p_size = ftell(f);
char * buf = malloc(*p_size);
fseek(f, 0, SEEK_SET);
fread(buf, *p_size, 1, f);
fclose(f);
return buf;
}
void read_files(unsigned num_files, const char *file_names[], char *ptrs[], size_t sizes[]) {
for (unsigned i = 0; i < num_files; ++i) {
ptrs[i] = read_file(file_names[i], &sizes[i]);
}
}
void dump_to_file(const char *file_name, const void *buf, size_t len) {
FILE *f = fopen(file_name, "wb");
if (f == NULL) {
printf("can't open file to write: %s\n", file_name);
return;
}
fwrite(buf, len, 1, f);
fclose(f);
}
#define TEST_SHA1_16 1
#define TEST_AES_128_ECB 2
#define BLOCK_SIZE 0x10
#define NUM_BLOCKS (1 << 23)
#define BLOCKS_PER_ITEM 1
void ocl_test(cl_device_id device_id, const cl_uchar *buf_in, int test_case) {
cl_int err;
cl_context context = OCL_ASSERT2(clCreateContext(0, 1, &device_id, NULL, NULL, &err));
cl_command_queue command_queue = OCL_ASSERT2(clCreateCommandQueue(context, device_id, 0, &err));
HP_Time t0, t1;
long long td;
const size_t num_items = NUM_BLOCKS / BLOCKS_PER_ITEM;
const size_t io_buf_len = NUM_BLOCKS * BLOCK_SIZE;
const char *source_names[] = { "cl/sha1_16.cl", "cl/aes_tables.cl", "cl/aes_128.cl", "cl/kernels.cl" };
const unsigned num_sources = sizeof(source_names) / sizeof(char *);
char *sources[sizeof(source_names) / sizeof(char *)];
size_t source_sizes[sizeof(source_names) / sizeof(char *)];
read_files(num_sources, source_names, sources, source_sizes);
get_hp_time(&t0);
// WTF? GCC complains if I pass char ** in to a function expecting const char **?
cl_program program = OCL_ASSERT2(clCreateProgramWithSource(context, num_sources, (const char **)sources, source_sizes, &err));
char options[0x100];
sprintf(options, "-w -Werror -DBLOCKS_PER_ITEM=%d", BLOCKS_PER_ITEM);
// printf("compiler options: %s\n", options);
err = clBuildProgram(program, 0, NULL, options, NULL, NULL);
get_hp_time(&t1);
printf("%d microseconds for compile\n", (int)hp_time_diff(&t0, &t1));
for (unsigned i = 0; i < num_sources; ++i) {
free(sources[i]);
}
if (err != CL_SUCCESS) {
fprintf(stderr, "failed to build program, error: %s, build log:\n", ocl_err_msg(err));
size_t len;
clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &len);
char *buf_log = malloc(len + 1);
clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, len, buf_log, NULL);
buf_log[len] = '\0';
fprintf(stderr, "%s\n", buf_log);
free(buf_log);
exit(err);
}
const char * test_name;
switch (test_case) {
case TEST_SHA1_16: test_name = "sha1_16_test"; break;
case TEST_AES_128_ECB: test_name = "aes_128_ecb_test"; break;
default: exit(-1);
}
printf("%s on %u MB\n", test_name, (unsigned)io_buf_len >> 20);
cl_kernel kernel = OCL_ASSERT2(clCreateKernel(program, test_name, &err));
cl_uchar key[16];
unsigned int aes_rk[RK_LEN];
if (test_case == TEST_AES_128_ECB) {
aes_gen_tables();
for (unsigned i = 0; i < 16; ++i) {
key[i] = rand() & 0xff;
}
printf("Key: %s\n", hexdump(key, 16, 0));
}
cl_mem mem_in = OCL_ASSERT2(clCreateBuffer(context, CL_MEM_READ_ONLY, io_buf_len, NULL, &err));
cl_mem mem_out = OCL_ASSERT2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, io_buf_len, NULL, &err));
cl_mem mem_key;
if (test_case == TEST_AES_128_ECB) {
mem_key = OCL_ASSERT2(clCreateBuffer(context, CL_MEM_READ_ONLY, 16, NULL, &err));
}
get_hp_time(&t0);
OCL_ASSERT(clEnqueueWriteBuffer(command_queue, mem_in, CL_TRUE, 0, io_buf_len, buf_in, 0, NULL, NULL));
if (test_case == TEST_AES_128_ECB) {
OCL_ASSERT(clEnqueueWriteBuffer(command_queue, mem_key, CL_TRUE, 0, 16, key, 0, NULL, NULL));
}
get_hp_time(&t1);
td = hp_time_diff(&t0, &t1);
printf("%d microseconds for data upload, %.2f MB/s\n", (int)td, io_buf_len * 1.0f / td);
OCL_ASSERT(clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem_in));
OCL_ASSERT(clSetKernelArg(kernel, 1, sizeof(cl_mem), &mem_out));
if (test_case == TEST_AES_128_ECB) {
OCL_ASSERT(clSetKernelArg(kernel, 2, sizeof(cl_mem), &mem_key));
}
size_t local;
OCL_ASSERT(clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL));
printf("local work size: %u\n", (unsigned)local);
get_hp_time(&t0);
// apparently, setting local work size to NULL doesn't affect performance, at least in this kind of work
OCL_ASSERT(clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &num_items, &local, 0, NULL, NULL));
// OCL_ASSERT(clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &num_items, NULL, 0, NULL, NULL));
clFinish(command_queue);
get_hp_time(&t1);
td = hp_time_diff(&t0, &t1);
printf("%d microseconds for OpenCL, %.2f MB/s\n", (int)td, io_buf_len * 1.0f / td);
cl_uchar *buf_out = malloc(io_buf_len);
get_hp_time(&t0);
OCL_ASSERT(clEnqueueReadBuffer(command_queue, mem_out, CL_TRUE, 0, io_buf_len, buf_out, 0, NULL, NULL));
get_hp_time(&t1);
td = hp_time_diff(&t0, &t1);
printf("%d microseconds for data download, %.2f MB/s\n", (int)td, io_buf_len * 1.0f / td);
/*
if(test_case == TEST_AES_128_ECB){
dump_to_file("r:/test_aes_in.bin", buf_in, io_buf_len);
dump_to_file("r:/test_aes_out.bin", buf_out, io_buf_len);
}
*/
cl_uchar *buf_verify = malloc(io_buf_len);
get_hp_time(&t0);
if (test_case == TEST_SHA1_16) {
for (unsigned offset = 0; offset < io_buf_len; offset += BLOCK_SIZE) {
sha1_16(buf_in + offset, buf_verify + offset);
}
} else {
for (unsigned offset = 0; offset < io_buf_len; offset += BLOCK_SIZE) {
// setting the same key over and over is stupid
// yet we still do it to keep it in line with the OpenCL port
// otherwise we can't test the set key in OpenCL
aes_set_key_enc_128(aes_rk, key);
aes_encrypt_128(aes_rk, buf_in + offset, buf_verify + offset);
}
}
get_hp_time(&t1);
td = hp_time_diff(&t0, &t1);
printf("%d microseconds for C(single thread), %.2f MB/s\n", (int)td, io_buf_len * 1.0f / td);
if (memcmp(buf_verify, buf_out, io_buf_len)) {
printf("%s: verification failed\n", test_name);
for (unsigned offset = 0; offset < io_buf_len; offset += BLOCK_SIZE) {
if (memcmp(buf_verify + offset, buf_out + offset, BLOCK_SIZE)) {
printf("first difference @ 0x%08x/0x%08x:\n", offset, (unsigned)num_items );
printf("\t%s\n", hexdump(buf_in + offset, BLOCK_SIZE, 0));
printf("\t%s\n", hexdump(buf_verify + offset, BLOCK_SIZE, 0));
printf("\t%s\n", hexdump(buf_out + offset, BLOCK_SIZE, 0));
break;
}
}
} else {
printf("%s: succeed\n", test_name);
}
free(buf_out);
clReleaseMemObject(mem_in);
clReleaseMemObject(mem_out);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(command_queue);
clReleaseContext(context);
}
void ocl_brute();
int main(int argc, const char *argv[]) {
if (argc == 2 && !strcmp(argv[1], "info")) {
cl_uint num_platforms;
ocl_info(&num_platforms, 1);
} else if (argc == 2 && !strcmp(argv[1], "console_id")){
ocl_brute();
} else if (argc == 1){
cl_platform_id platform_id;
cl_device_id device_id;
ocl_get_device(&platform_id, &device_id);
if (platform_id == NULL || device_id == NULL) {
return -1;
}
cl_uchar *buf_in = malloc(BLOCK_SIZE * NUM_BLOCKS);
srand(2501);
HP_Time t0, t1; long long td;
get_hp_time(&t0);
if(cpu_has_rdrand()){
// ~190 MB/s @ X230, ~200 without the success check
printf("randomize source buffer using RDRAND\n");
unsigned long long *p = (unsigned long long *)buf_in;
unsigned long long *p_end = (unsigned long long *)(buf_in + BLOCK_SIZE * NUM_BLOCKS);
int success = 1;
while (p < p_end) {
success &= _rdrand64_step(p++);
}
if (!success) {
printf("RDRND failed\n");
exit(-1);
}
}else {
printf("randomize source buffer using AES OFB\n");
// rand() & 0xff is about ~60 MB/s @ X230
// it's worse than that AES single thread C, so OFB it is
// ~240 MB/s, even faster than RDRAND
srand(2501);
unsigned int aes_rk[RK_LEN];
unsigned char key_iv[16 * 2];
for (unsigned i = 0; i < 16 * 2; ++i) {
key_iv[i] = rand() & 0xff;
}
aes_set_key_enc_128(aes_rk, key_iv);
aes_encrypt_128(aes_rk, key_iv + 16, buf_in);
unsigned char *p_in = buf_in, *p_out = buf_in + 16,
*p_end = buf_in + BLOCK_SIZE * NUM_BLOCKS;
while (p_out < p_end) {
aes_encrypt_128(aes_rk, p_in, p_out);
p_in = p_out;
p_out += 16;
}
}
get_hp_time(&t1);
td = hp_time_diff(&t0, &t1);
printf("%d microseconds for preparing test data, %.2f MB/s\n",
(int)td, BLOCK_SIZE * NUM_BLOCKS * 1.0f / td);
ocl_test(device_id, buf_in, TEST_SHA1_16);
ocl_test(device_id, buf_in, TEST_AES_128_ECB);
free(buf_in);
ocl_test();
#ifdef _WIN32
system("pause");
#endif

View File

@ -1,18 +1,9 @@
// OpenCL has these fancy address space qualifiers that can't be cast without
#define GET_UINT32_LE(n, b, i) \
(n) = *(uint32_t*)(b + i)
#define GET_UINT32_LE_G(n, b, i) \
(n) = *(__global uint32_t*)(b + i)
#define GET_UINT32_LE_C(n, b, i) \
(n) = *(__constant uint32_t*)(b + i)
#define PUT_UINT32_LE(n, b, i) \
*(uint32_t*)(b + i) = (n)
#define PUT_UINT32_LE_G(n, b, i) \
*(__global uint32_t*)(b + i) = (n)
// AES 128 ECB adapted for OpenCL, see "aes_128.c" for more info
#define RK_LEN 44
// the caller is responsible to put the key in rk
void aes_set_key_enc_128(uint32_t rk[RK_LEN])
{
uint32_t *RK = rk;

View File

@ -1,8 +1,11 @@
// I'm obsessed with the idea of generating this file in memory
// on the other hand I know that's pointless...
/*
* Forward S-box
*/
__constant static const unsigned char FSb[256] =
static const unsigned char FSb[256] =
{
0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5,
0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76,
@ -109,19 +112,19 @@ __constant static const unsigned char FSb[256] =
V(CB,B0,B0,7B), V(FC,54,54,A8), V(D6,BB,BB,6D), V(3A,16,16,2C)
#define V(a,b,c,d) 0x##a##b##c##d
__constant static const uint32_t FT0[256] = { FT };
static const uint32_t FT0[256] = { FT };
#undef V
#define V(a,b,c,d) 0x##b##c##d##a
__constant static const uint32_t FT1[256] = { FT };
static const uint32_t FT1[256] = { FT };
#undef V
#define V(a,b,c,d) 0x##c##d##a##b
__constant static const uint32_t FT2[256] = { FT };
static const uint32_t FT2[256] = { FT };
#undef V
#define V(a,b,c,d) 0x##d##a##b##c
__constant static const uint32_t FT3[256] = { FT };
static const uint32_t FT3[256] = { FT };
#undef V
#undef FT
@ -129,7 +132,7 @@ __constant static const uint32_t FT3[256] = { FT };
/*
* Round constants
*/
__constant static const uint32_t RCON[10] =
static const uint32_t RCON[10] =
{
0x00000001, 0x00000002, 0x00000004, 0x00000008,
0x00000010, 0x00000020, 0x00000040, 0x00000080,

39
cl/common.h Normal file
View File

@ -0,0 +1,39 @@
typedef unsigned int uint32_t;
typedef unsigned char u8;
typedef unsigned int u32;
typedef unsigned long u64;
#ifndef GET_UINT32_BE
#define GET_UINT32_BE(n,b,i) \
{ \
(n) = ( (uint32_t) (b)[(i) ] << 24 ) \
| ( (uint32_t) (b)[(i) + 1] << 16 ) \
| ( (uint32_t) (b)[(i) + 2] << 8 ) \
| ( (uint32_t) (b)[(i) + 3] ); \
}
#endif
#ifndef PUT_UINT32_BE
#define PUT_UINT32_BE(n,b,i) \
{ \
(b)[(i) ] = (unsigned char) ( (n) >> 24 ); \
(b)[(i) + 1] = (unsigned char) ( (n) >> 16 ); \
(b)[(i) + 2] = (unsigned char) ( (n) >> 8 ); \
(b)[(i) + 3] = (unsigned char) ( (n) ); \
}
#endif
// OpenCL has these fancy address space qualifiers that can't be cast without
#define GET_UINT32_LE(n, b, i) \
(n) = *(uint32_t*)(b + i)
#define GET_UINT32_LE_G(n, b, i) \
(n) = *(__global uint32_t*)(b + i)
#define GET_UINT32_LE_C(n, b, i) \
(n) = *(__constant uint32_t*)(b + i)
#define PUT_UINT32_LE(n, b, i) \
*(uint32_t*)(b + i) = (n)
#define PUT_UINT32_LE_G(n, b, i) \
*(__global uint32_t*)(b + i) = (n)

67
cl/dsi.cl Normal file
View File

@ -0,0 +1,67 @@
// more about this: https://github.com/Jimmy-Z/TWLbf/blob/master/dsi.c
static const u64 DSi_KEY_Y[2] =
{0xbd4dc4d30ab9dc76ull, 0xe1a00005202ddd1dull};
static const u64 DSi_KEY_MAGIC[2] =
{0x2a680f5f1a4f3e79ull, 0xfffefb4e29590258ull};
static inline void xor_128(u64 *x, const u64 *a, const u64 *b){
x[0] = a[0] ^ b[0];
x[1] = a[1] ^ b[1];
}
static inline void add_128(u64 *a, const u64 *b){
a[0] += b[0];
if(a[0] < b[0]){
a[1] += b[1] + 1;
}else{
a[1] += b[1];
}
}
static inline void add_128_64(u64 *a, u64 b){
a[0] += b;
if(a[0] < b){
a[1] += 1;
}
}
// Answer to life, universe and everything.
static inline void rol42_128(u64 *a){
u64 t = a[1];
a[1] = (t << 42 ) | (a[0] >> 22);
a[0] = (a[0] << 42 ) | (t >> 22);
}
// eMMC Encryption for MBR/Partitions (AES-CTR, with console-specific key)
void dsi_make_key(u64 *key, u64 console_id){
u32 h = console_id >> 32, l = (u32)console_id;
u32 key_x[4] = {l, l ^ 0x24ee6906, h ^ 0xe65b601d, h};
// Key = ((Key_X XOR Key_Y) + FFFEFB4E295902582A680F5F1A4F3E79h) ROL 42
// equivalent to F_XY in twltool/f_xy.c
xor_128(key, (u64*)key_x, DSi_KEY_Y);
add_128(key, DSi_KEY_MAGIC);
rol42_128(key);
}
// CAUTION this one doesn't work in-place
void byte_reverse_16(u8 *out, const u8 *in){
out[0] = in[15];
out[1] = in[14];
out[2] = in[13];
out[3] = in[12];
out[4] = in[11];
out[5] = in[10];
out[6] = in[9];
out[7] = in[8];
out[8] = in[7];
out[9] = in[6];
out[10] = in[5];
out[11] = in[4];
out[12] = in[3];
out[13] = in[2];
out[14] = in[1];
out[15] = in[0];
}

31
cl/kernel_console_id.cl Normal file
View File

@ -0,0 +1,31 @@
// the caller should feed the target xor pad byte reversed as two uint64_t
// the ctr from emmc_cid_sha1 byte reversed as 4 uint32_t
__kernel void test_console_id(
u64 xor_l, u64 xor_h,
u64 console_id_template,
u32 ctr0, u32 ctr1, u32 ctr2, u32 ctr3,
__global int *success,
__global u64 *console_id_out)
{
if(success){
return;
}
// TODO: BCD conversion
u64 console_id = get_global_id(0) | console_id_template;
u64 dsi_key[2];
dsi_make_key(dsi_key, console_id);
u32 aes_rk[RK_LEN];
byte_reverse_16((u8*)aes_rk, (u8*)dsi_key);
aes_set_key_enc_128(aes_rk);
u32 ctr[4] = {ctr0, ctr1, ctr2, ctr3};
u64 xor[2];
aes_encrypt_128(aes_rk, ctr, (u32*)xor);
if(xor[0] == xor_l && xor[1] == xor_h){
*success = 1;
*console_id_out = console_id;
}
}

View File

@ -1,6 +1,6 @@
__kernel void sha1_16_test(
__global const unsigned char *in,
__constant const unsigned char *in,
__global unsigned char *out)
{
unsigned offset = get_global_id(0) * BLOCKS_PER_ITEM * 16;
@ -26,9 +26,9 @@ __kernel void sha1_16_test(
#define AES_BLOCK_SIZE 16
__kernel void aes_128_ecb_test(
__global const uint32_t *in,
__global uint32_t *out,
__constant const uint32_t *key)
__constant const uint32_t *key,
__constant const uint32_t *in,
__global uint32_t *out)
{
uint32_t rk[RK_LEN];
rk[0] = key[0]; rk[1] = key[1]; rk[2] = key[2]; rk[3] = key[3];
@ -43,7 +43,7 @@ __kernel void aes_128_ecb_test(
aes_encrypt_128(rk, buf, buf);
out[0] = buf[0]; out[1] = buf[1]; out[2] = buf[2]; out[3] = buf[3];
#if BLOCKS_PER_ITEM != 1
offset += AES_BLOCK_SIZE;
offset += AES_BLOCK_SIZE / 4;
}
#endif
}

View File

@ -1,25 +1,5 @@
typedef unsigned int uint32_t;
#ifndef GET_UINT32_BE
#define GET_UINT32_BE(n,b,i) \
{ \
(n) = ( (uint32_t) (b)[(i) ] << 24 ) \
| ( (uint32_t) (b)[(i) + 1] << 16 ) \
| ( (uint32_t) (b)[(i) + 2] << 8 ) \
| ( (uint32_t) (b)[(i) + 3] ); \
}
#endif
#ifndef PUT_UINT32_BE
#define PUT_UINT32_BE(n,b,i) \
{ \
(b)[(i) ] = (unsigned char) ( (n) >> 24 ); \
(b)[(i) + 1] = (unsigned char) ( (n) >> 16 ); \
(b)[(i) + 2] = (unsigned char) ( (n) >> 8 ); \
(b)[(i) + 3] = (unsigned char) ( (n) ); \
}
#endif
// sha1_16 adapted for OpenCL, see "sha1_16.c" for more information
__constant const uint32_t
h0 = 0x67452301,

4
ocl.h
View File

@ -46,3 +46,7 @@ void ocl_assert(cl_int ret, const char * code, const char * file,
OCL_Platform *ocl_info(cl_uint *p_num_platforms, int verbose);
void ocl_get_device(cl_platform_id *p_platform_id, cl_device_id *p_device_id);
cl_program ocl_build_from_sources(
unsigned num_sources, const char *source_names[],
cl_context context, cl_device_id device_id, const char * options);

77
ocl_brute.c Normal file
View File

@ -0,0 +1,77 @@
#include <stdio.h>
#include "utils.h"
#include "crypto.h"
#include "ocl.h"
void ocl_brute() {
TimeHP t0, t1; long long td;
cl_int err;
cl_platform_id platform_id;
cl_device_id device_id;
ocl_get_device(&platform_id, &device_id);
if (platform_id == NULL || device_id == NULL) {
return;
}
cl_context context = OCL_ASSERT2(clCreateContext(0, 1, &device_id, NULL, NULL, &err));
cl_command_queue command_queue = OCL_ASSERT2(clCreateCommandQueue(context, device_id, 0, &err));
const char *source_names[] = {
"cl/common.h",
"cl/aes_tables.cl",
"cl/aes_128.cl",
"cl/dsi.cl",
"cl/kernel_console_id.cl" };
cl_program program = ocl_build_from_sources(sizeof(source_names) / sizeof(char *),
source_names, context, device_id, NULL /* "-w -Werror" */);
cl_kernel kernel = OCL_ASSERT2(clCreateKernel(program, "test_console_id", &err));
cl_ulong xor_l, xor_h, console_id_template = 0x08a1522617110100ull, out;
cl_uint ctr0, ctr1, ctr2, ctr3;
cl_int success = 0;
cl_mem mem_success =
OCL_ASSERT2(clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL, &err));
cl_mem mem_out =
OCL_ASSERT2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_ulong), NULL, &err));
OCL_ASSERT(clEnqueueWriteBuffer(command_queue, mem_success, CL_TRUE, 0, sizeof(cl_int), &success, 0, NULL, NULL));
OCL_ASSERT(clSetKernelArg(kernel, 0, sizeof(cl_ulong), &xor_l));
OCL_ASSERT(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &xor_h));
OCL_ASSERT(clSetKernelArg(kernel, 2, sizeof(cl_ulong), &console_id_template));
OCL_ASSERT(clSetKernelArg(kernel, 3, sizeof(cl_uint), &ctr0));
OCL_ASSERT(clSetKernelArg(kernel, 4, sizeof(cl_uint), &ctr1));
OCL_ASSERT(clSetKernelArg(kernel, 5, sizeof(cl_uint), &ctr2));
OCL_ASSERT(clSetKernelArg(kernel, 6, sizeof(cl_uint), &ctr3));
OCL_ASSERT(clSetKernelArg(kernel, 7, sizeof(cl_mem), &mem_success));
OCL_ASSERT(clSetKernelArg(kernel, 8, sizeof(cl_mem), &mem_out));
size_t local;
OCL_ASSERT(clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL));
printf("local work size: %u\n", (unsigned)local);
size_t num_items = 0x100;
get_hp_time(&t0);
OCL_ASSERT(clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &num_items, &local, 0, NULL, NULL));
clFinish(command_queue);
get_hp_time(&t1); td = hp_time_diff(&t0, &t1);
OCL_ASSERT(clEnqueueReadBuffer(command_queue, mem_success, CL_TRUE, 0, sizeof(cl_int), &success, 0, NULL, NULL));
if (success) {
// if success, the speed measurement is invalid
printf("got a hit in %d microseconds\n", (int)td);
OCL_ASSERT(clEnqueueReadBuffer(command_queue, mem_out, CL_TRUE, 0, sizeof(cl_ulong), &out, 0, NULL, NULL));
printf("%08x%08x\n", (unsigned)(out >> 32), (unsigned)(out|0xffffffffu));
} else {
printf("%d microseconds, %.2f M/s\n", (int)td, num_items * 1.0f / td);
printf("sorry, no hit\n");
}
clReleaseKernel(kernel);
}

197
ocl_test.c Normal file
View File

@ -0,0 +1,197 @@
#include <stdio.h>
#include "utils.h"
#include "ocl.h"
#include "crypto.h"
#define BLOCK_SIZE 0x10
#define NUM_BLOCKS (1 << 23)
#define BLOCKS_PER_ITEM 1
#define NUM_ITEMS (NUM_BLOCKS / BLOCKS_PER_ITEM)
#define BUF_SIZE (BLOCK_SIZE * NUM_BLOCKS)
void ocl_test_run_and_read(const char * test_name, cl_kernel kernel,
cl_device_id device_id, cl_command_queue command_queue,
cl_mem mem_out, cl_uchar *buf_out)
{
printf("%s on %u MB\n", test_name, (unsigned)BUF_SIZE >> 20);
TimeHP t0, t1; long long td;
size_t local;
OCL_ASSERT(clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL));
printf("local work size: %u\n", (unsigned)local);
size_t num_items = NUM_ITEMS;
get_hp_time(&t0);
// apparently, setting local work size to NULL doesn't affect performance, at least in this kind of work
OCL_ASSERT(clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &num_items, &local, 0, NULL, NULL));
// OCL_ASSERT(clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &num_items, NULL, 0, NULL, NULL));
clFinish(command_queue);
get_hp_time(&t1); td = hp_time_diff(&t0, &t1);
printf("%d microseconds for OpenCL, %.2f MB/s\n", (int)td, BUF_SIZE * 1.0f / td);
get_hp_time(&t0);
OCL_ASSERT(clEnqueueReadBuffer(command_queue, mem_out, CL_TRUE, 0, BUF_SIZE, buf_out, 0, NULL, NULL));
get_hp_time(&t1); td = hp_time_diff(&t0, &t1);
printf("%d microseconds for data download, %.2f MB/s\n", (int)td, BUF_SIZE * 1.0f / td);
clReleaseKernel(kernel);
}
void verify(const char *test_name, cl_uchar *buf_in, cl_uchar *buf_out, cl_uchar *buf_verify){
if (memcmp(buf_verify, buf_out, BUF_SIZE)) {
printf("%s: verification failed\n", test_name);
unsigned count = 5;
for (unsigned offset = 0; offset < BUF_SIZE; offset += BLOCK_SIZE) {
if (memcmp(buf_verify + offset, buf_out + offset, BLOCK_SIZE)) {
printf("difference @ 0x%08x/0x%08x:\n", offset, (unsigned)NUM_BLOCKS);
printf("\t%s\n", hexdump(buf_in + offset, BLOCK_SIZE, 0));
printf("\t%s\n", hexdump(buf_out + offset, BLOCK_SIZE, 0));
printf("\t%s\n", hexdump(buf_verify + offset, BLOCK_SIZE, 0));
if (!--count) {
break;
}
}
}
} else {
printf("%s: succeed\n", test_name);
}
}
void ocl_test() {
TimeHP t0, t1; long long td;
cl_int err;
cl_platform_id platform_id;
cl_device_id device_id;
ocl_get_device(&platform_id, &device_id);
if (platform_id == NULL || device_id == NULL) {
return;
}
cl_uchar *buf_in = malloc(BUF_SIZE);
cl_uchar *buf_out = malloc(BUF_SIZE);
cl_uchar *buf_verify = malloc(BUF_SIZE);
srand(2501);
cl_uchar key[16];
unsigned int aes_rk[RK_LEN];
aes_gen_tables();
for (unsigned i = 0; i < 16; ++i) {
key[i] = rand() & 0xff;
}
printf("AES Key: %s\n", hexdump(key, 16, 0));
get_hp_time(&t0);
if(cpu_has_rdrand()){
// ~190 MB/s @ X230, ~200 without the success check
printf("randomize source buffer using RDRAND\n");
if (!rdrand_fill((cl_ulong*)buf_in, BUF_SIZE >> 3)) {
printf("RDRND failed\n");
exit(-1);
}
}else {
printf("randomize source buffer using AES OFB\n");
// rand() & 0xff is about ~60 MB/s @ X230
// it's worse than that AES single thread C, so OFB it is
// ~240 MB/s, even faster than RDRAND
unsigned int aes_rk[RK_LEN];
unsigned char key_iv[16 * 2];
for (unsigned i = 0; i < 16 * 2; ++i) {
key_iv[i] = rand() & 0xff;
}
aes_set_key_enc_128(aes_rk, key_iv);
aes_encrypt_128(aes_rk, key_iv + 16, buf_in);
unsigned char *p_in = buf_in, *p_out = buf_in + 16,
*p_end = buf_in + BUF_SIZE;
while (p_out < p_end) {
aes_encrypt_128(aes_rk, p_in, p_out);
p_in = p_out;
p_out += 16;
}
}
get_hp_time(&t1); td = hp_time_diff(&t0, &t1);
printf("%d microseconds for preparing test data, %.2f MB/s\n",
(int)td, BUF_SIZE * 1.0f / td);
cl_context context = OCL_ASSERT2(clCreateContext(0, 1, &device_id, NULL, NULL, &err));
cl_command_queue command_queue = OCL_ASSERT2(clCreateCommandQueue(context, device_id, 0, &err));
const char *source_names[] = {
"cl/common.h",
"cl/sha1_16.cl",
"cl/aes_tables.cl",
"cl/aes_128.cl",
"cl/kernel_tests.cl" };
char options[0x100];
sprintf(options, "-w -Werror -DBLOCKS_PER_ITEM=%u", BLOCKS_PER_ITEM);
cl_program program = ocl_build_from_sources(sizeof(source_names) / sizeof(char *),
source_names, context, device_id, options);
// create buffer and upload data
cl_mem mem_key = OCL_ASSERT2(clCreateBuffer(context, CL_MEM_READ_ONLY, 16, NULL, &err));
cl_mem mem_in = OCL_ASSERT2(clCreateBuffer(context, CL_MEM_READ_ONLY, BUF_SIZE, NULL, &err));
cl_mem mem_out = OCL_ASSERT2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, BUF_SIZE, NULL, &err));
OCL_ASSERT(clEnqueueWriteBuffer(command_queue, mem_key, CL_TRUE, 0, 16, key, 0, NULL, NULL));
get_hp_time(&t0);
OCL_ASSERT(clEnqueueWriteBuffer(command_queue, mem_in, CL_TRUE, 0, BUF_SIZE, buf_in, 0, NULL, NULL));
get_hp_time(&t1); td = hp_time_diff(&t0, &t1);
printf("%d microseconds for data upload, %.2f MB/s\n", (int)td, BUF_SIZE * 1.0f / td);
// SHA1_16 test
const char * test_name = "sha1_16_test";
cl_kernel kernel = OCL_ASSERT2(clCreateKernel(program, test_name, &err));
OCL_ASSERT(clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem_in));
OCL_ASSERT(clSetKernelArg(kernel, 1, sizeof(cl_mem), &mem_out));
ocl_test_run_and_read(test_name, kernel, device_id, command_queue, mem_out, buf_out);
get_hp_time(&t0);
for (unsigned offset = 0; offset < BUF_SIZE; offset += BLOCK_SIZE) {
sha1_16(buf_in + offset, buf_verify + offset);
}
get_hp_time(&t1); td = hp_time_diff(&t0, &t1);
printf("%d microseconds for C(single thread), %.2f MB/s\n", (int)td, BUF_SIZE * 1.0f / td);
verify(test_name, buf_in, buf_out, buf_verify);
// AES 128 ECB test
test_name = "aes_128_ecb_test";
kernel = OCL_ASSERT2(clCreateKernel(program, test_name, &err));
OCL_ASSERT(clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem_key));
OCL_ASSERT(clSetKernelArg(kernel, 1, sizeof(cl_mem), &mem_in));
OCL_ASSERT(clSetKernelArg(kernel, 2, sizeof(cl_mem), &mem_out));
ocl_test_run_and_read(test_name, kernel, device_id, command_queue, mem_out, buf_out);
/*
if(test_case == TEST_AES_128_ECB){
dump_to_file("r:/test_aes_in.bin", buf_in, io_buf_len);
dump_to_file("r:/test_aes_out.bin", buf_out, io_buf_len);
}
*/
get_hp_time(&t0);
for (unsigned offset = 0; offset < BUF_SIZE; offset += BLOCK_SIZE) {
// setting the same key over and over is stupid
// yet we still do it to make the results comparable
aes_set_key_enc_128(aes_rk, key);
aes_encrypt_128(aes_rk, buf_in + offset, buf_verify + offset);
}
get_hp_time(&t1); td = hp_time_diff(&t0, &t1);
printf("%d microseconds for C(single thread), %.2f MB/s\n", (int)td, BUF_SIZE * 1.0f / td);
verify(test_name, buf_in, buf_out, buf_verify);
// cleanup
free(buf_in); free(buf_out); free(buf_verify);
clReleaseMemObject(mem_in);
clReleaseMemObject(mem_out);
clReleaseMemObject(mem_key);
clReleaseProgram(program);
clReleaseCommandQueue(command_queue);
clReleaseContext(context);
}

View File

@ -4,6 +4,7 @@
#include <malloc.h>
#include <CL/cl_ext.h>
#include "ocl.h"
#include "utils.h"
#define STATIC_ASSERT(c) static_assert(c, #c)
STATIC_ASSERT(sizeof(char) == sizeof(cl_char));
@ -192,3 +193,42 @@ void ocl_get_device(cl_platform_id *p_platform_id, cl_device_id *p_device_id) {
*p_device_id = NULL;
}
}
cl_program ocl_build_from_sources(
unsigned num_sources, const char *source_names[],
cl_context context, cl_device_id device_id, const char * options)
{
TimeHP t0, t1;
cl_int err;
// read sources
char **sources = malloc(sizeof(char*) * num_sources);
size_t *source_sizes = malloc(sizeof(size_t) * num_sources);
read_files(num_sources, source_names, sources, source_sizes);
// compile
get_hp_time(&t0);
// WTF? GCC complains if I pass char ** to a function expecting const char **?
cl_program program = OCL_ASSERT2(clCreateProgramWithSource(context, num_sources,
(const char **)sources, source_sizes, &err));
// printf("compiler options: %s\n", options);
err = clBuildProgram(program, 0, NULL, options, NULL, NULL);
get_hp_time(&t1);
printf("%d microseconds for compile\n", (int)hp_time_diff(&t0, &t1));
if (err != CL_SUCCESS) {
fprintf(stderr, "failed to build program, error: %s, build log:\n", ocl_err_msg(err));
size_t len;
clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &len);
char *buf_log = malloc(len + 1);
clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, len, buf_log, NULL);
buf_log[len] = '\0';
fprintf(stderr, "%s\n", buf_log);
free(buf_log);
exit(err);
}
for (unsigned i = 0; i < num_sources; ++i) {
free(sources[i]);
}
free(sources);
free(source_sizes);
return program;
}

68
utils.c
View File

@ -3,8 +3,15 @@
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <immintrin.h>
#include "utils.h"
#ifdef __GNUC__
#include <cpuid.h>
#elif _MSC_VER
#include <intrin.h>
#endif
int htoi(char a){
if(a >= '0' && a <= '9'){
return a - '0';
@ -42,7 +49,7 @@ int hex2bytes(unsigned char *out, unsigned byte_len, const char *in, int critica
#endif
static char hexdump_buf[HEXDUMP_BUF_SIZE];
// CAUTION, this always assume you have a buffer big enough
// CAUTION, this always assume the buffer is big enough
const char *hexdump(const void *b, unsigned l, int space){
const unsigned char *p = (unsigned char*)b;
char *out = hexdump_buf;
@ -80,4 +87,61 @@ long long hp_time_diff(struct timeval *pt0, struct timeval *pt1) {
return diff;
}
#endif
#endif
// CAUTION: caller is responsible to free the buf
char * read_file(const char *file_name, size_t *p_size) {
FILE * f = fopen(file_name, "rb");
if (f == NULL) {
fprintf(stderr, "can't read file: %s", file_name);
exit(-1);
}
fseek(f, 0, SEEK_END);
*p_size = ftell(f);
char * buf = malloc(*p_size);
fseek(f, 0, SEEK_SET);
fread(buf, *p_size, 1, f);
fclose(f);
return buf;
}
void read_files(unsigned num_files, const char *file_names[], char *ptrs[], size_t sizes[]) {
for (unsigned i = 0; i < num_files; ++i) {
ptrs[i] = read_file(file_names[i], &sizes[i]);
}
}
void dump_to_file(const char *file_name, const void *buf, size_t len) {
FILE *f = fopen(file_name, "wb");
if (f == NULL) {
fprintf(stderr, "can't open file to write: %s\n", file_name);
return;
}
fwrite(buf, len, 1, f);
fclose(f);
}
int cpu_has_rdrand() {
#if __GNUC__
unsigned a = 0, b = 0, c = 0, d = 0;
__get_cpuid(1, &a, &b, &c, &d);
return c & bit_RDRND;
#elif _MSC_VER
int regs[4];
__cpuid(regs, 1);
return regs[2] & (1<<30);
#else
// ICL only?
return _may_i_use_cpu_feature(_FEATURE_RDRND);
#endif
}
// input must be multiple of uint64_t
int rdrand_fill(unsigned long long *p, size_t size) {
unsigned long long *p_end = p + size;
int success = 1;
while (p < p_end) {
success &= _rdrand64_step(p++);
}
return success;
}

18
utils.h
View File

@ -5,19 +5,29 @@
#ifdef _WIN32
#include <Windows.h>
typedef LARGE_INTEGER HP_Time;
typedef LARGE_INTEGER TimeHP;
#define get_hp_time QueryPerformanceCounter
#else
#include <sys/time.h>
typedef struct timeval HP_Time;
void get_hp_time(HP_Time *pt);
typedef struct timeval TimeHP;
void get_hp_time(TimeHP *pt);
#endif
long long hp_time_diff(HP_Time *pt0, HP_Time *pt1);
long long hp_time_diff(TimeHP *pt0, TimeHP *pt1);
int hex2bytes(unsigned char *out, unsigned byte_len, const char *in, int critical);
const char * hexdump(const void *a, unsigned l, int space);
char * read_file(const char *file_name, size_t *p_size);
void read_files(unsigned num_files, const char *file_names[], char *ptrs[], size_t sizes[]);
void dump_to_file(const char *file_name, const void *buf, size_t len);
int cpu_has_rdrand();
int rdrand_fill(unsigned long long *p, size_t size);