mirror of
https://github.com/Jimmy-Z/bfCL.git
synced 2025-06-18 11:05:49 -04:00
216 lines
7.5 KiB
C
216 lines
7.5 KiB
C
|
|
#include <stdio.h>
|
|
#include "utils.h"
|
|
#include "ocl.h"
|
|
#include "crypto.h"
|
|
|
|
#define BLOCK_SIZE 0x10
|
|
#define NUM_BLOCKS (1 << 22)
|
|
#define BLOCKS_PER_ITEM 1
|
|
#define NUM_ITEMS (NUM_BLOCKS / BLOCKS_PER_ITEM)
|
|
#define BUF_SIZE (BLOCK_SIZE * NUM_BLOCKS)
|
|
|
|
void ocl_test_run_and_read(const char * test_name, cl_kernel kernel,
|
|
cl_device_id device_id, cl_command_queue command_queue,
|
|
cl_mem mem_out, cl_uchar *buf_out)
|
|
{
|
|
printf("# %s on %u MB\n", test_name, (unsigned)BUF_SIZE >> 20);
|
|
TimeHP t0, t1; long long td;
|
|
size_t local;
|
|
OCL_ASSERT(clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL));
|
|
// printf("local work size: %u\n", (unsigned)local);
|
|
|
|
size_t num_items = NUM_ITEMS;
|
|
|
|
get_hp_time(&t0);
|
|
// apparently, setting local work size to NULL doesn't affect performance, at least in this kind of work
|
|
OCL_ASSERT(clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &num_items, &local, 0, NULL, NULL));
|
|
// OCL_ASSERT(clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &num_items, NULL, 0, NULL, NULL));
|
|
clFinish(command_queue);
|
|
get_hp_time(&t1); td = hp_time_diff(&t0, &t1);
|
|
printf("%.3f seconds for OpenCL, %.2f MB/s\n", td / 1000000.0, BUF_SIZE * 1.0 / td);
|
|
|
|
get_hp_time(&t0);
|
|
OCL_ASSERT(clEnqueueReadBuffer(command_queue, mem_out, CL_TRUE, 0, BUF_SIZE, buf_out, 0, NULL, NULL));
|
|
get_hp_time(&t1); td = hp_time_diff(&t0, &t1);
|
|
printf("%.3f seconds for data download, %.2f MB/s\n", td / 1000000.0, BUF_SIZE * 1.0 / td);
|
|
|
|
clReleaseKernel(kernel);
|
|
}
|
|
|
|
int verify(const char *test_name, cl_uchar *buf_in, cl_uchar *buf_out, cl_uchar *buf_verify){
|
|
if (memcmp(buf_verify, buf_out, BUF_SIZE)) {
|
|
printf("%s: verification failed\n", test_name);
|
|
unsigned count = 5;
|
|
for (unsigned offset = 0; offset < BUF_SIZE; offset += BLOCK_SIZE) {
|
|
if (memcmp(buf_verify + offset, buf_out + offset, BLOCK_SIZE)) {
|
|
printf("difference @ 0x%08x/0x%08x:\n", offset, (unsigned)NUM_BLOCKS);
|
|
printf("\t%s\n", hexdump(buf_in + offset, BLOCK_SIZE, 1));
|
|
printf("\t%s\n", hexdump(buf_out + offset, BLOCK_SIZE, 1));
|
|
printf("\t%s\n", hexdump(buf_verify + offset, BLOCK_SIZE, 1));
|
|
if (!--count) {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
return 1;
|
|
} else {
|
|
printf("%s: succeed\n", test_name);
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
int ocl_test() {
|
|
TimeHP t0, t1; long long td;
|
|
|
|
cl_int err;
|
|
cl_platform_id platform_id;
|
|
cl_device_id device_id;
|
|
ocl_get_device(&platform_id, &device_id);
|
|
if (platform_id == NULL || device_id == NULL) {
|
|
return -1;
|
|
}
|
|
|
|
cl_uchar *buf_in = malloc(BUF_SIZE);
|
|
cl_uchar *buf_out = malloc(BUF_SIZE);
|
|
cl_uchar *buf_verify = malloc(BUF_SIZE);
|
|
|
|
srand(2501);
|
|
cl_uchar key[16];
|
|
aes_init();
|
|
for (unsigned i = 0; i < 16; ++i) {
|
|
key[i] = rand() & 0xff;
|
|
}
|
|
printf("self-test/benchmark mode\nAES Key: %s\n", hexdump(key, 16, 0));
|
|
get_hp_time(&t0);
|
|
if(cpu_has_rdrand()){
|
|
// ~190 MB/s @ X230, ~200 without the success check
|
|
printf("randomize source buffer using RDRAND\n");
|
|
if (!rdrand_fill((cl_ulong*)buf_in, BUF_SIZE >> 3)) {
|
|
printf("RDRND failed\n");
|
|
exit(-1);
|
|
}
|
|
}else {
|
|
printf("randomize source buffer using AES OFB\n");
|
|
// rand() & 0xff is about ~60 MB/s @ X230
|
|
// it's worse than that AES single thread C, so OFB it is
|
|
// ~240 MB/s, even faster than RDRAND
|
|
unsigned char key_iv[16 * 2];
|
|
for (unsigned i = 0; i < 16 * 2; ++i) {
|
|
key_iv[i] = rand() & 0xff;
|
|
}
|
|
aes_set_key_enc_128(key_iv);
|
|
aes_encrypt_128(key_iv + 16, buf_in);
|
|
unsigned char *p_in = buf_in, *p_out = buf_in + 16,
|
|
*p_end = buf_in + BUF_SIZE;
|
|
while (p_out < p_end) {
|
|
aes_encrypt_128(p_in, p_out);
|
|
p_in = p_out;
|
|
p_out += 16;
|
|
}
|
|
}
|
|
get_hp_time(&t1); td = hp_time_diff(&t0, &t1);
|
|
printf("%.3f seconds for preparing test data, %.2f MB/s\n",
|
|
td / 1000000.0, BUF_SIZE * 1.0 / td);
|
|
|
|
cl_context context = OCL_ASSERT2(clCreateContext(0, 1, &device_id, NULL, NULL, &err));
|
|
cl_command_queue command_queue = OCL_ASSERT2(clCreateCommandQueue(context, device_id, 0, &err));
|
|
|
|
const char *source_names[] = {
|
|
"cl/common.h",
|
|
"cl/sha1_16.cl",
|
|
// "cl/sha256_16.cl",
|
|
"cl/aes_tables.cl",
|
|
"cl/aes_128.cl",
|
|
"cl/kernel_tests.cl" };
|
|
char options[0x100];
|
|
sprintf(options, "-w -Werror -DBLOCKS_PER_ITEM=%u", BLOCKS_PER_ITEM);
|
|
cl_program program = ocl_build_from_sources(sizeof(source_names) / sizeof(char *),
|
|
source_names, context, device_id, options);
|
|
|
|
// create buffer and upload data
|
|
cl_mem mem_key = OCL_ASSERT2(clCreateBuffer(context, CL_MEM_READ_ONLY, 16, NULL, &err));
|
|
cl_mem mem_in = OCL_ASSERT2(clCreateBuffer(context, CL_MEM_READ_ONLY, BUF_SIZE, NULL, &err));
|
|
cl_mem mem_out = OCL_ASSERT2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, BUF_SIZE, NULL, &err));
|
|
|
|
OCL_ASSERT(clEnqueueWriteBuffer(command_queue, mem_key, CL_TRUE, 0, 16, key, 0, NULL, NULL));
|
|
get_hp_time(&t0);
|
|
OCL_ASSERT(clEnqueueWriteBuffer(command_queue, mem_in, CL_TRUE, 0, BUF_SIZE, buf_in, 0, NULL, NULL));
|
|
get_hp_time(&t1); td = hp_time_diff(&t0, &t1);
|
|
printf("%.3f seconds for data upload, %.2f MB/s\n", td / 1000000.0, BUF_SIZE * 1.0 / td);
|
|
|
|
// SHA1_16 test
|
|
const char * test_name = "sha1_16_test";
|
|
|
|
cl_kernel kernel = OCL_ASSERT2(clCreateKernel(program, test_name, &err));
|
|
|
|
OCL_ASSERT(clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem_in));
|
|
OCL_ASSERT(clSetKernelArg(kernel, 1, sizeof(cl_mem), &mem_out));
|
|
|
|
ocl_test_run_and_read(test_name, kernel, device_id, command_queue, mem_out, buf_out);
|
|
|
|
get_hp_time(&t0);
|
|
for (unsigned offset = 0; offset < BUF_SIZE; offset += BLOCK_SIZE) {
|
|
sha1_16(buf_in + offset, buf_verify + offset);
|
|
}
|
|
get_hp_time(&t1); td = hp_time_diff(&t0, &t1);
|
|
printf("%.3f seconds for reference C(single thread), %.2f MB/s\n", td / 1000000.0, BUF_SIZE * 1.0 / td);
|
|
|
|
int succeed = verify(test_name, buf_in, buf_out, buf_verify);
|
|
|
|
// AES encrypt test
|
|
test_name = "aes_enc_128_test";
|
|
|
|
kernel = OCL_ASSERT2(clCreateKernel(program, test_name, &err));
|
|
|
|
OCL_ASSERT(clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem_key));
|
|
OCL_ASSERT(clSetKernelArg(kernel, 1, sizeof(cl_mem), &mem_in));
|
|
OCL_ASSERT(clSetKernelArg(kernel, 2, sizeof(cl_mem), &mem_out));
|
|
|
|
ocl_test_run_and_read(test_name, kernel, device_id, command_queue, mem_out, buf_out);
|
|
|
|
get_hp_time(&t0);
|
|
for (unsigned offset = 0; offset < BUF_SIZE; offset += BLOCK_SIZE) {
|
|
// setting the same key over and over is stupid
|
|
// we do this to make the results comparable
|
|
aes_set_key_enc_128(key);
|
|
aes_encrypt_128(buf_in + offset, buf_verify + offset);
|
|
}
|
|
get_hp_time(&t1); td = hp_time_diff(&t0, &t1);
|
|
printf("%.3f seconds for reference C(single thread), %.2f MB/s\n", td / 1000000.0, BUF_SIZE * 1.0f / td);
|
|
// dump_to_file("r:/test_aes_in.bin", buf_in, BUF_SIZE);
|
|
// dump_to_file("r:/test_aes_out.bin", buf_out, BUF_SIZE);
|
|
// dump_to_file("r:/test_aes_verify.bin", buf_verify, BUF_SIZE);
|
|
|
|
succeed |= verify(test_name, buf_in, buf_out, buf_verify);
|
|
|
|
// AES decrypt test
|
|
test_name = "aes_dec_128_test";
|
|
// use the encrypt output as input
|
|
OCL_ASSERT(clEnqueueWriteBuffer(command_queue, mem_in, CL_TRUE, 0, BUF_SIZE, buf_out, 0, NULL, NULL));
|
|
|
|
kernel = OCL_ASSERT2(clCreateKernel(program, test_name, &err));
|
|
|
|
OCL_ASSERT(clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem_key));
|
|
OCL_ASSERT(clSetKernelArg(kernel, 1, sizeof(cl_mem), &mem_in));
|
|
OCL_ASSERT(clSetKernelArg(kernel, 2, sizeof(cl_mem), &mem_out));
|
|
|
|
// read out to buf_verify
|
|
ocl_test_run_and_read(test_name, kernel, device_id, command_queue, mem_out, buf_verify);
|
|
|
|
// verify against buf_in
|
|
succeed |= verify(test_name, buf_out, buf_verify, buf_in);
|
|
|
|
// cleanup
|
|
free(buf_in); free(buf_out); free(buf_verify);
|
|
clReleaseMemObject(mem_in);
|
|
clReleaseMemObject(mem_out);
|
|
clReleaseMemObject(mem_key);
|
|
clReleaseProgram(program);
|
|
clReleaseCommandQueue(command_queue);
|
|
clReleaseContext(context);
|
|
|
|
return succeed;
|
|
}
|
|
|