reduced work size to make nvidia runtime happy

give nvidia GPU more weight
This commit is contained in:
JimmyZ 2018-05-29 10:39:26 +08:00
parent 5f7e13a770
commit 687acab0c8
3 changed files with 9 additions and 6 deletions

View File

@ -2,10 +2,10 @@
// more about this: https://github.com/Jimmy-Z/TWLbf/blob/master/dsi.c
__constant static const u64 DSi_KEY_Y[2] =
{0xbd4dc4d30ab9dc76ull, 0xe1a00005202ddd1dull};
{0xbd4dc4d30ab9dc76ul, 0xe1a00005202ddd1dul};
__constant static const u64 DSi_KEY_MAGIC[2] =
{0x2a680f5f1a4f3e79ull, 0xfffefb4e29590258ull};
{0x2a680f5f1a4f3e79ul, 0xfffefb4e29590258ul};
// CAUTION this one doesn't work in-place
inline void byte_reverse_16(u8 *out, const u8 *in){

View File

@ -127,7 +127,8 @@ int ocl_brute_console_id(const cl_uchar *console_id, const cl_uchar *emmc_cid,
// I wish we could use 1e10 in C; counting zeros is hard on the eyes
total = from_bcd(1ull << 40);
// work-item variations on the lower bits per enqueue: 8 + 1 digits, including the known digit
group_bits = 36;
// reduced from 36 to 28 to make nvidia runtime happy
group_bits = 28;
// work items per enqueue, don't count the known digit here
num_items = from_bcd(1ull << (group_bits - 4));
// the bits between the template bits and the group bits are the loop bits
@ -347,7 +348,7 @@ int ocl_brute_msky(const cl_uint *msky, const cl_uint *ver)
OCL_ASSERT(clEnqueueWriteBuffer(command_queue, mem_out, CL_TRUE, 0, sizeof(cl_uint), &out, 0, NULL, NULL));
unsigned brute_bits = 32;
unsigned group_bits = 28;
unsigned group_bits = 20;
unsigned loop_bits = brute_bits - group_bits;
unsigned loops = 1ull << loop_bits;
size_t num_items = 1ull << group_bits;
@ -440,7 +441,7 @@ int ocl_brute_lfcs(cl_uint lfcs_template, cl_ushort newflag, const cl_uint *ver)
OCL_ASSERT(clEnqueueWriteBuffer(command_queue, mem_out, CL_TRUE, 0, sizeof(cl_uint), &out, 0, NULL, NULL));
unsigned brute_bits = 32;
unsigned group_bits = 28;
unsigned group_bits = 20;
unsigned loop_bits = brute_bits - group_bits;
unsigned loops = 1ull << loop_bits;
size_t num_items = 1ull << group_bits;

View File

@ -182,8 +182,10 @@ void ocl_get_device(cl_platform_id *p_platform_id, cl_device_id *p_device_id) {
&& devices[j].c_avail == CL_TRUE){
cl_ulong cap = 1ull * devices[j].max_compute_units * devices[j].freq;
// unfortunately that metric is not comparable between different vendors
if (strstr((const char*)devices[j].name, "Intel") == 0) {
if (strstr((const char*)devices[j].vendor, "Advanced Micro Devices") != 0) {
cap *= 64;
} else if(strstr((const char*)devices[j].vendor, "NVIDIA") != 0) {
cap *= 128;
}
if (cap > maximum) {
maximum = cap;