diff --git a/arch/generic/crc32_braid_c.c b/arch/generic/crc32_braid_c.c index 999eed4a..6947cfd8 100644 --- a/arch/generic/crc32_braid_c.c +++ b/arch/generic/crc32_braid_c.c @@ -12,83 +12,84 @@ #include "crc32_braid_tbl.h" /* - A CRC of a message is computed on N braids of words in the message, where - each word consists of W bytes (4 or 8). If N is 3, for example, then three - running sparse CRCs are calculated respectively on each braid, at these + A CRC of a message is computed on BRAID_N braids of words in the message, where + each word consists of BRAID_W bytes (4 or 8). If BRAID_N is 3, for example, then + three running sparse CRCs are calculated respectively on each braid, at these indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ... - This is done starting at a word boundary, and continues until as many blocks - of N * W bytes as are available have been processed. The results are combined - into a single CRC at the end. For this code, N must be in the range 1..6 and - W must be 4 or 8. The upper limit on N can be increased if desired by adding - more #if blocks, extending the patterns apparent in the code. In addition, - crc32 tables would need to be regenerated, if the maximum N value is increased. + This is done starting at a word boundary, and continues until as many blocks of + BRAID_N * BRAID_W bytes as are available have been processed. The results are + combined into a single CRC at the end. For this code, BRAID_N must be in the + range 1..6 and BRAID_W must be 4 or 8. The upper limit on BRAID_N can be increased + if desired by adding more #if blocks, extending the patterns apparent in the code. + In addition, crc32 tables would need to be regenerated, if the maximum BRAID_N + value is increased. - N and W are chosen empirically by benchmarking the execution time on a given - processor. The choices for N and W below were based on testing on Intel Kaby - Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC POWER9, and MIPS64 - Octeon II processors. The Intel, AMD, and ARM processors were all fastest - with N=5, W=8. The Sparc, PowerPC, and MIPS64 were all fastest at N=5, W=4. + BRAID_N and BRAID_W are chosen empirically by benchmarking the execution time + on a given processor. The choices for BRAID_N and BRAID_W below were based on + testing on Intel Kaby Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC + POWER9, and MIPS64 Octeon II processors. + The Intel, AMD, and ARM processors were all fastest with BRAID_N=5, BRAID_W=8. + The Sparc, PowerPC, and MIPS64 were all fastest at BRAID_N=5, BRAID_W=4. They were all tested with either gcc or clang, all using the -O3 optimization level. Your mileage may vary. */ /* ========================================================================= */ -#ifdef W +#ifdef BRAID_W /* - Return the CRC of the W bytes in the word_t data, taking the + Return the CRC of the BRAID_W bytes in the word_t data, taking the least-significant byte of the word as the first byte of data, without any pre or post conditioning. This is used to combine the CRCs of each braid. */ -#if BYTE_ORDER == LITTLE_ENDIAN +# if BYTE_ORDER == LITTLE_ENDIAN static uint32_t crc_word(z_word_t data) { int k; - for (k = 0; k < W; k++) + for (k = 0; k < BRAID_W; k++) data = (data >> 8) ^ crc_table[data & 0xff]; return (uint32_t)data; } -#elif BYTE_ORDER == BIG_ENDIAN +# elif BYTE_ORDER == BIG_ENDIAN static z_word_t crc_word(z_word_t data) { int k; - for (k = 0; k < W; k++) + for (k = 0; k < BRAID_W; k++) data = (data << 8) ^ - crc_big_table[(data >> ((W - 1) << 3)) & 0xff]; + crc_big_table[(data >> ((BRAID_W - 1) << 3)) & 0xff]; return data; } -#endif /* BYTE_ORDER */ - -#endif /* W */ +# endif /* BYTE_ORDER */ +#endif /* BRAID_W */ /* ========================================================================= */ Z_INTERNAL uint32_t crc32_braid_internal(uint32_t c, const uint8_t *buf, size_t len) { -#ifdef W +#ifdef BRAID_W /* If provided enough bytes, do a braided CRC calculation. */ - if (len >= N * W + W - 1) { + if (len >= BRAID_N * BRAID_W + BRAID_W - 1) { size_t blks; z_word_t const *words; int k; /* Compute the CRC up to a z_word_t boundary. */ - while (len && ((uintptr_t)buf & (W - 1)) != 0) { + while (len && ((uintptr_t)buf & (BRAID_W - 1)) != 0) { len--; DO1; } - /* Compute the CRC on as many N z_word_t blocks as are available. */ - blks = len / (N * W); - len -= blks * N * W; + /* Compute the CRC on as many BRAID_N z_word_t blocks as are available. */ + blks = len / (BRAID_N * BRAID_W); + len -= blks * BRAID_N * BRAID_W; words = (z_word_t const *)buf; z_word_t crc0, word0, comb; -#if N > 1 +#if BRAID_N > 1 z_word_t crc1, word1; -#if N > 2 +#if BRAID_N > 2 z_word_t crc2, word2; -#if N > 3 +#if BRAID_N > 3 z_word_t crc3, word3; -#if N > 4 +#if BRAID_N > 4 z_word_t crc4, word4; -#if N > 5 +#if BRAID_N > 5 z_word_t crc5, word5; #endif #endif @@ -97,15 +98,15 @@ Z_INTERNAL uint32_t crc32_braid_internal(uint32_t c, const uint8_t *buf, size_t #endif /* Initialize the CRC for each braid. */ crc0 = ZSWAPWORD(c); -#if N > 1 +#if BRAID_N > 1 crc1 = 0; -#if N > 2 +#if BRAID_N > 2 crc2 = 0; -#if N > 3 +#if BRAID_N > 3 crc3 = 0; -#if N > 4 +#if BRAID_N > 4 crc4 = 0; -#if N > 5 +#if BRAID_N > 5 crc5 = 0; #endif #endif @@ -116,51 +117,51 @@ Z_INTERNAL uint32_t crc32_braid_internal(uint32_t c, const uint8_t *buf, size_t while (--blks) { /* Load the word for each braid into registers. */ word0 = crc0 ^ words[0]; -#if N > 1 +#if BRAID_N > 1 word1 = crc1 ^ words[1]; -#if N > 2 +#if BRAID_N > 2 word2 = crc2 ^ words[2]; -#if N > 3 +#if BRAID_N > 3 word3 = crc3 ^ words[3]; -#if N > 4 +#if BRAID_N > 4 word4 = crc4 ^ words[4]; -#if N > 5 +#if BRAID_N > 5 word5 = crc5 ^ words[5]; #endif #endif #endif #endif #endif - words += N; + words += BRAID_N; /* Compute and update the CRC for each word. The loop should get unrolled. */ crc0 = BRAID_TABLE[0][word0 & 0xff]; -#if N > 1 +#if BRAID_N > 1 crc1 = BRAID_TABLE[0][word1 & 0xff]; -#if N > 2 +#if BRAID_N > 2 crc2 = BRAID_TABLE[0][word2 & 0xff]; -#if N > 3 +#if BRAID_N > 3 crc3 = BRAID_TABLE[0][word3 & 0xff]; -#if N > 4 +#if BRAID_N > 4 crc4 = BRAID_TABLE[0][word4 & 0xff]; -#if N > 5 +#if BRAID_N > 5 crc5 = BRAID_TABLE[0][word5 & 0xff]; #endif #endif #endif #endif #endif - for (k = 1; k < W; k++) { + for (k = 1; k < BRAID_W; k++) { crc0 ^= BRAID_TABLE[k][(word0 >> (k << 3)) & 0xff]; -#if N > 1 +#if BRAID_N > 1 crc1 ^= BRAID_TABLE[k][(word1 >> (k << 3)) & 0xff]; -#if N > 2 +#if BRAID_N > 2 crc2 ^= BRAID_TABLE[k][(word2 >> (k << 3)) & 0xff]; -#if N > 3 +#if BRAID_N > 3 crc3 ^= BRAID_TABLE[k][(word3 >> (k << 3)) & 0xff]; -#if N > 4 +#if BRAID_N > 4 crc4 ^= BRAID_TABLE[k][(word4 >> (k << 3)) & 0xff]; -#if N > 5 +#if BRAID_N > 5 crc5 ^= BRAID_TABLE[k][(word5 >> (k << 3)) & 0xff]; #endif #endif @@ -170,24 +171,24 @@ Z_INTERNAL uint32_t crc32_braid_internal(uint32_t c, const uint8_t *buf, size_t } } - /* Process the last block, combining the CRCs of the N braids at the same time. */ + /* Process the last block, combining the CRCs of the BRAID_N braids at the same time. */ comb = crc_word(crc0 ^ words[0]); -#if N > 1 +#if BRAID_N > 1 comb = crc_word(crc1 ^ words[1] ^ comb); -#if N > 2 +#if BRAID_N > 2 comb = crc_word(crc2 ^ words[2] ^ comb); -#if N > 3 +#if BRAID_N > 3 comb = crc_word(crc3 ^ words[3] ^ comb); -#if N > 4 +#if BRAID_N > 4 comb = crc_word(crc4 ^ words[4] ^ comb); -#if N > 5 +#if BRAID_N > 5 comb = crc_word(crc5 ^ words[5] ^ comb); #endif #endif #endif #endif #endif - words += N; + words += BRAID_N; Assert(comb <= UINT32_MAX, "comb should fit in uint32_t"); c = (uint32_t)ZSWAPWORD(comb); @@ -195,7 +196,7 @@ Z_INTERNAL uint32_t crc32_braid_internal(uint32_t c, const uint8_t *buf, size_t buf = (const unsigned char *)words; } -#endif /* W */ +#endif /* BRAID_W */ /* Complete the computation of the CRC on any remaining bytes. */ while (len >= 8) { @@ -211,7 +212,7 @@ Z_INTERNAL uint32_t crc32_braid_internal(uint32_t c, const uint8_t *buf, size_t return c; } -uint32_t PREFIX(crc32_braid)(uint32_t c, const uint8_t *buf, size_t len) { +Z_INTERNAL uint32_t crc32_braid(uint32_t c, const uint8_t *buf, size_t len) { c = (~c) & 0xffffffff; c = crc32_braid_internal(c, buf, len); diff --git a/arch/generic/crc32_c.c b/arch/generic/crc32_c.c index 88b2d9f6..a17dff37 100644 --- a/arch/generic/crc32_c.c +++ b/arch/generic/crc32_c.c @@ -25,7 +25,7 @@ uint32_t PREFIX(crc32_c)(uint32_t crc, const uint8_t *buf, size_t len) { aligned_len = len - algn_diff; if(aligned_len > CHORBA_LARGE_THRESHOLD) c = crc32_chorba_118960_nondestructive(c, (z_word_t*) aligned_buf, aligned_len); -# if W == 8 +# if BRAID_W == 8 else if (aligned_len > CHORBA_MEDIUM_LOWER_THRESHOLD && aligned_len <= CHORBA_MEDIUM_UPPER_THRESHOLD) c = crc32_chorba_32768_nondestructive(c, (uint64_t*) aligned_buf, aligned_len); else if (aligned_len > CHORBA_SMALL_THRESHOLD_64BIT) diff --git a/crc32_braid_p.h b/crc32_braid_p.h index 003bf919..004cc5cc 100644 --- a/crc32_braid_p.h +++ b/crc32_braid_p.h @@ -3,51 +3,27 @@ #include "zendian.h" -/* Define N */ -#ifdef Z_TESTN -# define N Z_TESTN -#else -# define N 5 -#endif -#if N < 1 || N > 6 -# error N must be in 1..6 -#endif +/* Define BRAID_N, valid range is 1..6 */ +#define BRAID_N 5 -/* - Define W and the associated z_word_t type. If W is not defined, then a - braided calculation is not used, and the associated tables and code are not - compiled. +/* Define BRAID_W and the associated z_word_t type. If BRAID_W is not defined, then a braided + calculation is not used, and the associated tables and code are not compiled. */ -#ifdef Z_TESTW -# if Z_TESTW-1 != -1 -# define W Z_TESTW -# endif +#if defined(__x86_64__) || defined(_M_AMD64) || defined(__aarch64__) || defined(_M_ARM64) || defined(__powerpc64__) +# define BRAID_W 8 + typedef uint64_t z_word_t; #else -# ifndef W -# if defined(__x86_64__) || defined(_M_AMD64) || defined(__aarch64__) || defined(_M_ARM64) || defined(__powerpc64__) -# define W 8 -# else -# define W 4 -# endif -# endif -#endif -#ifdef W -# if W == 8 - typedef uint64_t z_word_t; -# else -# undef W -# define W 4 - typedef uint32_t z_word_t; -# endif +# define BRAID_W 4 + typedef uint32_t z_word_t; #endif #if BYTE_ORDER == LITTLE_ENDIAN # define ZSWAPWORD(word) (word) # define BRAID_TABLE crc_braid_table #elif BYTE_ORDER == BIG_ENDIAN -# if W == 8 +# if BRAID_W == 8 # define ZSWAPWORD(word) ZSWAP64(word) -# elif W == 4 +# elif BRAID_W == 4 # define ZSWAPWORD(word) ZSWAP32(word) # endif # define BRAID_TABLE crc_braid_big_table diff --git a/crc32_braid_tbl.h b/crc32_braid_tbl.h index 84d79a69..5e39a1c4 100644 --- a/crc32_braid_tbl.h +++ b/crc32_braid_tbl.h @@ -59,9 +59,8 @@ static const uint32_t crc_table[] = { 0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d}; -#ifdef W - -#if W == 8 +#ifdef BRAID_W +# if BRAID_W == 8 static const z_word_t crc_big_table[] = { 0x0000000000000000, 0x9630077700000000, 0x2c610eee00000000, @@ -151,7 +150,7 @@ static const z_word_t crc_big_table[] = { 0x37be0bb400000000, 0xa18e0cc300000000, 0x1bdf055a00000000, 0x8def022d00000000}; -#else /* W == 4 */ +# else /* BRAID_W == 4 */ static const z_word_t crc_big_table[] = { 0x00000000, 0x96300777, 0x2c610eee, 0xba510999, 0x19c46d07, @@ -207,13 +206,11 @@ static const z_word_t crc_big_table[] = { 0x021b685d, 0x942b6f2a, 0x37be0bb4, 0xa18e0cc3, 0x1bdf055a, 0x8def022d}; -#endif +# endif +#endif /* BRAID_W */ -#endif /* W */ - -#if N == 1 - -#if W == 8 +#if BRAID_N == 1 +# if BRAID_W == 8 static const uint32_t crc_braid_table[][256] = { {0x00000000, 0xccaa009e, 0x4225077d, 0x8e8f07e3, 0x844a0efa, @@ -1323,7 +1320,7 @@ static const z_word_t crc_braid_big_table[][256] = { 0x0501c4a800000000, 0x9b016e6400000000, 0x7806e1ea00000000, 0xe6064b2600000000}}; -#else /* W == 4 */ +# else /* BRAID_W == 4 */ static const uint32_t crc_braid_table[][256] = { {0x00000000, 0xb8bc6765, 0xaa09c88b, 0x12b5afee, 0x8f629757, @@ -1745,12 +1742,10 @@ static const z_word_t crc_braid_big_table[][256] = { 0xc3f6dbe9, 0xa6916751, 0x1fa9b0cc, 0x7ace0c74, 0x9461b966, 0xf10605de}}; -#endif /* W */ - -#endif /* N == 1 */ -#if N == 2 - -#if W == 8 +# endif /* BRAID_W */ +#endif /* BRAID_N == 1 */ +#if BRAID_N == 2 +# if BRAID_W == 8 static const uint32_t crc_braid_table[][256] = { {0x00000000, 0xae689191, 0x87a02563, 0x29c8b4f2, 0xd4314c87, @@ -2860,7 +2855,7 @@ static const z_word_t crc_braid_big_table[][256] = { 0x258db92400000000, 0xb41cd18a00000000, 0x46a819a300000000, 0xd739710d00000000}}; -#else /* W == 4 */ +# else /* BRAID_W == 4 */ static const uint32_t crc_braid_table[][256] = { {0x00000000, 0xccaa009e, 0x4225077d, 0x8e8f07e3, 0x844a0efa, @@ -3282,12 +3277,10 @@ static const z_word_t crc_braid_big_table[][256] = { 0x8208ab6e, 0x1c0801a2, 0x0501c4a8, 0x9b016e64, 0x7806e1ea, 0xe6064b26}}; -#endif /* W */ - -#endif /* N == 2 */ -#if N == 3 - -#if W == 8 +# endif /* BRAID_W */ +#endif /* BRAID_N == 2 */ +#if BRAID_N == 3 +# if BRAID_W == 8 static const uint32_t crc_braid_table[][256] = { {0x00000000, 0x81256527, 0xd93bcc0f, 0x581ea928, 0x69069e5f, @@ -4397,7 +4390,7 @@ static const z_word_t crc_braid_big_table[][256] = { 0x792cd35100000000, 0x5e49f6d000000000, 0x76e0e88800000000, 0x5185cd0900000000}}; -#else /* W == 4 */ +# else /* BRAID_W == 4 */ static const uint32_t crc_braid_table[][256] = { {0x00000000, 0x9ba54c6f, 0xec3b9e9f, 0x779ed2f0, 0x03063b7f, @@ -4819,12 +4812,10 @@ static const z_word_t crc_braid_big_table[][256] = { 0x7506baae, 0x1a4a1f35, 0x95a38741, 0xfaef22da, 0x0a3dbcad, 0x65711936}}; -#endif /* W */ - -#endif /* N == 3 */ -#if N == 4 - -#if W == 8 +# endif /* BRAID_W */ +#endif /* BRAID_N == 3 */ +#if BRAID_N == 4 +# if BRAID_W == 8 static const uint32_t crc_braid_table[][256] = { {0x00000000, 0xf1da05aa, 0x38c50d15, 0xc91f08bf, 0x718a1a2a, @@ -5934,7 +5925,7 @@ static const z_word_t crc_braid_big_table[][256] = { 0xa951db2a00000000, 0x035401db00000000, 0xbc5c1e1200000000, 0x1659c4e300000000}}; -#else /* W == 4 */ +# else /* BRAID_W == 4 */ static const uint32_t crc_braid_table[][256] = { {0x00000000, 0xae689191, 0x87a02563, 0x29c8b4f2, 0xd4314c87, @@ -6356,12 +6347,10 @@ static const z_word_t crc_braid_big_table[][256] = { 0xc1e42877, 0x507540d9, 0x258db924, 0xb41cd18a, 0x46a819a3, 0xd739710d}}; -#endif /* W */ - -#endif /* N == 4 */ -#if N == 5 - -#if W == 8 +# endif /* BRAID_W */ +#endif /* BRAID_N == 4 */ +#if BRAID_N == 5 +# if BRAID_W == 8 static const uint32_t crc_braid_table[][256] = { {0x00000000, 0xaf449247, 0x85f822cf, 0x2abcb088, 0xd08143df, @@ -7471,7 +7460,7 @@ static const z_word_t crc_braid_big_table[][256] = { 0xedc528c300000000, 0xaa576c6c00000000, 0x22e7d04600000000, 0x657594e900000000}}; -#else /* W == 4 */ +# else /* BRAID_W == 4 */ static const uint32_t crc_braid_table[][256] = { {0x00000000, 0x65673b46, 0xcace768c, 0xafa94dca, 0x4eedeb59, @@ -7893,12 +7882,10 @@ static const z_word_t crc_braid_big_table[][256] = { 0x2abb26f3, 0x6c804196, 0xff260577, 0xb91d6212, 0x7350cbbd, 0x356bacd8}}; -#endif /* W */ - -#endif /* N == 5 */ -#if N == 6 - -#if W == 8 +# endif /* BRAID_W */ +#endif /* BRAID_N == 5 */ +#if BRAID_N == 6 +# if BRAID_W == 8 static const uint32_t crc_braid_table[][256] = { {0x00000000, 0x3db1ecdc, 0x7b63d9b8, 0x46d23564, 0xf6c7b370, @@ -9008,7 +8995,7 @@ static const z_word_t crc_braid_big_table[][256] = { 0xcc95bac300000000, 0x10790bfe00000000, 0x744cd9b800000000, 0xa8a0688500000000}}; -#else /* W == 4 */ +# else /* BRAID_W == 4 */ static const uint32_t crc_braid_table[][256] = { {0x00000000, 0x81256527, 0xd93bcc0f, 0x581ea928, 0x69069e5f, @@ -9430,9 +9417,8 @@ static const z_word_t crc_braid_big_table[][256] = { 0x297eeee1, 0x0e1bcb60, 0x792cd351, 0x5e49f6d0, 0x76e0e888, 0x5185cd09}}; -#endif /* W */ - -#endif /* N == 6 */ +# endif /* BRAID_W */ +#endif /* BRAID_N == 6 */ static const uint32_t x2n_table[] = { 0x40000000, 0x20000000, 0x08000000, 0x00800000, 0x00008000, diff --git a/crc32_c.h b/crc32_c.h index 553bb04d..f27fe6f8 100644 --- a/crc32_c.h +++ b/crc32_c.h @@ -2,4 +2,3 @@ Z_INTERNAL uint32_t crc32_chorba_118960_nondestructive (uint32_t crc, const z_wo Z_INTERNAL uint32_t crc32_chorba_32768_nondestructive (uint32_t crc, const uint64_t* buf, size_t len); Z_INTERNAL uint32_t crc32_chorba_small_nondestructive (uint32_t crc, const uint64_t* buf, size_t len); Z_INTERNAL uint32_t crc32_chorba_small_nondestructive_32bit (uint32_t crc, const uint32_t* buf, size_t len); -Z_INTERNAL uint32_t crc32_braid_internal(uint32_t c, const uint8_t *buf, size_t len); diff --git a/tools/makecrct.c b/tools/makecrct.c index 9e65d249..812954ac 100644 --- a/tools/makecrct.c +++ b/tools/makecrct.c @@ -15,9 +15,9 @@ and writes out the tables for the case that z_word_t is 32 bits. */ -#define W 8 /* Need a 64-bit integer type in order to generate crc32 tables. */ - -#include "crc32_braid_p.h" +#define POLY 0xedb88320 /* p(x) reflected, with x^32 implied */ +#define BRAID_W 8 /* Need a 64-bit integer type in order to generate crc32 tables. */ +typedef uint64_t z_word_t; static uint32_t crc_table[256]; static z_word_t crc_big_table[256]; @@ -156,32 +156,31 @@ static void print_crc_table(void) { printf("};\n\n"); /* print big-endian CRC table for 64-bit z_word_t */ - printf("#ifdef W\n\n"); - printf("#if W == 8\n\n"); + printf("#ifdef BRAID_W\n"); + printf("# if BRAID_W == 8\n\n"); printf("static const z_word_t crc_big_table[] = {\n"); printf(" "); write_table64(crc_big_table, 256); printf("};\n\n"); /* print big-endian CRC table for 32-bit z_word_t */ - printf("#else /* W == 4 */\n\n"); + printf("# else /* BRAID_W == 4 */\n\n"); printf("static const z_word_t crc_big_table[] = {\n"); printf(" "); write_table32hi(crc_big_table, 256); printf("};\n\n"); - printf("#endif\n\n"); - printf("#endif /* W */\n\n"); + printf("# endif\n"); + printf("#endif /* BRAID_W */\n\n"); /* write out braid tables for each value of N */ for (n = 1; n <= 6; n++) { - printf("#if N == %d\n", n); + printf("#if BRAID_N == %d\n", n); /* compute braid tables for this N and 64-bit word_t */ braid(ltl, big, n, 8); /* write out braid tables for 64-bit z_word_t */ - printf("\n"); - printf("#if W == 8\n\n"); + printf("# if BRAID_W == 8\n\n"); printf("static const uint32_t crc_braid_table[][256] = {\n"); for (k = 0; k < 8; k++) { printf(" {"); @@ -202,7 +201,7 @@ static void print_crc_table(void) { /* write out braid tables for 32-bit z_word_t */ printf("\n"); - printf("#else /* W == 4 */\n\n"); + printf("# else /* BRAID_W == 4 */\n\n"); printf("static const uint32_t crc_braid_table[][256] = {\n"); for (k = 0; k < 4; k++) { printf(" {"); @@ -217,9 +216,8 @@ static void print_crc_table(void) { printf("}%s", k < 3 ? ",\n" : ""); } printf("};\n\n"); - printf("#endif /* W */\n\n"); - - printf("#endif /* N == %d */\n", n); + printf("# endif /* BRAID_W */\n"); + printf("#endif /* BRAID_N == %d */\n", n); } printf("\n");