Clean up crc32_braid.

- Rename N and W to BRAID_N and BRAID_W
- Remove override capabilities for BRAID_N and BRAID_W
- Fix formatting in crc32_braid_tbl.h
- Make makecrct not rely on crc32_braid_p.h
This commit is contained in:
Hans Kristian Rosbach 2025-02-17 19:18:22 +01:00 committed by Hans Kristian Rosbach
parent e8d8049382
commit 8648ffef49
6 changed files with 127 additions and 167 deletions

View File

@ -12,83 +12,84 @@
#include "crc32_braid_tbl.h"
/*
A CRC of a message is computed on N braids of words in the message, where
each word consists of W bytes (4 or 8). If N is 3, for example, then three
running sparse CRCs are calculated respectively on each braid, at these
A CRC of a message is computed on BRAID_N braids of words in the message, where
each word consists of BRAID_W bytes (4 or 8). If BRAID_N is 3, for example, then
three running sparse CRCs are calculated respectively on each braid, at these
indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ...
This is done starting at a word boundary, and continues until as many blocks
of N * W bytes as are available have been processed. The results are combined
into a single CRC at the end. For this code, N must be in the range 1..6 and
W must be 4 or 8. The upper limit on N can be increased if desired by adding
more #if blocks, extending the patterns apparent in the code. In addition,
crc32 tables would need to be regenerated, if the maximum N value is increased.
This is done starting at a word boundary, and continues until as many blocks of
BRAID_N * BRAID_W bytes as are available have been processed. The results are
combined into a single CRC at the end. For this code, BRAID_N must be in the
range 1..6 and BRAID_W must be 4 or 8. The upper limit on BRAID_N can be increased
if desired by adding more #if blocks, extending the patterns apparent in the code.
In addition, the crc32 tables would need to be regenerated if the maximum
BRAID_N value is increased.
N and W are chosen empirically by benchmarking the execution time on a given
processor. The choices for N and W below were based on testing on Intel Kaby
Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC POWER9, and MIPS64
Octeon II processors. The Intel, AMD, and ARM processors were all fastest
with N=5, W=8. The Sparc, PowerPC, and MIPS64 were all fastest at N=5, W=4.
BRAID_N and BRAID_W are chosen empirically by benchmarking the execution time
on a given processor. The choices for BRAID_N and BRAID_W below were based on
testing on Intel Kaby Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC
POWER9, and MIPS64 Octeon II processors.
The Intel, AMD, and ARM processors were all fastest with BRAID_N=5, BRAID_W=8.
The Sparc, PowerPC, and MIPS64 were all fastest at BRAID_N=5, BRAID_W=4.
They were all tested with either gcc or clang, all using the -O3 optimization
level. Your mileage may vary.
*/
/* ========================================================================= */
#ifdef W
#ifdef BRAID_W
/*
Return the CRC of the W bytes in the word_t data, taking the
Return the CRC of the BRAID_W bytes in the z_word_t data, taking the
least-significant byte of the word as the first byte of data, without any pre-
or post-conditioning. This is used to combine the CRCs of each braid.
*/
#if BYTE_ORDER == LITTLE_ENDIAN
# if BYTE_ORDER == LITTLE_ENDIAN
static uint32_t crc_word(z_word_t data) {
int k;
for (k = 0; k < W; k++)
for (k = 0; k < BRAID_W; k++)
data = (data >> 8) ^ crc_table[data & 0xff];
return (uint32_t)data;
}
#elif BYTE_ORDER == BIG_ENDIAN
# elif BYTE_ORDER == BIG_ENDIAN
static z_word_t crc_word(z_word_t data) {
int k;
for (k = 0; k < W; k++)
for (k = 0; k < BRAID_W; k++)
data = (data << 8) ^
crc_big_table[(data >> ((W - 1) << 3)) & 0xff];
crc_big_table[(data >> ((BRAID_W - 1) << 3)) & 0xff];
return data;
}
#endif /* BYTE_ORDER */
#endif /* W */
# endif /* BYTE_ORDER */
#endif /* BRAID_W */
/* ========================================================================= */
Z_INTERNAL uint32_t crc32_braid_internal(uint32_t c, const uint8_t *buf, size_t len) {
#ifdef W
#ifdef BRAID_W
/* If provided enough bytes, do a braided CRC calculation. */
if (len >= N * W + W - 1) {
if (len >= BRAID_N * BRAID_W + BRAID_W - 1) {
size_t blks;
z_word_t const *words;
int k;
/* Compute the CRC up to a z_word_t boundary. */
while (len && ((uintptr_t)buf & (W - 1)) != 0) {
while (len && ((uintptr_t)buf & (BRAID_W - 1)) != 0) {
len--;
DO1;
}
/* Compute the CRC on as many N z_word_t blocks as are available. */
blks = len / (N * W);
len -= blks * N * W;
/* Compute the CRC on as many blocks of BRAID_N z_word_t words as are available. */
blks = len / (BRAID_N * BRAID_W);
len -= blks * BRAID_N * BRAID_W;
words = (z_word_t const *)buf;
z_word_t crc0, word0, comb;
#if N > 1
#if BRAID_N > 1
z_word_t crc1, word1;
#if N > 2
#if BRAID_N > 2
z_word_t crc2, word2;
#if N > 3
#if BRAID_N > 3
z_word_t crc3, word3;
#if N > 4
#if BRAID_N > 4
z_word_t crc4, word4;
#if N > 5
#if BRAID_N > 5
z_word_t crc5, word5;
#endif
#endif
@ -97,15 +98,15 @@ Z_INTERNAL uint32_t crc32_braid_internal(uint32_t c, const uint8_t *buf, size_t
#endif
/* Initialize the CRC for each braid. */
crc0 = ZSWAPWORD(c);
#if N > 1
#if BRAID_N > 1
crc1 = 0;
#if N > 2
#if BRAID_N > 2
crc2 = 0;
#if N > 3
#if BRAID_N > 3
crc3 = 0;
#if N > 4
#if BRAID_N > 4
crc4 = 0;
#if N > 5
#if BRAID_N > 5
crc5 = 0;
#endif
#endif
@ -116,51 +117,51 @@ Z_INTERNAL uint32_t crc32_braid_internal(uint32_t c, const uint8_t *buf, size_t
while (--blks) {
/* Load the word for each braid into registers. */
word0 = crc0 ^ words[0];
#if N > 1
#if BRAID_N > 1
word1 = crc1 ^ words[1];
#if N > 2
#if BRAID_N > 2
word2 = crc2 ^ words[2];
#if N > 3
#if BRAID_N > 3
word3 = crc3 ^ words[3];
#if N > 4
#if BRAID_N > 4
word4 = crc4 ^ words[4];
#if N > 5
#if BRAID_N > 5
word5 = crc5 ^ words[5];
#endif
#endif
#endif
#endif
#endif
words += N;
words += BRAID_N;
/* Compute and update the CRC for each word. The loop should get unrolled. */
crc0 = BRAID_TABLE[0][word0 & 0xff];
#if N > 1
#if BRAID_N > 1
crc1 = BRAID_TABLE[0][word1 & 0xff];
#if N > 2
#if BRAID_N > 2
crc2 = BRAID_TABLE[0][word2 & 0xff];
#if N > 3
#if BRAID_N > 3
crc3 = BRAID_TABLE[0][word3 & 0xff];
#if N > 4
#if BRAID_N > 4
crc4 = BRAID_TABLE[0][word4 & 0xff];
#if N > 5
#if BRAID_N > 5
crc5 = BRAID_TABLE[0][word5 & 0xff];
#endif
#endif
#endif
#endif
#endif
for (k = 1; k < W; k++) {
for (k = 1; k < BRAID_W; k++) {
crc0 ^= BRAID_TABLE[k][(word0 >> (k << 3)) & 0xff];
#if N > 1
#if BRAID_N > 1
crc1 ^= BRAID_TABLE[k][(word1 >> (k << 3)) & 0xff];
#if N > 2
#if BRAID_N > 2
crc2 ^= BRAID_TABLE[k][(word2 >> (k << 3)) & 0xff];
#if N > 3
#if BRAID_N > 3
crc3 ^= BRAID_TABLE[k][(word3 >> (k << 3)) & 0xff];
#if N > 4
#if BRAID_N > 4
crc4 ^= BRAID_TABLE[k][(word4 >> (k << 3)) & 0xff];
#if N > 5
#if BRAID_N > 5
crc5 ^= BRAID_TABLE[k][(word5 >> (k << 3)) & 0xff];
#endif
#endif
@ -170,24 +171,24 @@ Z_INTERNAL uint32_t crc32_braid_internal(uint32_t c, const uint8_t *buf, size_t
}
}
/* Process the last block, combining the CRCs of the N braids at the same time. */
/* Process the last block, combining the CRCs of the BRAID_N braids at the same time. */
comb = crc_word(crc0 ^ words[0]);
#if N > 1
#if BRAID_N > 1
comb = crc_word(crc1 ^ words[1] ^ comb);
#if N > 2
#if BRAID_N > 2
comb = crc_word(crc2 ^ words[2] ^ comb);
#if N > 3
#if BRAID_N > 3
comb = crc_word(crc3 ^ words[3] ^ comb);
#if N > 4
#if BRAID_N > 4
comb = crc_word(crc4 ^ words[4] ^ comb);
#if N > 5
#if BRAID_N > 5
comb = crc_word(crc5 ^ words[5] ^ comb);
#endif
#endif
#endif
#endif
#endif
words += N;
words += BRAID_N;
Assert(comb <= UINT32_MAX, "comb should fit in uint32_t");
c = (uint32_t)ZSWAPWORD(comb);
@ -195,7 +196,7 @@ Z_INTERNAL uint32_t crc32_braid_internal(uint32_t c, const uint8_t *buf, size_t
buf = (const unsigned char *)words;
}
#endif /* W */
#endif /* BRAID_W */
/* Complete the computation of the CRC on any remaining bytes. */
while (len >= 8) {
@ -211,7 +212,7 @@ Z_INTERNAL uint32_t crc32_braid_internal(uint32_t c, const uint8_t *buf, size_t
return c;
}
uint32_t PREFIX(crc32_braid)(uint32_t c, const uint8_t *buf, size_t len) {
Z_INTERNAL uint32_t crc32_braid(uint32_t c, const uint8_t *buf, size_t len) {
c = (~c) & 0xffffffff;
c = crc32_braid_internal(c, buf, len);

View File

@ -25,7 +25,7 @@ uint32_t PREFIX(crc32_c)(uint32_t crc, const uint8_t *buf, size_t len) {
aligned_len = len - algn_diff;
if(aligned_len > CHORBA_LARGE_THRESHOLD)
c = crc32_chorba_118960_nondestructive(c, (z_word_t*) aligned_buf, aligned_len);
# if W == 8
# if BRAID_W == 8
else if (aligned_len > CHORBA_MEDIUM_LOWER_THRESHOLD && aligned_len <= CHORBA_MEDIUM_UPPER_THRESHOLD)
c = crc32_chorba_32768_nondestructive(c, (uint64_t*) aligned_buf, aligned_len);
else if (aligned_len > CHORBA_SMALL_THRESHOLD_64BIT)

View File

@ -3,51 +3,27 @@
#include "zendian.h"
/* Define N */
#ifdef Z_TESTN
# define N Z_TESTN
#else
# define N 5
#endif
#if N < 1 || N > 6
# error N must be in 1..6
#endif
/* Define BRAID_N, valid range is 1..6 */
#define BRAID_N 5
/*
Define W and the associated z_word_t type. If W is not defined, then a
braided calculation is not used, and the associated tables and code are not
compiled.
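/* Define BRAID_W and the associated z_word_t type: 8 bytes (uint64_t) on the
64-bit targets listed below, 4 bytes (uint32_t) on everything else. If BRAID_W
is not defined, then a braided calculation is not used, and the associated
tables and code are not compiled; as written, both branches below define it.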
*/
#ifdef Z_TESTW
# if Z_TESTW-1 != -1
# define W Z_TESTW
# endif
#if defined(__x86_64__) || defined(_M_AMD64) || defined(__aarch64__) || defined(_M_ARM64) || defined(__powerpc64__)
# define BRAID_W 8
typedef uint64_t z_word_t;
#else
# ifndef W
# if defined(__x86_64__) || defined(_M_AMD64) || defined(__aarch64__) || defined(_M_ARM64) || defined(__powerpc64__)
# define W 8
# else
# define W 4
# endif
# endif
#endif
#ifdef W
# if W == 8
typedef uint64_t z_word_t;
# else
# undef W
# define W 4
typedef uint32_t z_word_t;
# endif
# define BRAID_W 4
typedef uint32_t z_word_t;
#endif
#if BYTE_ORDER == LITTLE_ENDIAN
# define ZSWAPWORD(word) (word)
# define BRAID_TABLE crc_braid_table
#elif BYTE_ORDER == BIG_ENDIAN
# if W == 8
# if BRAID_W == 8
# define ZSWAPWORD(word) ZSWAP64(word)
# elif W == 4
# elif BRAID_W == 4
# define ZSWAPWORD(word) ZSWAP32(word)
# endif
# define BRAID_TABLE crc_braid_big_table

View File

@ -59,9 +59,8 @@ static const uint32_t crc_table[] = {
0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b,
0x2d02ef8d};
#ifdef W
#if W == 8
#ifdef BRAID_W
# if BRAID_W == 8
static const z_word_t crc_big_table[] = {
0x0000000000000000, 0x9630077700000000, 0x2c610eee00000000,
@ -151,7 +150,7 @@ static const z_word_t crc_big_table[] = {
0x37be0bb400000000, 0xa18e0cc300000000, 0x1bdf055a00000000,
0x8def022d00000000};
#else /* W == 4 */
# else /* BRAID_W == 4 */
static const z_word_t crc_big_table[] = {
0x00000000, 0x96300777, 0x2c610eee, 0xba510999, 0x19c46d07,
@ -207,13 +206,11 @@ static const z_word_t crc_big_table[] = {
0x021b685d, 0x942b6f2a, 0x37be0bb4, 0xa18e0cc3, 0x1bdf055a,
0x8def022d};
#endif
# endif
#endif /* BRAID_W */
#endif /* W */
#if N == 1
#if W == 8
#if BRAID_N == 1
# if BRAID_W == 8
static const uint32_t crc_braid_table[][256] = {
{0x00000000, 0xccaa009e, 0x4225077d, 0x8e8f07e3, 0x844a0efa,
@ -1323,7 +1320,7 @@ static const z_word_t crc_braid_big_table[][256] = {
0x0501c4a800000000, 0x9b016e6400000000, 0x7806e1ea00000000,
0xe6064b2600000000}};
#else /* W == 4 */
# else /* BRAID_W == 4 */
static const uint32_t crc_braid_table[][256] = {
{0x00000000, 0xb8bc6765, 0xaa09c88b, 0x12b5afee, 0x8f629757,
@ -1745,12 +1742,10 @@ static const z_word_t crc_braid_big_table[][256] = {
0xc3f6dbe9, 0xa6916751, 0x1fa9b0cc, 0x7ace0c74, 0x9461b966,
0xf10605de}};
#endif /* W */
#endif /* N == 1 */
#if N == 2
#if W == 8
# endif /* BRAID_W */
#endif /* BRAID_N == 1 */
#if BRAID_N == 2
# if BRAID_W == 8
static const uint32_t crc_braid_table[][256] = {
{0x00000000, 0xae689191, 0x87a02563, 0x29c8b4f2, 0xd4314c87,
@ -2860,7 +2855,7 @@ static const z_word_t crc_braid_big_table[][256] = {
0x258db92400000000, 0xb41cd18a00000000, 0x46a819a300000000,
0xd739710d00000000}};
#else /* W == 4 */
# else /* BRAID_W == 4 */
static const uint32_t crc_braid_table[][256] = {
{0x00000000, 0xccaa009e, 0x4225077d, 0x8e8f07e3, 0x844a0efa,
@ -3282,12 +3277,10 @@ static const z_word_t crc_braid_big_table[][256] = {
0x8208ab6e, 0x1c0801a2, 0x0501c4a8, 0x9b016e64, 0x7806e1ea,
0xe6064b26}};
#endif /* W */
#endif /* N == 2 */
#if N == 3
#if W == 8
# endif /* BRAID_W */
#endif /* BRAID_N == 2 */
#if BRAID_N == 3
# if BRAID_W == 8
static const uint32_t crc_braid_table[][256] = {
{0x00000000, 0x81256527, 0xd93bcc0f, 0x581ea928, 0x69069e5f,
@ -4397,7 +4390,7 @@ static const z_word_t crc_braid_big_table[][256] = {
0x792cd35100000000, 0x5e49f6d000000000, 0x76e0e88800000000,
0x5185cd0900000000}};
#else /* W == 4 */
# else /* BRAID_W == 4 */
static const uint32_t crc_braid_table[][256] = {
{0x00000000, 0x9ba54c6f, 0xec3b9e9f, 0x779ed2f0, 0x03063b7f,
@ -4819,12 +4812,10 @@ static const z_word_t crc_braid_big_table[][256] = {
0x7506baae, 0x1a4a1f35, 0x95a38741, 0xfaef22da, 0x0a3dbcad,
0x65711936}};
#endif /* W */
#endif /* N == 3 */
#if N == 4
#if W == 8
# endif /* BRAID_W */
#endif /* BRAID_N == 3 */
#if BRAID_N == 4
# if BRAID_W == 8
static const uint32_t crc_braid_table[][256] = {
{0x00000000, 0xf1da05aa, 0x38c50d15, 0xc91f08bf, 0x718a1a2a,
@ -5934,7 +5925,7 @@ static const z_word_t crc_braid_big_table[][256] = {
0xa951db2a00000000, 0x035401db00000000, 0xbc5c1e1200000000,
0x1659c4e300000000}};
#else /* W == 4 */
# else /* BRAID_W == 4 */
static const uint32_t crc_braid_table[][256] = {
{0x00000000, 0xae689191, 0x87a02563, 0x29c8b4f2, 0xd4314c87,
@ -6356,12 +6347,10 @@ static const z_word_t crc_braid_big_table[][256] = {
0xc1e42877, 0x507540d9, 0x258db924, 0xb41cd18a, 0x46a819a3,
0xd739710d}};
#endif /* W */
#endif /* N == 4 */
#if N == 5
#if W == 8
# endif /* BRAID_W */
#endif /* BRAID_N == 4 */
#if BRAID_N == 5
# if BRAID_W == 8
static const uint32_t crc_braid_table[][256] = {
{0x00000000, 0xaf449247, 0x85f822cf, 0x2abcb088, 0xd08143df,
@ -7471,7 +7460,7 @@ static const z_word_t crc_braid_big_table[][256] = {
0xedc528c300000000, 0xaa576c6c00000000, 0x22e7d04600000000,
0x657594e900000000}};
#else /* W == 4 */
# else /* BRAID_W == 4 */
static const uint32_t crc_braid_table[][256] = {
{0x00000000, 0x65673b46, 0xcace768c, 0xafa94dca, 0x4eedeb59,
@ -7893,12 +7882,10 @@ static const z_word_t crc_braid_big_table[][256] = {
0x2abb26f3, 0x6c804196, 0xff260577, 0xb91d6212, 0x7350cbbd,
0x356bacd8}};
#endif /* W */
#endif /* N == 5 */
#if N == 6
#if W == 8
# endif /* BRAID_W */
#endif /* BRAID_N == 5 */
#if BRAID_N == 6
# if BRAID_W == 8
static const uint32_t crc_braid_table[][256] = {
{0x00000000, 0x3db1ecdc, 0x7b63d9b8, 0x46d23564, 0xf6c7b370,
@ -9008,7 +8995,7 @@ static const z_word_t crc_braid_big_table[][256] = {
0xcc95bac300000000, 0x10790bfe00000000, 0x744cd9b800000000,
0xa8a0688500000000}};
#else /* W == 4 */
# else /* BRAID_W == 4 */
static const uint32_t crc_braid_table[][256] = {
{0x00000000, 0x81256527, 0xd93bcc0f, 0x581ea928, 0x69069e5f,
@ -9430,9 +9417,8 @@ static const z_word_t crc_braid_big_table[][256] = {
0x297eeee1, 0x0e1bcb60, 0x792cd351, 0x5e49f6d0, 0x76e0e888,
0x5185cd09}};
#endif /* W */
#endif /* N == 6 */
# endif /* BRAID_W */
#endif /* BRAID_N == 6 */
static const uint32_t x2n_table[] = {
0x40000000, 0x20000000, 0x08000000, 0x00800000, 0x00008000,

View File

@ -2,4 +2,3 @@ Z_INTERNAL uint32_t crc32_chorba_118960_nondestructive (uint32_t crc, const z_wo
Z_INTERNAL uint32_t crc32_chorba_32768_nondestructive (uint32_t crc, const uint64_t* buf, size_t len);
Z_INTERNAL uint32_t crc32_chorba_small_nondestructive (uint32_t crc, const uint64_t* buf, size_t len);
Z_INTERNAL uint32_t crc32_chorba_small_nondestructive_32bit (uint32_t crc, const uint32_t* buf, size_t len);
Z_INTERNAL uint32_t crc32_braid_internal(uint32_t c, const uint8_t *buf, size_t len);

View File

@ -15,9 +15,9 @@
and writes out the tables for the case that z_word_t is 32 bits.
*/
#define W 8 /* Need a 64-bit integer type in order to generate crc32 tables. */
#include "crc32_braid_p.h"
#define POLY 0xedb88320 /* p(x) reflected, with x^32 implied */
#define BRAID_W 8 /* Need a 64-bit integer type in order to generate crc32 tables. */
typedef uint64_t z_word_t;
static uint32_t crc_table[256];
static z_word_t crc_big_table[256];
@ -156,32 +156,31 @@ static void print_crc_table(void) {
printf("};\n\n");
/* print big-endian CRC table for 64-bit z_word_t */
printf("#ifdef W\n\n");
printf("#if W == 8\n\n");
printf("#ifdef BRAID_W\n");
printf("# if BRAID_W == 8\n\n");
printf("static const z_word_t crc_big_table[] = {\n");
printf(" ");
write_table64(crc_big_table, 256);
printf("};\n\n");
/* print big-endian CRC table for 32-bit z_word_t */
printf("#else /* W == 4 */\n\n");
printf("# else /* BRAID_W == 4 */\n\n");
printf("static const z_word_t crc_big_table[] = {\n");
printf(" ");
write_table32hi(crc_big_table, 256);
printf("};\n\n");
printf("#endif\n\n");
printf("#endif /* W */\n\n");
printf("# endif\n");
printf("#endif /* BRAID_W */\n\n");
/* write out braid tables for each supported value of BRAID_N (1..6) */
for (n = 1; n <= 6; n++) {
printf("#if N == %d\n", n);
printf("#if BRAID_N == %d\n", n);
/* compute braid tables for this BRAID_N value (n) and a 64-bit z_word_t */
braid(ltl, big, n, 8);
/* write out braid tables for 64-bit z_word_t */
printf("\n");
printf("#if W == 8\n\n");
printf("# if BRAID_W == 8\n\n");
printf("static const uint32_t crc_braid_table[][256] = {\n");
for (k = 0; k < 8; k++) {
printf(" {");
@ -202,7 +201,7 @@ static void print_crc_table(void) {
/* write out braid tables for 32-bit z_word_t */
printf("\n");
printf("#else /* W == 4 */\n\n");
printf("# else /* BRAID_W == 4 */\n\n");
printf("static const uint32_t crc_braid_table[][256] = {\n");
for (k = 0; k < 4; k++) {
printf(" {");
@ -217,9 +216,8 @@ static void print_crc_table(void) {
printf("}%s", k < 3 ? ",\n" : "");
}
printf("};\n\n");
printf("#endif /* W */\n\n");
printf("#endif /* N == %d */\n", n);
printf("# endif /* BRAID_W */\n");
printf("#endif /* BRAID_N == %d */\n", n);
}
printf("\n");