Continued cleanup of old UNALIGNED_OK checks

- Remove obsolete checks
- Fix checks that are inconsistent
- Stop compiling compare256/longest_match variants that never get called
- Improve how the generic compare256 functions are handled
- Allow overriding OPTIMAL_CMP

This simplifies the code and avoids carrying a lot of code in the compiled library that can never be executed.
Hans Kristian Rosbach 2024-12-20 23:31:37 +01:00 (committed by Hans Kristian Rosbach)
parent 1aeb2915a0
commit bf05e882b8
15 changed files with 220 additions and 347 deletions
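For context, every compare256 variant touched by this commit shares one contract: given two buffers, it returns how many of their first 256 bytes match. A minimal sketch of that contract (buffer setup is hypothetical; compare256_c is internal to the library):

#include <stdint.h>
#include <string.h>
#include <assert.h>

extern uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1);

void compare256_contract_demo(void) {
    uint8_t a[256], b[256];
    memset(a, 0x55, sizeof(a));
    memcpy(b, a, sizeof(b));
    assert(compare256_c(a, b) == 256);  /* identical buffers: full match */
    b[100] ^= 0xFF;                     /* first difference at byte 100 */
    assert(compare256_c(a, b) == 100);  /* count of matching leading bytes */
}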

CMakeLists.txt

@ -1074,6 +1074,9 @@ set(ZLIB_PUBLIC_HDRS
${CMAKE_CURRENT_BINARY_DIR}/zlib${SUFFIX}.h
)
set(ZLIB_PRIVATE_HDRS
+arch/generic/chunk_permute_table.h
+arch/generic/compare256_p.h
+arch/generic/generic_functions.h
adler32_p.h
chunkset_tpl.h
compare256_rle.h

arch/generic/Makefile.in

@ -40,10 +40,10 @@ chunkset_c.o: $(SRCDIR)/chunkset_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/chunkset_tpl.
chunkset_c.lo: $(SRCDIR)/chunkset_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/chunkset_tpl.h $(SRCTOP)/inffast_tpl.h
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_c.c
-compare256_c.o: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/zmemory.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
+compare256_c.o: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCDIR)/compare256_p.h $(SRCTOP)/zmemory.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_c.c
-compare256_c.lo: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/zmemory.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
+compare256_c.lo: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCDIR)/compare256_p.h $(SRCTOP)/zmemory.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_c.c
crc32_braid_c.o: $(SRCDIR)/crc32_braid_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h

arch/generic/compare256_c.c

@ -4,187 +4,28 @@
 */
#include "zbuild.h"
#include "zmemory.h"
#include "deflate.h"
#include "fallback_builtins.h"
+#include "compare256_p.h"
-/* ALIGNED, byte comparison */
-static inline uint32_t compare256_c_static(const uint8_t *src0, const uint8_t *src1) {
-    uint32_t len = 0;
-    do {
-        if (*src0 != *src1)
-            return len;
-        src0 += 1, src1 += 1, len += 1;
-        if (*src0 != *src1)
-            return len;
-        src0 += 1, src1 += 1, len += 1;
-        if (*src0 != *src1)
-            return len;
-        src0 += 1, src1 += 1, len += 1;
-        if (*src0 != *src1)
-            return len;
-        src0 += 1, src1 += 1, len += 1;
-        if (*src0 != *src1)
-            return len;
-        src0 += 1, src1 += 1, len += 1;
-        if (*src0 != *src1)
-            return len;
-        src0 += 1, src1 += 1, len += 1;
-        if (*src0 != *src1)
-            return len;
-        src0 += 1, src1 += 1, len += 1;
-        if (*src0 != *src1)
-            return len;
-        src0 += 1, src1 += 1, len += 1;
-    } while (len < 256);
-    return 256;
-}
+// Set optimal COMPARE256 function variant
+#if OPTIMAL_CMP == 8
+#  define COMPARE256 compare256_8
+#elif defined(HAVE_BUILTIN_CTZLL)
+#  define COMPARE256 compare256_64
+#elif defined(HAVE_BUILTIN_CTZ)
+#  define COMPARE256 compare256_32
+#else
+#  define COMPARE256 compare256_16
+#endif
Z_INTERNAL uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1) {
-    return compare256_c_static(src0, src1);
+    return COMPARE256(src0, src1);
}
+// Generate longest_match_c
#define LONGEST_MATCH longest_match_c
-#define COMPARE256 compare256_c_static
#include "match_tpl.h"
+// Generate longest_match_slow_c
#define LONGEST_MATCH_SLOW
#define LONGEST_MATCH longest_match_slow_c
-#define COMPARE256 compare256_c_static
#include "match_tpl.h"
-#if OPTIMAL_CMP >= 32
-/* 16-bit unaligned integer comparison */
-static inline uint32_t compare256_16_static(const uint8_t *src0, const uint8_t *src1) {
-    uint32_t len = 0;
-    do {
-        if (zng_memcmp_2(src0, src1) != 0)
-            return len + (*src0 == *src1);
-        src0 += 2, src1 += 2, len += 2;
-        if (zng_memcmp_2(src0, src1) != 0)
-            return len + (*src0 == *src1);
-        src0 += 2, src1 += 2, len += 2;
-        if (zng_memcmp_2(src0, src1) != 0)
-            return len + (*src0 == *src1);
-        src0 += 2, src1 += 2, len += 2;
-        if (zng_memcmp_2(src0, src1) != 0)
-            return len + (*src0 == *src1);
-        src0 += 2, src1 += 2, len += 2;
-    } while (len < 256);
-    return 256;
-}
-Z_INTERNAL uint32_t compare256_16(const uint8_t *src0, const uint8_t *src1) {
-    return compare256_16_static(src0, src1);
-}
-#define LONGEST_MATCH longest_match_16
-#define COMPARE256 compare256_16_static
-#include "match_tpl.h"
-#define LONGEST_MATCH_SLOW
-#define LONGEST_MATCH longest_match_slow_16
-#define COMPARE256 compare256_16_static
-#include "match_tpl.h"
-#ifdef HAVE_BUILTIN_CTZ
-/* 32-bit unaligned integer comparison */
-static inline uint32_t compare256_32_static(const uint8_t *src0, const uint8_t *src1) {
-    uint32_t len = 0;
-    do {
-        uint32_t sv, mv, diff;
-        sv = zng_memread_4(src0);
-        mv = zng_memread_4(src1);
-        diff = sv ^ mv;
-        if (diff) {
-#if BYTE_ORDER == LITTLE_ENDIAN
-            uint32_t match_byte = __builtin_ctz(diff) / 8;
-#else
-            uint32_t match_byte = __builtin_clz(diff) / 8;
-#endif
-            return len + match_byte;
-        }
-        src0 += 4, src1 += 4, len += 4;
-    } while (len < 256);
-    return 256;
-}
-Z_INTERNAL uint32_t compare256_32(const uint8_t *src0, const uint8_t *src1) {
-    return compare256_32_static(src0, src1);
-}
-#define LONGEST_MATCH longest_match_32
-#define COMPARE256 compare256_32_static
-#include "match_tpl.h"
-#define LONGEST_MATCH_SLOW
-#define LONGEST_MATCH longest_match_slow_32
-#define COMPARE256 compare256_32_static
-#include "match_tpl.h"
-#endif
-#if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64
-/* 64-bit integer comparison */
-static inline uint32_t compare256_64_static(const uint8_t *src0, const uint8_t *src1) {
-    uint32_t len = 0;
-    do {
-        uint64_t sv, mv, diff;
-        sv = zng_memread_8(src0);
-        mv = zng_memread_8(src1);
-        diff = sv ^ mv;
-        if (diff) {
-#if BYTE_ORDER == LITTLE_ENDIAN
-            uint64_t match_byte = __builtin_ctzll(diff) / 8;
-#else
-            uint64_t match_byte = __builtin_clzll(diff) / 8;
-#endif
-            return len + (uint32_t)match_byte;
-        }
-        src0 += 8, src1 += 8, len += 8;
-    } while (len < 256);
-    return 256;
-}
-Z_INTERNAL uint32_t compare256_64(const uint8_t *src0, const uint8_t *src1) {
-    return compare256_64_static(src0, src1);
-}
-#define LONGEST_MATCH longest_match_64
-#define COMPARE256 compare256_64_static
-#include "match_tpl.h"
-#define LONGEST_MATCH_SLOW
-#define LONGEST_MATCH longest_match_slow_64
-#define COMPARE256 compare256_64_static
-#include "match_tpl.h"
-#endif
-#endif

arch/generic/compare256_p.h (new file, 123 lines)

@ -0,0 +1,123 @@
/* compare256_p.h -- 256 byte memory comparison with match length return
* Copyright (C) 2020 Nathan Moinvaziri
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zmemory.h"
#include "deflate.h"
#include "fallback_builtins.h"
/* 8-bit integer comparison */
static inline uint32_t compare256_8(const uint8_t *src0, const uint8_t *src1) {
    uint32_t len = 0;

    do {
        if (*src0 != *src1)
            return len;
        src0 += 1, src1 += 1, len += 1;
        if (*src0 != *src1)
            return len;
        src0 += 1, src1 += 1, len += 1;
        if (*src0 != *src1)
            return len;
        src0 += 1, src1 += 1, len += 1;
        if (*src0 != *src1)
            return len;
        src0 += 1, src1 += 1, len += 1;
        if (*src0 != *src1)
            return len;
        src0 += 1, src1 += 1, len += 1;
        if (*src0 != *src1)
            return len;
        src0 += 1, src1 += 1, len += 1;
        if (*src0 != *src1)
            return len;
        src0 += 1, src1 += 1, len += 1;
        if (*src0 != *src1)
            return len;
        src0 += 1, src1 += 1, len += 1;
    } while (len < 256);

    return 256;
}

/* 16-bit integer comparison */
static inline uint32_t compare256_16(const uint8_t *src0, const uint8_t *src1) {
    uint32_t len = 0;

    do {
        if (zng_memcmp_2(src0, src1) != 0)
            return len + (*src0 == *src1);
        src0 += 2, src1 += 2, len += 2;
        if (zng_memcmp_2(src0, src1) != 0)
            return len + (*src0 == *src1);
        src0 += 2, src1 += 2, len += 2;
        if (zng_memcmp_2(src0, src1) != 0)
            return len + (*src0 == *src1);
        src0 += 2, src1 += 2, len += 2;
        if (zng_memcmp_2(src0, src1) != 0)
            return len + (*src0 == *src1);
        src0 += 2, src1 += 2, len += 2;
    } while (len < 256);

    return 256;
}

#ifdef HAVE_BUILTIN_CTZ
/* 32-bit integer comparison */
static inline uint32_t compare256_32(const uint8_t *src0, const uint8_t *src1) {
    uint32_t len = 0;

    do {
        uint32_t sv, mv, diff;

        sv = zng_memread_4(src0);
        mv = zng_memread_4(src1);

        diff = sv ^ mv;
        if (diff) {
#  if BYTE_ORDER == LITTLE_ENDIAN
            uint32_t match_byte = __builtin_ctz(diff) / 8;
#  else
            uint32_t match_byte = __builtin_clz(diff) / 8;
#  endif
            return len + match_byte;
        }

        src0 += 4, src1 += 4, len += 4;
    } while (len < 256);

    return 256;
}
#endif

#ifdef HAVE_BUILTIN_CTZLL
/* 64-bit integer comparison */
static inline uint32_t compare256_64(const uint8_t *src0, const uint8_t *src1) {
    uint32_t len = 0;

    do {
        uint64_t sv, mv, diff;

        sv = zng_memread_8(src0);
        mv = zng_memread_8(src1);

        diff = sv ^ mv;
        if (diff) {
#  if BYTE_ORDER == LITTLE_ENDIAN
            uint64_t match_byte = __builtin_ctzll(diff) / 8;
#  else
            uint64_t match_byte = __builtin_clzll(diff) / 8;
#  endif
            return len + (uint32_t)match_byte;
        }

        src0 += 8, src1 += 8, len += 8;
    } while (len < 256);

    return 256;
}
#endif
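The 32- and 64-bit variants above locate the first mismatching byte by XORing the two words and counting trailing zero bits (little-endian) or leading zero bits (big-endian). A worked little-endian example, assuming GCC/Clang's __builtin_ctz as used in the header:

#include <stdint.h>

/* Bytes in memory: {0x11, 0x22, 0x33, 0x44} vs {0x11, 0x22, 0x33, 0x99}:
   bytes 0..2 match, byte 3 differs. */
uint32_t first_diff_byte_demo(void) {
    uint32_t sv = 0x44332211;                  /* little-endian view of src0 */
    uint32_t mv = 0x99332211;                  /* little-endian view of src1 */
    uint32_t diff = sv ^ mv;                   /* 0xDD000000: low 24 bits clear */
    return (uint32_t)__builtin_ctz(diff) / 8;  /* 24 / 8 = 3, first differing byte */
}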

arch/generic/generic_functions.h

@ -28,15 +28,6 @@ void inflate_fast_c(PREFIX3(stream) *strm, uint32_t start);
uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len);
uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1);
-#if OPTIMAL_CMP >= 32
-uint32_t compare256_16(const uint8_t *src0, const uint8_t *src1);
-#  ifdef HAVE_BUILTIN_CTZ
-uint32_t compare256_32(const uint8_t *src0, const uint8_t *src1);
-#  endif
-#  if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64
-uint32_t compare256_64(const uint8_t *src0, const uint8_t *src1);
-#  endif
-#endif
typedef void (*slide_hash_func)(deflate_state *s);
@ -44,41 +35,6 @@ void slide_hash_c(deflate_state *s);
uint32_t longest_match_c(deflate_state *const s, Pos cur_match);
uint32_t longest_match_slow_c(deflate_state *const s, Pos cur_match);
-#if OPTIMAL_CMP >= 32
-uint32_t longest_match_16(deflate_state *const s, Pos cur_match);
-uint32_t longest_match_slow_16(deflate_state *const s, Pos cur_match);
-#  ifdef HAVE_BUILTIN_CTZ
-uint32_t longest_match_32(deflate_state *const s, Pos cur_match);
-uint32_t longest_match_slow_32(deflate_state *const s, Pos cur_match);
-#  endif
-#  if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64
-uint32_t longest_match_64(deflate_state *const s, Pos cur_match);
-uint32_t longest_match_slow_64(deflate_state *const s, Pos cur_match);
-#  endif
-#endif
-// Select generic implementation for longest_match, longest_match_slow, compare256 functions.
-#if OPTIMAL_CMP >= 32
-#  if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64
-#    define longest_match_generic longest_match_64
-#    define longest_match_slow_generic longest_match_slow_64
-#    define compare256_generic compare256_64
-#  elif defined(HAVE_BUILTIN_CTZ)
-#    define longest_match_generic longest_match_32
-#    define longest_match_slow_generic longest_match_slow_32
-#    define compare256_generic compare256_32
-#  else
-#    define longest_match_generic longest_match_16
-#    define longest_match_slow_generic longest_match_slow_16
-#    define compare256_generic compare256_16
-#  endif
-#else
-#  define longest_match_generic longest_match_c
-#  define longest_match_slow_generic longest_match_slow_c
-#  define compare256_generic compare256_c
-#endif
#ifdef DISABLE_RUNTIME_CPU_DETECTION
// Generic code
@ -93,9 +49,9 @@ uint32_t longest_match_slow_c(deflate_state *const s, Pos cur_match);
# define native_crc32_fold_reset crc32_fold_reset_c
# define native_inflate_fast inflate_fast_c
# define native_slide_hash slide_hash_c
-#  define native_longest_match longest_match_generic
-#  define native_longest_match_slow longest_match_slow_generic
-#  define native_compare256 compare256_generic
+#  define native_longest_match longest_match_c
+#  define native_longest_match_slow longest_match_slow_c
+#  define native_compare256 compare256_c
#endif
#endif

compare256_rle.h

@ -6,12 +6,11 @@
#include "zbuild.h"
#include "zmemory.h"
#include "fallback_builtins.h"
#include "zendian.h"
typedef uint32_t (*compare256_rle_func)(const uint8_t* src0, const uint8_t* src1);
-/* ALIGNED, byte comparison */
-static inline uint32_t compare256_rle_c(const uint8_t *src0, const uint8_t *src1) {
+/* 8-bit integer comparison */
+static inline uint32_t compare256_rle_8(const uint8_t *src0, const uint8_t *src1) {
uint32_t len = 0;
do {
@ -44,8 +43,7 @@ static inline uint32_t compare256_rle_c(const uint8_t *src0, const uint8_t *src1
return 256;
}
-#if OPTIMAL_CMP >= 32
-/* 16-bit unaligned integer comparison */
+/* 16-bit integer comparison */
static inline uint32_t compare256_rle_16(const uint8_t *src0, const uint8_t *src1) {
uint32_t len = 0;
uint16_t src0_cmp;
@ -71,7 +69,7 @@ static inline uint32_t compare256_rle_16(const uint8_t *src0, const uint8_t *src
}
#ifdef HAVE_BUILTIN_CTZ
-/* 32-bit unaligned integer comparison */
+/* 32-bit integer comparison */
static inline uint32_t compare256_rle_32(const uint8_t *src0, const uint8_t *src1) {
uint32_t sv, len = 0;
uint16_t src0_cmp;
@ -99,11 +97,10 @@ static inline uint32_t compare256_rle_32(const uint8_t *src0, const uint8_t *src
return 256;
}
#endif
-#if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64
-/* 64-bit unaligned integer comparison */
+#ifdef HAVE_BUILTIN_CTZLL
+/* 64-bit integer comparison */
static inline uint32_t compare256_rle_64(const uint8_t *src0, const uint8_t *src1) {
uint32_t src0_cmp32, len = 0;
uint16_t src0_cmp;
@ -133,8 +130,4 @@ static inline uint32_t compare256_rle_64(const uint8_t *src0, const uint8_t *src
return 256;
}
#endif
-#endif
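Unlike plain compare256, the RLE variants never advance src0: it supplies a fixed two-byte pattern that is compared against successive positions of src1, as the diff below selects. A simplified sketch of the 16-bit idea (unrolling and the real code's exact tail handling condensed):

#include <stdint.h>
#include <string.h>

static uint32_t compare256_rle_16_sketch(const uint8_t *src0, const uint8_t *src1) {
    uint16_t pattern;
    uint32_t len = 0;
    memcpy(&pattern, src0, 2);             /* repeating two-byte pattern */
    do {
        uint16_t w;
        memcpy(&w, src1, 2);               /* next two window bytes */
        if (w != pattern)
            return len + (*src1 == *src0); /* credit one byte if only the second differs */
        src1 += 2, len += 2;
    } while (len < 256);
    return 256;
}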

deflate_rle.c

@ -5,21 +5,19 @@
*/
#include "zbuild.h"
#include "compare256_rle.h"
#include "deflate.h"
#include "deflate_p.h"
#include "functable.h"
#include "compare256_rle.h"
#if OPTIMAL_CMP >= 32
# if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64
# define compare256_rle compare256_rle_64
# elif defined(HAVE_BUILTIN_CTZ)
# define compare256_rle compare256_rle_32
# else
# define compare256_rle compare256_rle_16
# endif
#if OPTIMAL_CMP == 8
# define compare256_rle compare256_rle_8
#elif defined(HAVE_BUILTIN_CTZLL)
# define compare256_rle compare256_rle_64
#elif defined(HAVE_BUILTIN_CTZ)
# define compare256_rle compare256_rle_32
#else
# define compare256_rle compare256_rle_c
# define compare256_rle compare256_rle_16
#endif
/* ===========================================================================

functable.c

@ -61,9 +61,9 @@ static void init_functable(void) {
ft.crc32_fold_reset = &crc32_fold_reset_c;
ft.inflate_fast = &inflate_fast_c;
ft.slide_hash = &slide_hash_c;
-    ft.longest_match = &longest_match_generic;
-    ft.longest_match_slow = &longest_match_slow_generic;
-    ft.compare256 = &compare256_generic;
+    ft.longest_match = &longest_match_c;
+    ft.longest_match_slow = &longest_match_slow_c;
+    ft.compare256 = &compare256_c;
// Select arch-optimized functions
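The functable pattern above fills every entry with the generic C implementation first; CPU-specific initializers then overwrite the entries they can improve on. A condensed sketch (struct and feature flag are illustrative, not zlib-ng's exact types):

#include <stdint.h>

typedef uint32_t (*compare256_func)(const uint8_t *, const uint8_t *);
extern uint32_t compare256_c(const uint8_t *, const uint8_t *);
extern uint32_t compare256_sse2(const uint8_t *, const uint8_t *);
struct ftable_sketch { compare256_func compare256; };

static void init_ftable_sketch(struct ftable_sketch *ft, int has_sse2) {
    ft->compare256 = &compare256_c;        /* generic default, always valid */
    if (has_sse2)
        ft->compare256 = &compare256_sse2; /* arch override when supported */
}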

match_tpl.h

@ -22,6 +22,9 @@
* IN assertions: cur_match is the head of the hash chain for the current
* string (strstart) and its distance is <= MAX_DIST, and prev_length >=1
* OUT assertion: the match length is not greater than s->lookahead
+ *
+ * The LONGEST_MATCH_SLOW variant spends more time attempting to find longer
+ * matches once a match has already been found.
*/
Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) {
unsigned int strstart = s->strstart;
@ -40,15 +43,8 @@ Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) {
uint32_t chain_length, nice_match, best_len, offset;
uint32_t lookahead = s->lookahead;
Pos match_offset = 0;
-#if OPTIMAL_CMP >= 64
    uint64_t scan_start;
    uint64_t scan_end;
-#elif OPTIMAL_CMP >= 32
-    uint32_t scan_start;
-    uint32_t scan_end;
-#else
-    uint8_t scan_end[8];
-#endif
#define GOTO_NEXT_CHAIN \
if (--chain_length && (cur_match = prev[cur_match & wmask]) > limit) \
@ -64,26 +60,14 @@ Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) {
* to find the next best match length.
*/
offset = best_len-1;
-#if OPTIMAL_CMP >= 32
    if (best_len >= sizeof(uint32_t)) {
        offset -= 2;
-#if OPTIMAL_CMP >= 64
        if (best_len >= sizeof(uint64_t))
            offset -= 4;
-#endif
    }
-#endif
-#if OPTIMAL_CMP >= 64
    scan_start = zng_memread_8(scan);
    scan_end = zng_memread_8(scan+offset);
-#elif OPTIMAL_CMP >= 32
-    scan_start = zng_memread_4(scan);
-    scan_end = zng_memread_4(scan+offset);
-#else
-    scan_end[0] = *(scan+offset);
-    scan_end[1] = *(scan+offset+1);
-#endif
mbase_end = (mbase_start+offset);
/* Do not waste too much time if we already have a good match */
@ -143,7 +127,6 @@ Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) {
* that depend on those values. However the length of the match is limited to the
* lookahead, so the output of deflate is not affected by the uninitialized values.
*/
-#if OPTIMAL_CMP >= 32
if (best_len < sizeof(uint32_t)) {
for (;;) {
if (zng_memcmp_2(mbase_end+cur_match, &scan_end) == 0 &&
@ -151,7 +134,6 @@ Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) {
break;
GOTO_NEXT_CHAIN;
}
-#  if OPTIMAL_CMP >= 64
} else if (best_len >= sizeof(uint64_t)) {
for (;;) {
if (zng_memcmp_8(mbase_end+cur_match, &scan_end) == 0 &&
@ -159,7 +141,6 @@ Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) {
break;
GOTO_NEXT_CHAIN;
}
-#  endif
} else {
for (;;) {
if (zng_memcmp_4(mbase_end+cur_match, &scan_end) == 0 &&
@ -168,14 +149,6 @@ Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) {
GOTO_NEXT_CHAIN;
}
}
-#else
-    for (;;) {
-        if (mbase_end[cur_match] == scan_end[0] && mbase_end[cur_match+1] == scan_end[1] &&
-            mbase_start[cur_match] == scan[0] && mbase_start[cur_match+1] == scan[1])
-            break;
-        GOTO_NEXT_CHAIN;
-    }
-#endif
uint32_t len = COMPARE256(scan+2, mbase_start+cur_match+2) + 2;
Assert(scan+len <= window+(unsigned)(s->window_size-1), "wild scan");
@ -191,24 +164,13 @@ Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) {
return best_len;
offset = best_len-1;
-#if OPTIMAL_CMP >= 32
    if (best_len >= sizeof(uint32_t)) {
        offset -= 2;
-#if OPTIMAL_CMP >= 64
        if (best_len >= sizeof(uint64_t))
            offset -= 4;
-#endif
    }
-#endif
-#if OPTIMAL_CMP >= 64
    scan_end = zng_memread_8(scan+offset);
-#elif OPTIMAL_CMP >= 32
-    scan_end = zng_memread_4(scan+offset);
-#else
-    scan_end[0] = *(scan+offset);
-    scan_end[1] = *(scan+offset+1);
-#endif
#ifdef LONGEST_MATCH_SLOW
/* Look for a better string offset */
@ -286,4 +248,3 @@ break_matching:
#undef LONGEST_MATCH_SLOW
#undef LONGEST_MATCH
-#undef COMPARE256
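For reference, match_tpl.h is a textual template: each compare256 source defines the output names and the comparator, then includes it once per variant. A sketch of the pattern with a hypothetical arch suffix "foo":

#define COMPARE256 compare256_foo            /* comparator the template calls */

#define LONGEST_MATCH longest_match_foo
#include "match_tpl.h"                       /* emits longest_match_foo() */

#define LONGEST_MATCH_SLOW
#define LONGEST_MATCH longest_match_slow_foo
#include "match_tpl.h"                       /* emits longest_match_slow_foo() */

Since this commit the template no longer #undefs COMPARE256, so a single definition per source file covers both instantiations; LONGEST_MATCH and LONGEST_MATCH_SLOW are still reset after each include.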

test/benchmarks/benchmark_compare256.cc

@ -12,6 +12,7 @@ extern "C" {
# include "zutil_p.h"
# include "arch_functions.h"
# include "../test_cpu_features.h"
# include "arch/generic/compare256_p.h"
}
#define MAX_COMPARE_SIZE (256)
@ -60,21 +61,19 @@ public:
} \
BENCHMARK_REGISTER_F(compare256, name)->Range(1, MAX_COMPARE_SIZE);
BENCHMARK_COMPARE256(c, compare256_c, 1);
#ifdef DISABLE_RUNTIME_CPU_DETECTION
BENCHMARK_COMPARE256(native, native_compare256, 1);
#else
-#if BYTE_ORDER == LITTLE_ENDIAN && OPTIMAL_CMP >= 32
+BENCHMARK_COMPARE256(8, compare256_8, 1);
BENCHMARK_COMPARE256(16, compare256_16, 1);
-#  if defined(HAVE_BUILTIN_CTZ)
+#if defined(HAVE_BUILTIN_CTZ)
BENCHMARK_COMPARE256(32, compare256_32, 1);
-#  endif
-#  if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64
-BENCHMARK_COMPARE256(64, compare256_64, 1);
-#  endif
#endif
+#if defined(HAVE_BUILTIN_CTZLL)
+BENCHMARK_COMPARE256(64, compare256_64, 1);
+#endif
#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
BENCHMARK_COMPARE256(sse2, compare256_sse2, test_cpu_features.x86.has_sse2);
#endif

test/benchmarks/benchmark_compare256_rle.cc

@ -59,14 +59,11 @@ public:
} \
BENCHMARK_REGISTER_F(compare256_rle, name)->Range(1, MAX_COMPARE_SIZE);
-BENCHMARK_COMPARE256_RLE(c, compare256_rle_c, 1);
-#if BYTE_ORDER == LITTLE_ENDIAN && OPTIMAL_CMP >= 32
+BENCHMARK_COMPARE256_RLE(8, compare256_rle_8, 1);
BENCHMARK_COMPARE256_RLE(16, compare256_rle_16, 1);
-#  if defined(HAVE_BUILTIN_CTZ)
+#if defined(HAVE_BUILTIN_CTZ)
BENCHMARK_COMPARE256_RLE(32, compare256_rle_32, 1);
-#  endif
-#  if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64
-BENCHMARK_COMPARE256_RLE(64, compare256_rle_64, 1);
-#  endif
#endif
+#if defined(HAVE_BUILTIN_CTZLL)
+BENCHMARK_COMPARE256_RLE(64, compare256_rle_64, 1);
+#endif

test/test_compare256.cc

@ -12,6 +12,7 @@ extern "C" {
# include "zutil.h"
# include "arch_functions.h"
# include "test_cpu_features.h"
# include "arch/generic/compare256_p.h"
}
#include <gtest/gtest.h>
@ -59,20 +60,17 @@ static inline void compare256_match_check(compare256_func compare256) {
compare256_match_check(func); \
}
TEST_COMPARE256(c, compare256_c, 1)
#ifdef DISABLE_RUNTIME_CPU_DETECTION
TEST_COMPARE256(native, native_compare256, 1)
#else
-#if BYTE_ORDER == LITTLE_ENDIAN && OPTIMAL_CMP >= 32
+TEST_COMPARE256(8, compare256_8, 1)
TEST_COMPARE256(16, compare256_16, 1)
-#  if defined(HAVE_BUILTIN_CTZ)
+#if defined(HAVE_BUILTIN_CTZ)
TEST_COMPARE256(32, compare256_32, 1)
-#  endif
-#  if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64
+#endif
+#if defined(HAVE_BUILTIN_CTZLL)
TEST_COMPARE256(64, compare256_64, 1)
-#  endif
#endif
#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)

test/test_compare256_rle.cc

@ -50,14 +50,11 @@ static inline void compare256_rle_match_check(compare256_rle_func compare256_rle
compare256_rle_match_check(func); \
}
-TEST_COMPARE256_RLE(c, compare256_rle_c, 1)
-#if BYTE_ORDER == LITTLE_ENDIAN && OPTIMAL_CMP >= 32
+TEST_COMPARE256_RLE(8, compare256_rle_8, 1)
TEST_COMPARE256_RLE(16, compare256_rle_16, 1)
-#  if defined(HAVE_BUILTIN_CTZ)
+#if defined(HAVE_BUILTIN_CTZ)
TEST_COMPARE256_RLE(32, compare256_rle_32, 1)
-#  endif
-#  if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64
-TEST_COMPARE256_RLE(64, compare256_rle_64, 1)
-#  endif
#endif
+#if defined(HAVE_BUILTIN_CTZLL)
+TEST_COMPARE256_RLE(64, compare256_rle_64, 1)
+#endif

zbuild.h

@ -243,36 +243,43 @@
# define Tracecv(c, x)
#endif
-#if defined(__x86_64__) || defined(_M_X64) || defined(__amd64__) || defined(_M_AMD64)
-#  define OPTIMAL_CMP 64
-#elif defined(__i386__) || defined(__i486__) || defined(__i586__) || \
-      defined(__i686__) || defined(_X86_) || defined(_M_IX86)
-#  define OPTIMAL_CMP 32
-#elif defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
-#  if defined(__ARM_FEATURE_UNALIGNED) || defined(_WIN32)
-#    define OPTIMAL_CMP 64
-#  else
-#    define OPTIMAL_CMP 8
-#  endif
-#elif defined(__arm__) || defined(_M_ARM)
-#  if defined(__ARM_FEATURE_UNALIGNED) || defined(_WIN32)
-#    define OPTIMAL_CMP 32
-#  else
-#    define OPTIMAL_CMP 8
-#  endif
-#elif defined(__powerpc64__) || defined(__ppc64__)
-#  define OPTIMAL_CMP 64
-#elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__)
-#  define OPTIMAL_CMP 32
-#endif
-#if defined(NO_UNALIGNED)
-#  undef OPTIMAL_CMP
-#endif
+/* OPTIMAL_CMP values determine the comparison width:
+ * 64: Best for 64-bit architectures with unaligned access
+ * 32: Best for 32-bit architectures with unaligned access
+ * 16: Safe default for unknown architectures
+ *  8: Safe fallback for architectures without unaligned access
+ * Note: the unaligned access mentioned is CPU support; it allows the compiler or
+ *       separate unaligned intrinsics to utilize safe unaligned access, without
+ *       utilizing unaligned C pointers that are known to have undefined behavior.
+ */
+#if !defined(OPTIMAL_CMP)
+#  if defined(__x86_64__) || defined(_M_X64) || defined(__amd64__) || defined(_M_AMD64)
+#    define OPTIMAL_CMP 64
+#  elif defined(__i386__) || defined(__i486__) || defined(__i586__) || \
+        defined(__i686__) || defined(_X86_) || defined(_M_IX86)
+#    define OPTIMAL_CMP 32
+#  elif defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#    if defined(__ARM_FEATURE_UNALIGNED) || defined(_WIN32)
+#      define OPTIMAL_CMP 64
+#    else
+#      define OPTIMAL_CMP 8
+#    endif
+#  elif defined(__arm__) || defined(_M_ARM)
+#    if defined(__ARM_FEATURE_UNALIGNED) || defined(_WIN32)
+#      define OPTIMAL_CMP 32
+#    else
+#      define OPTIMAL_CMP 8
+#    endif
+#  elif defined(__powerpc64__) || defined(__ppc64__)
+#    define OPTIMAL_CMP 64
+#  elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__)
+#    define OPTIMAL_CMP 32
+#  endif
+#endif
#if !defined(OPTIMAL_CMP)
-#  define OPTIMAL_CMP 8
+#  define OPTIMAL_CMP 16
#endif
#if defined(__has_feature)
# if __has_feature(address_sanitizer)
# define Z_ADDRESS_SANITIZER 1
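Because the detection block is now wrapped in #if !defined(OPTIMAL_CMP), the comparison width can be forced from the build system, e.g. by adding -DOPTIMAL_CMP=8 to the compiler flags. A sketch of what the override guarantees:

#define OPTIMAL_CMP 8     /* e.g. injected via -DOPTIMAL_CMP=8 */
#include "zbuild.h"

#if OPTIMAL_CMP != 8
#  error "zbuild.h must keep a predefined OPTIMAL_CMP"
#endif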

zmemory.h

@ -73,7 +73,7 @@ static inline void zng_memwrite_8(void *ptr, uint64_t val) {
calls to unaligned comparisons when unaligned access is supported. Use memcmp only when
unaligned support is not available to avoid an extra call to memcpy. */
static inline int32_t zng_memcmp_2(const void *src0, const void *src1) {
-#if defined(HAVE_MAY_ALIAS) || OPTIMAL_CMP >= 16
+#if defined(HAVE_MAY_ALIAS)
return zng_memread_2(src0) != zng_memread_2(src1);
#else
return memcmp(src0, src1, 2);
@ -81,7 +81,7 @@ static inline int32_t zng_memcmp_2(const void *src0, const void *src1) {
}
static inline int32_t zng_memcmp_4(const void *src0, const void *src1) {
-#if defined(HAVE_MAY_ALIAS) || OPTIMAL_CMP >= 32
+#if defined(HAVE_MAY_ALIAS)
return zng_memread_4(src0) != zng_memread_4(src1);
#else
return memcmp(src0, src1, 4);
@ -89,7 +89,7 @@ static inline int32_t zng_memcmp_4(const void *src0, const void *src1) {
}
static inline int32_t zng_memcmp_8(const void *src0, const void *src1) {
-#if defined(HAVE_MAY_ALIAS) || OPTIMAL_CMP >= 64
+#if defined(HAVE_MAY_ALIAS)
return zng_memread_8(src0) != zng_memread_8(src1);
#else
return memcmp(src0, src1, 8);
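A minimal sketch (assuming the documented design, not the exact zlib-ng source) of the memcpy-based reads these helpers build on: memcpy lets the compiler emit an unaligned load where the CPU supports it, without the undefined behavior of dereferencing a misaligned pointer.

#include <stdint.h>
#include <string.h>

static inline uint16_t zng_memread_2_sketch(const void *ptr) {
    uint16_t val;
    memcpy(&val, ptr, sizeof(val));  /* compiles to a plain load on most targets */
    return val;
}

static inline int32_t zng_memcmp_2_sketch(const void *src0, const void *src1) {
    return zng_memread_2_sketch(src0) != zng_memread_2_sketch(src1);
}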