Continued cleanup of old UNALIGNED_OK checks

- Remove obsolete checks
- Fix checks that are inconsistent
- Stop compiling compare256/longest_match variants that never get called
- Improve how the generic compare256 functions are handled.
- Allow overriding OPTIMAL_CMP

This simplifies the code and avoids having a lot of code in the compiled library that can never get executed.
This commit is contained in:
Hans Kristian Rosbach 2024-12-20 23:31:37 +01:00 committed by Hans Kristian Rosbach
parent 1aeb2915a0
commit bf05e882b8
15 changed files with 220 additions and 347 deletions

View File

@ -1074,6 +1074,9 @@ set(ZLIB_PUBLIC_HDRS
${CMAKE_CURRENT_BINARY_DIR}/zlib${SUFFIX}.h ${CMAKE_CURRENT_BINARY_DIR}/zlib${SUFFIX}.h
) )
set(ZLIB_PRIVATE_HDRS set(ZLIB_PRIVATE_HDRS
arch/generic/chunk_permute_table.h
arch/generic/compare256_p.h
arch/generic/generic_functions.h
adler32_p.h adler32_p.h
chunkset_tpl.h chunkset_tpl.h
compare256_rle.h compare256_rle.h

View File

@ -40,10 +40,10 @@ chunkset_c.o: $(SRCDIR)/chunkset_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/chunkset_tpl.
chunkset_c.lo: $(SRCDIR)/chunkset_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/chunkset_tpl.h $(SRCTOP)/inffast_tpl.h chunkset_c.lo: $(SRCDIR)/chunkset_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/chunkset_tpl.h $(SRCTOP)/inffast_tpl.h
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_c.c $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_c.c
compare256_c.o: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/zmemory.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h compare256_c.o: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCDIR)/compare256_p.h $(SRCTOP)/zmemory.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_c.c $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_c.c
compare256_c.lo: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/zmemory.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h compare256_c.lo: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCDIR)/compare256_p.h $(SRCTOP)/zmemory.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_c.c $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_c.c
crc32_braid_c.o: $(SRCDIR)/crc32_braid_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h crc32_braid_c.o: $(SRCDIR)/crc32_braid_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h

View File

@ -4,187 +4,28 @@
*/ */
#include "zbuild.h" #include "zbuild.h"
#include "zmemory.h" #include "compare256_p.h"
#include "deflate.h"
#include "fallback_builtins.h"
/* ALIGNED, byte comparison */ // Set optimal COMPARE256 function variant
static inline uint32_t compare256_c_static(const uint8_t *src0, const uint8_t *src1) { #if OPTIMAL_CMP == 8
uint32_t len = 0; # define COMPARE256 compare256_8
#elif defined(HAVE_BUILTIN_CTZLL)
do { # define COMPARE256 compare256_64
if (*src0 != *src1) #elif defined(HAVE_BUILTIN_CTZ)
return len; # define COMPARE256 compare256_32
src0 += 1, src1 += 1, len += 1; #else
if (*src0 != *src1) # define COMPARE256 compare256_16
return len; #endif
src0 += 1, src1 += 1, len += 1;
if (*src0 != *src1)
return len;
src0 += 1, src1 += 1, len += 1;
if (*src0 != *src1)
return len;
src0 += 1, src1 += 1, len += 1;
if (*src0 != *src1)
return len;
src0 += 1, src1 += 1, len += 1;
if (*src0 != *src1)
return len;
src0 += 1, src1 += 1, len += 1;
if (*src0 != *src1)
return len;
src0 += 1, src1 += 1, len += 1;
if (*src0 != *src1)
return len;
src0 += 1, src1 += 1, len += 1;
} while (len < 256);
return 256;
}
Z_INTERNAL uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1) { Z_INTERNAL uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1) {
return compare256_c_static(src0, src1); return COMPARE256(src0, src1);
} }
// Generate longest_match_c
#define LONGEST_MATCH longest_match_c #define LONGEST_MATCH longest_match_c
#define COMPARE256 compare256_c_static
#include "match_tpl.h" #include "match_tpl.h"
// Generate longest_match_slow_c
#define LONGEST_MATCH_SLOW #define LONGEST_MATCH_SLOW
#define LONGEST_MATCH longest_match_slow_c #define LONGEST_MATCH longest_match_slow_c
#define COMPARE256 compare256_c_static
#include "match_tpl.h" #include "match_tpl.h"
#if OPTIMAL_CMP >= 32
/* 16-bit unaligned integer comparison */
static inline uint32_t compare256_16_static(const uint8_t *src0, const uint8_t *src1) {
uint32_t len = 0;
do {
if (zng_memcmp_2(src0, src1) != 0)
return len + (*src0 == *src1);
src0 += 2, src1 += 2, len += 2;
if (zng_memcmp_2(src0, src1) != 0)
return len + (*src0 == *src1);
src0 += 2, src1 += 2, len += 2;
if (zng_memcmp_2(src0, src1) != 0)
return len + (*src0 == *src1);
src0 += 2, src1 += 2, len += 2;
if (zng_memcmp_2(src0, src1) != 0)
return len + (*src0 == *src1);
src0 += 2, src1 += 2, len += 2;
} while (len < 256);
return 256;
}
Z_INTERNAL uint32_t compare256_16(const uint8_t *src0, const uint8_t *src1) {
return compare256_16_static(src0, src1);
}
#define LONGEST_MATCH longest_match_16
#define COMPARE256 compare256_16_static
#include "match_tpl.h"
#define LONGEST_MATCH_SLOW
#define LONGEST_MATCH longest_match_slow_16
#define COMPARE256 compare256_16_static
#include "match_tpl.h"
#ifdef HAVE_BUILTIN_CTZ
/* 32-bit unaligned integer comparison */
static inline uint32_t compare256_32_static(const uint8_t *src0, const uint8_t *src1) {
uint32_t len = 0;
do {
uint32_t sv, mv, diff;
sv = zng_memread_4(src0);
mv = zng_memread_4(src1);
diff = sv ^ mv;
if (diff) {
#if BYTE_ORDER == LITTLE_ENDIAN
uint32_t match_byte = __builtin_ctz(diff) / 8;
#else
uint32_t match_byte = __builtin_clz(diff) / 8;
#endif
return len + match_byte;
}
src0 += 4, src1 += 4, len += 4;
} while (len < 256);
return 256;
}
Z_INTERNAL uint32_t compare256_32(const uint8_t *src0, const uint8_t *src1) {
return compare256_32_static(src0, src1);
}
#define LONGEST_MATCH longest_match_32
#define COMPARE256 compare256_32_static
#include "match_tpl.h"
#define LONGEST_MATCH_SLOW
#define LONGEST_MATCH longest_match_slow_32
#define COMPARE256 compare256_32_static
#include "match_tpl.h"
#endif
#if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64
/* 64-bit integer comparison */
static inline uint32_t compare256_64_static(const uint8_t *src0, const uint8_t *src1) {
uint32_t len = 0;
do {
uint64_t sv, mv, diff;
sv = zng_memread_8(src0);
mv = zng_memread_8(src1);
diff = sv ^ mv;
if (diff) {
#if BYTE_ORDER == LITTLE_ENDIAN
uint64_t match_byte = __builtin_ctzll(diff) / 8;
#else
uint64_t match_byte = __builtin_clzll(diff) / 8;
#endif
return len + (uint32_t)match_byte;
}
src0 += 8, src1 += 8, len += 8;
} while (len < 256);
return 256;
}
Z_INTERNAL uint32_t compare256_64(const uint8_t *src0, const uint8_t *src1) {
return compare256_64_static(src0, src1);
}
#define LONGEST_MATCH longest_match_64
#define COMPARE256 compare256_64_static
#include "match_tpl.h"
#define LONGEST_MATCH_SLOW
#define LONGEST_MATCH longest_match_slow_64
#define COMPARE256 compare256_64_static
#include "match_tpl.h"
#endif
#endif

123
arch/generic/compare256_p.h Normal file
View File

@ -0,0 +1,123 @@
/* compare256_p.h -- 256 byte memory comparison with match length return
* Copyright (C) 2020 Nathan Moinvaziri
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zmemory.h"
#include "deflate.h"
#include "fallback_builtins.h"
/* 8-bit integer comparison */
static inline uint32_t compare256_8(const uint8_t *src0, const uint8_t *src1) {
uint32_t len = 0;
do {
if (*src0 != *src1)
return len;
src0 += 1, src1 += 1, len += 1;
if (*src0 != *src1)
return len;
src0 += 1, src1 += 1, len += 1;
if (*src0 != *src1)
return len;
src0 += 1, src1 += 1, len += 1;
if (*src0 != *src1)
return len;
src0 += 1, src1 += 1, len += 1;
if (*src0 != *src1)
return len;
src0 += 1, src1 += 1, len += 1;
if (*src0 != *src1)
return len;
src0 += 1, src1 += 1, len += 1;
if (*src0 != *src1)
return len;
src0 += 1, src1 += 1, len += 1;
if (*src0 != *src1)
return len;
src0 += 1, src1 += 1, len += 1;
} while (len < 256);
return 256;
}
/* 16-bit integer comparison */
static inline uint32_t compare256_16(const uint8_t *src0, const uint8_t *src1) {
uint32_t len = 0;
do {
if (zng_memcmp_2(src0, src1) != 0)
return len + (*src0 == *src1);
src0 += 2, src1 += 2, len += 2;
if (zng_memcmp_2(src0, src1) != 0)
return len + (*src0 == *src1);
src0 += 2, src1 += 2, len += 2;
if (zng_memcmp_2(src0, src1) != 0)
return len + (*src0 == *src1);
src0 += 2, src1 += 2, len += 2;
if (zng_memcmp_2(src0, src1) != 0)
return len + (*src0 == *src1);
src0 += 2, src1 += 2, len += 2;
} while (len < 256);
return 256;
}
#ifdef HAVE_BUILTIN_CTZ
/* 32-bit integer comparison */
static inline uint32_t compare256_32(const uint8_t *src0, const uint8_t *src1) {
uint32_t len = 0;
do {
uint32_t sv, mv, diff;
sv = zng_memread_4(src0);
mv = zng_memread_4(src1);
diff = sv ^ mv;
if (diff) {
# if BYTE_ORDER == LITTLE_ENDIAN
uint32_t match_byte = __builtin_ctz(diff) / 8;
# else
uint32_t match_byte = __builtin_clz(diff) / 8;
# endif
return len + match_byte;
}
src0 += 4, src1 += 4, len += 4;
} while (len < 256);
return 256;
}
#endif
#ifdef HAVE_BUILTIN_CTZLL
/* 64-bit integer comparison */
static inline uint32_t compare256_64(const uint8_t *src0, const uint8_t *src1) {
uint32_t len = 0;
do {
uint64_t sv, mv, diff;
sv = zng_memread_8(src0);
mv = zng_memread_8(src1);
diff = sv ^ mv;
if (diff) {
# if BYTE_ORDER == LITTLE_ENDIAN
uint64_t match_byte = __builtin_ctzll(diff) / 8;
# else
uint64_t match_byte = __builtin_clzll(diff) / 8;
# endif
return len + (uint32_t)match_byte;
}
src0 += 8, src1 += 8, len += 8;
} while (len < 256);
return 256;
}
#endif

View File

@ -28,15 +28,6 @@ void inflate_fast_c(PREFIX3(stream) *strm, uint32_t start);
uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len); uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len);
uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1); uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1);
#if OPTIMAL_CMP >= 32
uint32_t compare256_16(const uint8_t *src0, const uint8_t *src1);
# ifdef HAVE_BUILTIN_CTZ
uint32_t compare256_32(const uint8_t *src0, const uint8_t *src1);
# endif
# if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64
uint32_t compare256_64(const uint8_t *src0, const uint8_t *src1);
# endif
#endif
typedef void (*slide_hash_func)(deflate_state *s); typedef void (*slide_hash_func)(deflate_state *s);
@ -44,41 +35,6 @@ void slide_hash_c(deflate_state *s);
uint32_t longest_match_c(deflate_state *const s, Pos cur_match); uint32_t longest_match_c(deflate_state *const s, Pos cur_match);
uint32_t longest_match_slow_c(deflate_state *const s, Pos cur_match); uint32_t longest_match_slow_c(deflate_state *const s, Pos cur_match);
#if OPTIMAL_CMP >= 32
uint32_t longest_match_16(deflate_state *const s, Pos cur_match);
uint32_t longest_match_slow_16(deflate_state *const s, Pos cur_match);
# ifdef HAVE_BUILTIN_CTZ
uint32_t longest_match_32(deflate_state *const s, Pos cur_match);
uint32_t longest_match_slow_32(deflate_state *const s, Pos cur_match);
# endif
# if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64
uint32_t longest_match_64(deflate_state *const s, Pos cur_match);
uint32_t longest_match_slow_64(deflate_state *const s, Pos cur_match);
# endif
#endif
// Select generic implementation for longest_match, longest_match_slow, longest_match_slow functions.
#if OPTIMAL_CMP >= 32
# if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64
# define longest_match_generic longest_match_64
# define longest_match_slow_generic longest_match_slow_64
# define compare256_generic compare256_64
# elif defined(HAVE_BUILTIN_CTZ)
# define longest_match_generic longest_match_32
# define longest_match_slow_generic longest_match_slow_32
# define compare256_generic compare256_32
# else
# define longest_match_generic longest_match_16
# define longest_match_slow_generic longest_match_slow_16
# define compare256_generic compare256_16
# endif
#else
# define longest_match_generic longest_match_c
# define longest_match_slow_generic longest_match_slow_c
# define compare256_generic compare256_c
#endif
#ifdef DISABLE_RUNTIME_CPU_DETECTION #ifdef DISABLE_RUNTIME_CPU_DETECTION
// Generic code // Generic code
@ -93,9 +49,9 @@ uint32_t longest_match_slow_c(deflate_state *const s, Pos cur_match);
# define native_crc32_fold_reset crc32_fold_reset_c # define native_crc32_fold_reset crc32_fold_reset_c
# define native_inflate_fast inflate_fast_c # define native_inflate_fast inflate_fast_c
# define native_slide_hash slide_hash_c # define native_slide_hash slide_hash_c
# define native_longest_match longest_match_generic # define native_longest_match longest_match_c
# define native_longest_match_slow longest_match_slow_generic # define native_longest_match_slow longest_match_slow_c
# define native_compare256 compare256_generic # define native_compare256 compare256_c
#endif #endif
#endif #endif

View File

@ -6,12 +6,11 @@
#include "zbuild.h" #include "zbuild.h"
#include "zmemory.h" #include "zmemory.h"
#include "fallback_builtins.h" #include "fallback_builtins.h"
#include "zendian.h"
typedef uint32_t (*compare256_rle_func)(const uint8_t* src0, const uint8_t* src1); typedef uint32_t (*compare256_rle_func)(const uint8_t* src0, const uint8_t* src1);
/* ALIGNED, byte comparison */ /* 8-bit integer comparison */
static inline uint32_t compare256_rle_c(const uint8_t *src0, const uint8_t *src1) { static inline uint32_t compare256_rle_8(const uint8_t *src0, const uint8_t *src1) {
uint32_t len = 0; uint32_t len = 0;
do { do {
@ -44,8 +43,7 @@ static inline uint32_t compare256_rle_c(const uint8_t *src0, const uint8_t *src1
return 256; return 256;
} }
#if OPTIMAL_CMP >= 32 /* 16-bit integer comparison */
/* 16-bit unaligned integer comparison */
static inline uint32_t compare256_rle_16(const uint8_t *src0, const uint8_t *src1) { static inline uint32_t compare256_rle_16(const uint8_t *src0, const uint8_t *src1) {
uint32_t len = 0; uint32_t len = 0;
uint16_t src0_cmp; uint16_t src0_cmp;
@ -71,7 +69,7 @@ static inline uint32_t compare256_rle_16(const uint8_t *src0, const uint8_t *src
} }
#ifdef HAVE_BUILTIN_CTZ #ifdef HAVE_BUILTIN_CTZ
/* 32-bit unaligned integer comparison */ /* 32-bit integer comparison */
static inline uint32_t compare256_rle_32(const uint8_t *src0, const uint8_t *src1) { static inline uint32_t compare256_rle_32(const uint8_t *src0, const uint8_t *src1) {
uint32_t sv, len = 0; uint32_t sv, len = 0;
uint16_t src0_cmp; uint16_t src0_cmp;
@ -99,11 +97,10 @@ static inline uint32_t compare256_rle_32(const uint8_t *src0, const uint8_t *src
return 256; return 256;
} }
#endif #endif
#if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64 #ifdef HAVE_BUILTIN_CTZLL
/* 64-bit unaligned integer comparison */ /* 64-bit integer comparison */
static inline uint32_t compare256_rle_64(const uint8_t *src0, const uint8_t *src1) { static inline uint32_t compare256_rle_64(const uint8_t *src0, const uint8_t *src1) {
uint32_t src0_cmp32, len = 0; uint32_t src0_cmp32, len = 0;
uint16_t src0_cmp; uint16_t src0_cmp;
@ -133,8 +130,4 @@ static inline uint32_t compare256_rle_64(const uint8_t *src0, const uint8_t *src
return 256; return 256;
} }
#endif #endif
#endif

View File

@ -5,21 +5,19 @@
*/ */
#include "zbuild.h" #include "zbuild.h"
#include "compare256_rle.h"
#include "deflate.h" #include "deflate.h"
#include "deflate_p.h" #include "deflate_p.h"
#include "functable.h" #include "functable.h"
#include "compare256_rle.h"
#if OPTIMAL_CMP >= 32 #if OPTIMAL_CMP == 8
# if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64 # define compare256_rle compare256_rle_8
# define compare256_rle compare256_rle_64 #elif defined(HAVE_BUILTIN_CTZLL)
# elif defined(HAVE_BUILTIN_CTZ) # define compare256_rle compare256_rle_64
# define compare256_rle compare256_rle_32 #elif defined(HAVE_BUILTIN_CTZ)
# else # define compare256_rle compare256_rle_32
# define compare256_rle compare256_rle_16
# endif
#else #else
# define compare256_rle compare256_rle_c # define compare256_rle compare256_rle_16
#endif #endif
/* =========================================================================== /* ===========================================================================

View File

@ -61,9 +61,9 @@ static void init_functable(void) {
ft.crc32_fold_reset = &crc32_fold_reset_c; ft.crc32_fold_reset = &crc32_fold_reset_c;
ft.inflate_fast = &inflate_fast_c; ft.inflate_fast = &inflate_fast_c;
ft.slide_hash = &slide_hash_c; ft.slide_hash = &slide_hash_c;
ft.longest_match = &longest_match_generic; ft.longest_match = &longest_match_c;
ft.longest_match_slow = &longest_match_slow_generic; ft.longest_match_slow = &longest_match_slow_c;
ft.compare256 = &compare256_generic; ft.compare256 = &compare256_c;
// Select arch-optimized functions // Select arch-optimized functions

View File

@ -22,6 +22,9 @@
* IN assertions: cur_match is the head of the hash chain for the current * IN assertions: cur_match is the head of the hash chain for the current
* string (strstart) and its distance is <= MAX_DIST, and prev_length >=1 * string (strstart) and its distance is <= MAX_DIST, and prev_length >=1
* OUT assertion: the match length is not greater than s->lookahead * OUT assertion: the match length is not greater than s->lookahead
*
* The LONGEST_MATCH_SLOW variant spends more time to attempt to find longer
* matches once a match has already been found.
*/ */
Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) { Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) {
unsigned int strstart = s->strstart; unsigned int strstart = s->strstart;
@ -40,15 +43,8 @@ Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) {
uint32_t chain_length, nice_match, best_len, offset; uint32_t chain_length, nice_match, best_len, offset;
uint32_t lookahead = s->lookahead; uint32_t lookahead = s->lookahead;
Pos match_offset = 0; Pos match_offset = 0;
#if OPTIMAL_CMP >= 64
uint64_t scan_start; uint64_t scan_start;
uint64_t scan_end; uint64_t scan_end;
#elif OPTIMAL_CMP >= 32
uint32_t scan_start;
uint32_t scan_end;
#else
uint8_t scan_end[8];
#endif
#define GOTO_NEXT_CHAIN \ #define GOTO_NEXT_CHAIN \
if (--chain_length && (cur_match = prev[cur_match & wmask]) > limit) \ if (--chain_length && (cur_match = prev[cur_match & wmask]) > limit) \
@ -64,26 +60,14 @@ Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) {
* to find the next best match length. * to find the next best match length.
*/ */
offset = best_len-1; offset = best_len-1;
#if OPTIMAL_CMP >= 32
if (best_len >= sizeof(uint32_t)) { if (best_len >= sizeof(uint32_t)) {
offset -= 2; offset -= 2;
#if OPTIMAL_CMP >= 64
if (best_len >= sizeof(uint64_t)) if (best_len >= sizeof(uint64_t))
offset -= 4; offset -= 4;
#endif
} }
#endif
#if OPTIMAL_CMP >= 64
scan_start = zng_memread_8(scan); scan_start = zng_memread_8(scan);
scan_end = zng_memread_8(scan+offset); scan_end = zng_memread_8(scan+offset);
#elif OPTIMAL_CMP >= 32
scan_start = zng_memread_4(scan);
scan_end = zng_memread_4(scan+offset);
#else
scan_end[0] = *(scan+offset);
scan_end[1] = *(scan+offset+1);
#endif
mbase_end = (mbase_start+offset); mbase_end = (mbase_start+offset);
/* Do not waste too much time if we already have a good match */ /* Do not waste too much time if we already have a good match */
@ -143,7 +127,6 @@ Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) {
* that depend on those values. However the length of the match is limited to the * that depend on those values. However the length of the match is limited to the
* lookahead, so the output of deflate is not affected by the uninitialized values. * lookahead, so the output of deflate is not affected by the uninitialized values.
*/ */
#if OPTIMAL_CMP >= 32
if (best_len < sizeof(uint32_t)) { if (best_len < sizeof(uint32_t)) {
for (;;) { for (;;) {
if (zng_memcmp_2(mbase_end+cur_match, &scan_end) == 0 && if (zng_memcmp_2(mbase_end+cur_match, &scan_end) == 0 &&
@ -151,7 +134,6 @@ Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) {
break; break;
GOTO_NEXT_CHAIN; GOTO_NEXT_CHAIN;
} }
# if OPTIMAL_CMP >= 64
} else if (best_len >= sizeof(uint64_t)) { } else if (best_len >= sizeof(uint64_t)) {
for (;;) { for (;;) {
if (zng_memcmp_8(mbase_end+cur_match, &scan_end) == 0 && if (zng_memcmp_8(mbase_end+cur_match, &scan_end) == 0 &&
@ -159,7 +141,6 @@ Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) {
break; break;
GOTO_NEXT_CHAIN; GOTO_NEXT_CHAIN;
} }
# endif
} else { } else {
for (;;) { for (;;) {
if (zng_memcmp_4(mbase_end+cur_match, &scan_end) == 0 && if (zng_memcmp_4(mbase_end+cur_match, &scan_end) == 0 &&
@ -168,14 +149,6 @@ Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) {
GOTO_NEXT_CHAIN; GOTO_NEXT_CHAIN;
} }
} }
#else
for (;;) {
if (mbase_end[cur_match] == scan_end[0] && mbase_end[cur_match+1] == scan_end[1] &&
mbase_start[cur_match] == scan[0] && mbase_start[cur_match+1] == scan[1])
break;
GOTO_NEXT_CHAIN;
}
#endif
uint32_t len = COMPARE256(scan+2, mbase_start+cur_match+2) + 2; uint32_t len = COMPARE256(scan+2, mbase_start+cur_match+2) + 2;
Assert(scan+len <= window+(unsigned)(s->window_size-1), "wild scan"); Assert(scan+len <= window+(unsigned)(s->window_size-1), "wild scan");
@ -191,24 +164,13 @@ Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) {
return best_len; return best_len;
offset = best_len-1; offset = best_len-1;
#if OPTIMAL_CMP >= 32
if (best_len >= sizeof(uint32_t)) { if (best_len >= sizeof(uint32_t)) {
offset -= 2; offset -= 2;
#if OPTIMAL_CMP >= 64
if (best_len >= sizeof(uint64_t)) if (best_len >= sizeof(uint64_t))
offset -= 4; offset -= 4;
#endif
} }
#endif
#if OPTIMAL_CMP >= 64
scan_end = zng_memread_8(scan+offset); scan_end = zng_memread_8(scan+offset);
#elif OPTIMAL_CMP >= 32
scan_end = zng_memread_4(scan+offset);
#else
scan_end[0] = *(scan+offset);
scan_end[1] = *(scan+offset+1);
#endif
#ifdef LONGEST_MATCH_SLOW #ifdef LONGEST_MATCH_SLOW
/* Look for a better string offset */ /* Look for a better string offset */
@ -286,4 +248,3 @@ break_matching:
#undef LONGEST_MATCH_SLOW #undef LONGEST_MATCH_SLOW
#undef LONGEST_MATCH #undef LONGEST_MATCH
#undef COMPARE256

View File

@ -12,6 +12,7 @@ extern "C" {
# include "zutil_p.h" # include "zutil_p.h"
# include "arch_functions.h" # include "arch_functions.h"
# include "../test_cpu_features.h" # include "../test_cpu_features.h"
# include "arch/generic/compare256_p.h"
} }
#define MAX_COMPARE_SIZE (256) #define MAX_COMPARE_SIZE (256)
@ -60,21 +61,19 @@ public:
} \ } \
BENCHMARK_REGISTER_F(compare256, name)->Range(1, MAX_COMPARE_SIZE); BENCHMARK_REGISTER_F(compare256, name)->Range(1, MAX_COMPARE_SIZE);
BENCHMARK_COMPARE256(c, compare256_c, 1);
#ifdef DISABLE_RUNTIME_CPU_DETECTION #ifdef DISABLE_RUNTIME_CPU_DETECTION
BENCHMARK_COMPARE256(native, native_compare256, 1); BENCHMARK_COMPARE256(native, native_compare256, 1);
#else #else
#if BYTE_ORDER == LITTLE_ENDIAN && OPTIMAL_CMP >= 32 BENCHMARK_COMPARE256(8, compare256_8, 1);
BENCHMARK_COMPARE256(16, compare256_16, 1); BENCHMARK_COMPARE256(16, compare256_16, 1);
# if defined(HAVE_BUILTIN_CTZ) #if defined(HAVE_BUILTIN_CTZ)
BENCHMARK_COMPARE256(32, compare256_32, 1); BENCHMARK_COMPARE256(32, compare256_32, 1);
# endif
# if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64
BENCHMARK_COMPARE256(64, compare256_64, 1);
# endif
#endif #endif
#if defined(HAVE_BUILTIN_CTZLL)
BENCHMARK_COMPARE256(64, compare256_64, 1);
#endif
#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ) #if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
BENCHMARK_COMPARE256(sse2, compare256_sse2, test_cpu_features.x86.has_sse2); BENCHMARK_COMPARE256(sse2, compare256_sse2, test_cpu_features.x86.has_sse2);
#endif #endif

View File

@ -59,14 +59,11 @@ public:
} \ } \
BENCHMARK_REGISTER_F(compare256_rle, name)->Range(1, MAX_COMPARE_SIZE); BENCHMARK_REGISTER_F(compare256_rle, name)->Range(1, MAX_COMPARE_SIZE);
BENCHMARK_COMPARE256_RLE(c, compare256_rle_c, 1); BENCHMARK_COMPARE256_RLE(8, compare256_rle_8, 1);
#if BYTE_ORDER == LITTLE_ENDIAN && OPTIMAL_CMP >= 32
BENCHMARK_COMPARE256_RLE(16, compare256_rle_16, 1); BENCHMARK_COMPARE256_RLE(16, compare256_rle_16, 1);
# if defined(HAVE_BUILTIN_CTZ) #if defined(HAVE_BUILTIN_CTZ)
BENCHMARK_COMPARE256_RLE(32, compare256_rle_32, 1); BENCHMARK_COMPARE256_RLE(32, compare256_rle_32, 1);
# endif #endif
# if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64 #if defined(HAVE_BUILTIN_CTZLL)
BENCHMARK_COMPARE256_RLE(64, compare256_rle_64, 1); BENCHMARK_COMPARE256_RLE(64, compare256_rle_64, 1);
# endif
#endif #endif

View File

@ -12,6 +12,7 @@ extern "C" {
# include "zutil.h" # include "zutil.h"
# include "arch_functions.h" # include "arch_functions.h"
# include "test_cpu_features.h" # include "test_cpu_features.h"
# include "arch/generic/compare256_p.h"
} }
#include <gtest/gtest.h> #include <gtest/gtest.h>
@ -59,20 +60,17 @@ static inline void compare256_match_check(compare256_func compare256) {
compare256_match_check(func); \ compare256_match_check(func); \
} }
TEST_COMPARE256(c, compare256_c, 1)
#ifdef DISABLE_RUNTIME_CPU_DETECTION #ifdef DISABLE_RUNTIME_CPU_DETECTION
TEST_COMPARE256(native, native_compare256, 1) TEST_COMPARE256(native, native_compare256, 1)
#else #else
#if BYTE_ORDER == LITTLE_ENDIAN && OPTIMAL_CMP >= 32 TEST_COMPARE256(8, compare256_8, 1)
TEST_COMPARE256(16, compare256_16, 1) TEST_COMPARE256(16, compare256_16, 1)
# if defined(HAVE_BUILTIN_CTZ) #if defined(HAVE_BUILTIN_CTZ)
TEST_COMPARE256(32, compare256_32, 1) TEST_COMPARE256(32, compare256_32, 1)
# endif #endif
# if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64 #if defined(HAVE_BUILTIN_CTZLL)
TEST_COMPARE256(64, compare256_64, 1) TEST_COMPARE256(64, compare256_64, 1)
# endif
#endif #endif
#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ) #if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)

View File

@ -50,14 +50,11 @@ static inline void compare256_rle_match_check(compare256_rle_func compare256_rle
compare256_rle_match_check(func); \ compare256_rle_match_check(func); \
} }
TEST_COMPARE256_RLE(c, compare256_rle_c, 1) TEST_COMPARE256_RLE(8, compare256_rle_8, 1)
#if BYTE_ORDER == LITTLE_ENDIAN && OPTIMAL_CMP >= 32
TEST_COMPARE256_RLE(16, compare256_rle_16, 1) TEST_COMPARE256_RLE(16, compare256_rle_16, 1)
# if defined(HAVE_BUILTIN_CTZ) #if defined(HAVE_BUILTIN_CTZ)
TEST_COMPARE256_RLE(32, compare256_rle_32, 1) TEST_COMPARE256_RLE(32, compare256_rle_32, 1)
# endif #endif
# if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64 #if defined(HAVE_BUILTIN_CTZLL)
TEST_COMPARE256_RLE(64, compare256_rle_64, 1) TEST_COMPARE256_RLE(64, compare256_rle_64, 1)
# endif
#endif #endif

View File

@ -243,36 +243,43 @@
# define Tracecv(c, x) # define Tracecv(c, x)
#endif #endif
#if defined(__x86_64__) || defined(_M_X64) || defined(__amd64__) || defined(_M_AMD64) /* OPTIMAL_CMP values determine the comparison width:
# define OPTIMAL_CMP 64 * 64: Best for 64-bit architectures with unaligned access
#elif defined(__i386__) || defined(__i486__) || defined(__i586__) || \ * 32: Best for 32-bit architectures with unaligned access
defined(__i686__) || defined(_X86_) || defined(_M_IX86) * 16: Safe default for unknown architectures
# define OPTIMAL_CMP 32 * 8: Safe fallback for architectures without unaligned access
#elif defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC) * Note: The unaligned access mentioned is cpu-support, this allows compiler or
# if defined(__ARM_FEATURE_UNALIGNED) || defined(_WIN32) * separate unaligned intrinsics to utilize safe unaligned access, without
* utilizing unaligned C pointers that are known to have undefined behavior.
*/
#if !defined(OPTIMAL_CMP)
# if defined(__x86_64__) || defined(_M_X64) || defined(__amd64__) || defined(_M_AMD64)
# define OPTIMAL_CMP 64 # define OPTIMAL_CMP 64
# else # elif defined(__i386__) || defined(__i486__) || defined(__i586__) || \
# define OPTIMAL_CMP 8 defined(__i686__) || defined(_X86_) || defined(_M_IX86)
# endif # define OPTIMAL_CMP 32
#elif defined(__arm__) || defined(_M_ARM) # elif defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
# if defined(__ARM_FEATURE_UNALIGNED) || defined(_WIN32) # if defined(__ARM_FEATURE_UNALIGNED) || defined(_WIN32)
# define OPTIMAL_CMP 64
# else
# define OPTIMAL_CMP 8
# endif
# elif defined(__arm__) || defined(_M_ARM)
# if defined(__ARM_FEATURE_UNALIGNED) || defined(_WIN32)
# define OPTIMAL_CMP 32
# else
# define OPTIMAL_CMP 8
# endif
# elif defined(__powerpc64__) || defined(__ppc64__)
# define OPTIMAL_CMP 64
# elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__)
# define OPTIMAL_CMP 32 # define OPTIMAL_CMP 32
# else
# define OPTIMAL_CMP 8
# endif # endif
#elif defined(__powerpc64__) || defined(__ppc64__)
# define OPTIMAL_CMP 64
#elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__)
# define OPTIMAL_CMP 32
#endif
#if defined(NO_UNALIGNED)
# undef OPTIMAL_CMP
#endif #endif
#if !defined(OPTIMAL_CMP) #if !defined(OPTIMAL_CMP)
# define OPTIMAL_CMP 8 # define OPTIMAL_CMP 16
#endif #endif
#if defined(__has_feature) #if defined(__has_feature)
# if __has_feature(address_sanitizer) # if __has_feature(address_sanitizer)
# define Z_ADDRESS_SANITIZER 1 # define Z_ADDRESS_SANITIZER 1

View File

@ -73,7 +73,7 @@ static inline void zng_memwrite_8(void *ptr, uint64_t val) {
calls to unaligned comparisons when unaligned access is supported. Use memcmp only when calls to unaligned comparisons when unaligned access is supported. Use memcmp only when
unaligned support is not available to avoid an extra call to memcpy. */ unaligned support is not available to avoid an extra call to memcpy. */
static inline int32_t zng_memcmp_2(const void *src0, const void *src1) { static inline int32_t zng_memcmp_2(const void *src0, const void *src1) {
#if defined(HAVE_MAY_ALIAS) || OPTIMAL_CMP >= 16 #if defined(HAVE_MAY_ALIAS)
return zng_memread_2(src0) != zng_memread_2(src1); return zng_memread_2(src0) != zng_memread_2(src1);
#else #else
return memcmp(src0, src1, 2); return memcmp(src0, src1, 2);
@ -81,7 +81,7 @@ static inline int32_t zng_memcmp_2(const void *src0, const void *src1) {
} }
static inline int32_t zng_memcmp_4(const void *src0, const void *src1) { static inline int32_t zng_memcmp_4(const void *src0, const void *src1) {
#if defined(HAVE_MAY_ALIAS) || OPTIMAL_CMP >= 32 #if defined(HAVE_MAY_ALIAS)
return zng_memread_4(src0) != zng_memread_4(src1); return zng_memread_4(src0) != zng_memread_4(src1);
#else #else
return memcmp(src0, src1, 4); return memcmp(src0, src1, 4);
@ -89,7 +89,7 @@ static inline int32_t zng_memcmp_4(const void *src0, const void *src1) {
} }
static inline int32_t zng_memcmp_8(const void *src0, const void *src1) { static inline int32_t zng_memcmp_8(const void *src0, const void *src1) {
#if defined(HAVE_MAY_ALIAS) || OPTIMAL_CMP >= 64 #if defined(HAVE_MAY_ALIAS)
return zng_memread_8(src0) != zng_memread_8(src1); return zng_memread_8(src0) != zng_memread_8(src1);
#else #else
return memcmp(src0, src1, 8); return memcmp(src0, src1, 8);