mirror of
https://github.com/GerbilSoft/zlib-ng.git
synced 2025-06-18 11:35:35 -04:00
Continued cleanup of old UNALIGNED_OK checks
- Remove obsolete checks - Fix checks that are inconsistent - Stop compiling compare256/longest_match variants that never get called - Improve how the generic compare256 functions are handled - Allow overriding OPTIMAL_CMP. This simplifies the code and avoids having a lot of code in the compiled library that can never get executed.
This commit is contained in:
parent
1aeb2915a0
commit
bf05e882b8
@ -1074,6 +1074,9 @@ set(ZLIB_PUBLIC_HDRS
|
||||
${CMAKE_CURRENT_BINARY_DIR}/zlib${SUFFIX}.h
|
||||
)
|
||||
set(ZLIB_PRIVATE_HDRS
|
||||
arch/generic/chunk_permute_table.h
|
||||
arch/generic/compare256_p.h
|
||||
arch/generic/generic_functions.h
|
||||
adler32_p.h
|
||||
chunkset_tpl.h
|
||||
compare256_rle.h
|
||||
|
@ -40,10 +40,10 @@ chunkset_c.o: $(SRCDIR)/chunkset_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/chunkset_tpl.
|
||||
chunkset_c.lo: $(SRCDIR)/chunkset_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/chunkset_tpl.h $(SRCTOP)/inffast_tpl.h
|
||||
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_c.c
|
||||
|
||||
compare256_c.o: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/zmemory.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
|
||||
compare256_c.o: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCDIR)/compare256_p.h $(SRCTOP)/zmemory.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
|
||||
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_c.c
|
||||
|
||||
compare256_c.lo: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/zmemory.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
|
||||
compare256_c.lo: $(SRCDIR)/compare256_c.c $(SRCTOP)/zbuild.h $(SRCDIR)/compare256_p.h $(SRCTOP)/zmemory.h $(SRCTOP)/deflate.h $(SRCTOP)/fallback_builtins.h
|
||||
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_c.c
|
||||
|
||||
crc32_braid_c.o: $(SRCDIR)/crc32_braid_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h
|
||||
|
@ -4,187 +4,28 @@
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "zmemory.h"
|
||||
#include "deflate.h"
|
||||
#include "fallback_builtins.h"
|
||||
#include "compare256_p.h"
|
||||
|
||||
/* ALIGNED, byte comparison */
|
||||
static inline uint32_t compare256_c_static(const uint8_t *src0, const uint8_t *src1) {
|
||||
uint32_t len = 0;
|
||||
|
||||
do {
|
||||
if (*src0 != *src1)
|
||||
return len;
|
||||
src0 += 1, src1 += 1, len += 1;
|
||||
if (*src0 != *src1)
|
||||
return len;
|
||||
src0 += 1, src1 += 1, len += 1;
|
||||
if (*src0 != *src1)
|
||||
return len;
|
||||
src0 += 1, src1 += 1, len += 1;
|
||||
if (*src0 != *src1)
|
||||
return len;
|
||||
src0 += 1, src1 += 1, len += 1;
|
||||
if (*src0 != *src1)
|
||||
return len;
|
||||
src0 += 1, src1 += 1, len += 1;
|
||||
if (*src0 != *src1)
|
||||
return len;
|
||||
src0 += 1, src1 += 1, len += 1;
|
||||
if (*src0 != *src1)
|
||||
return len;
|
||||
src0 += 1, src1 += 1, len += 1;
|
||||
if (*src0 != *src1)
|
||||
return len;
|
||||
src0 += 1, src1 += 1, len += 1;
|
||||
} while (len < 256);
|
||||
|
||||
return 256;
|
||||
}
|
||||
// Set optimal COMPARE256 function variant
|
||||
#if OPTIMAL_CMP == 8
|
||||
# define COMPARE256 compare256_8
|
||||
#elif defined(HAVE_BUILTIN_CTZLL)
|
||||
# define COMPARE256 compare256_64
|
||||
#elif defined(HAVE_BUILTIN_CTZ)
|
||||
# define COMPARE256 compare256_32
|
||||
#else
|
||||
# define COMPARE256 compare256_16
|
||||
#endif
|
||||
|
||||
Z_INTERNAL uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1) {
|
||||
return compare256_c_static(src0, src1);
|
||||
return COMPARE256(src0, src1);
|
||||
}
|
||||
|
||||
// Generate longest_match_c
|
||||
#define LONGEST_MATCH longest_match_c
|
||||
#define COMPARE256 compare256_c_static
|
||||
|
||||
#include "match_tpl.h"
|
||||
|
||||
// Generate longest_match_slow_c
|
||||
#define LONGEST_MATCH_SLOW
|
||||
#define LONGEST_MATCH longest_match_slow_c
|
||||
#define COMPARE256 compare256_c_static
|
||||
|
||||
#include "match_tpl.h"
|
||||
|
||||
#if OPTIMAL_CMP >= 32
|
||||
|
||||
/* 16-bit unaligned integer comparison */
|
||||
static inline uint32_t compare256_16_static(const uint8_t *src0, const uint8_t *src1) {
|
||||
uint32_t len = 0;
|
||||
|
||||
do {
|
||||
if (zng_memcmp_2(src0, src1) != 0)
|
||||
return len + (*src0 == *src1);
|
||||
src0 += 2, src1 += 2, len += 2;
|
||||
|
||||
if (zng_memcmp_2(src0, src1) != 0)
|
||||
return len + (*src0 == *src1);
|
||||
src0 += 2, src1 += 2, len += 2;
|
||||
|
||||
if (zng_memcmp_2(src0, src1) != 0)
|
||||
return len + (*src0 == *src1);
|
||||
src0 += 2, src1 += 2, len += 2;
|
||||
|
||||
if (zng_memcmp_2(src0, src1) != 0)
|
||||
return len + (*src0 == *src1);
|
||||
src0 += 2, src1 += 2, len += 2;
|
||||
} while (len < 256);
|
||||
|
||||
return 256;
|
||||
}
|
||||
|
||||
Z_INTERNAL uint32_t compare256_16(const uint8_t *src0, const uint8_t *src1) {
|
||||
return compare256_16_static(src0, src1);
|
||||
}
|
||||
|
||||
#define LONGEST_MATCH longest_match_16
|
||||
#define COMPARE256 compare256_16_static
|
||||
|
||||
#include "match_tpl.h"
|
||||
|
||||
#define LONGEST_MATCH_SLOW
|
||||
#define LONGEST_MATCH longest_match_slow_16
|
||||
#define COMPARE256 compare256_16_static
|
||||
|
||||
#include "match_tpl.h"
|
||||
|
||||
#ifdef HAVE_BUILTIN_CTZ
|
||||
/* 32-bit unaligned integer comparison */
|
||||
static inline uint32_t compare256_32_static(const uint8_t *src0, const uint8_t *src1) {
|
||||
uint32_t len = 0;
|
||||
|
||||
do {
|
||||
uint32_t sv, mv, diff;
|
||||
|
||||
sv = zng_memread_4(src0);
|
||||
mv = zng_memread_4(src1);
|
||||
|
||||
diff = sv ^ mv;
|
||||
if (diff) {
|
||||
#if BYTE_ORDER == LITTLE_ENDIAN
|
||||
uint32_t match_byte = __builtin_ctz(diff) / 8;
|
||||
#else
|
||||
uint32_t match_byte = __builtin_clz(diff) / 8;
|
||||
#endif
|
||||
return len + match_byte;
|
||||
}
|
||||
|
||||
src0 += 4, src1 += 4, len += 4;
|
||||
} while (len < 256);
|
||||
|
||||
return 256;
|
||||
}
|
||||
|
||||
Z_INTERNAL uint32_t compare256_32(const uint8_t *src0, const uint8_t *src1) {
|
||||
return compare256_32_static(src0, src1);
|
||||
}
|
||||
|
||||
#define LONGEST_MATCH longest_match_32
|
||||
#define COMPARE256 compare256_32_static
|
||||
|
||||
#include "match_tpl.h"
|
||||
|
||||
#define LONGEST_MATCH_SLOW
|
||||
#define LONGEST_MATCH longest_match_slow_32
|
||||
#define COMPARE256 compare256_32_static
|
||||
|
||||
#include "match_tpl.h"
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64
|
||||
/* 64-bit integer comparison */
|
||||
static inline uint32_t compare256_64_static(const uint8_t *src0, const uint8_t *src1) {
|
||||
uint32_t len = 0;
|
||||
|
||||
do {
|
||||
uint64_t sv, mv, diff;
|
||||
|
||||
sv = zng_memread_8(src0);
|
||||
mv = zng_memread_8(src1);
|
||||
|
||||
diff = sv ^ mv;
|
||||
if (diff) {
|
||||
#if BYTE_ORDER == LITTLE_ENDIAN
|
||||
uint64_t match_byte = __builtin_ctzll(diff) / 8;
|
||||
#else
|
||||
uint64_t match_byte = __builtin_clzll(diff) / 8;
|
||||
#endif
|
||||
return len + (uint32_t)match_byte;
|
||||
}
|
||||
|
||||
src0 += 8, src1 += 8, len += 8;
|
||||
} while (len < 256);
|
||||
|
||||
return 256;
|
||||
}
|
||||
|
||||
Z_INTERNAL uint32_t compare256_64(const uint8_t *src0, const uint8_t *src1) {
|
||||
return compare256_64_static(src0, src1);
|
||||
}
|
||||
|
||||
#define LONGEST_MATCH longest_match_64
|
||||
#define COMPARE256 compare256_64_static
|
||||
|
||||
#include "match_tpl.h"
|
||||
|
||||
#define LONGEST_MATCH_SLOW
|
||||
#define LONGEST_MATCH longest_match_slow_64
|
||||
#define COMPARE256 compare256_64_static
|
||||
|
||||
#include "match_tpl.h"
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
123
arch/generic/compare256_p.h
Normal file
123
arch/generic/compare256_p.h
Normal file
@ -0,0 +1,123 @@
|
||||
/* compare256_p.h -- 256 byte memory comparison with match length return
|
||||
* Copyright (C) 2020 Nathan Moinvaziri
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "zmemory.h"
|
||||
#include "deflate.h"
|
||||
#include "fallback_builtins.h"
|
||||
|
||||
/* 8-bit integer comparison.
 *
 * Compares src0 and src1 one byte at a time and returns the number of leading
 * bytes that are equal, capped at 256. The loop body is unrolled eight times,
 * so `len` is only tested against the 256 limit after every 8 bytes.
 * Bytes past the first mismatch are never read by this variant.
 */
|
||||
static inline uint32_t compare256_8(const uint8_t *src0, const uint8_t *src1) {
|
||||
uint32_t len = 0;
|
||||
|
||||
do {
|
||||
if (*src0 != *src1)
|
||||
return len;
|
||||
src0 += 1, src1 += 1, len += 1;
|
||||
if (*src0 != *src1)
|
||||
return len;
|
||||
src0 += 1, src1 += 1, len += 1;
|
||||
if (*src0 != *src1)
|
||||
return len;
|
||||
src0 += 1, src1 += 1, len += 1;
|
||||
if (*src0 != *src1)
|
||||
return len;
|
||||
src0 += 1, src1 += 1, len += 1;
|
||||
if (*src0 != *src1)
|
||||
return len;
|
||||
src0 += 1, src1 += 1, len += 1;
|
||||
if (*src0 != *src1)
|
||||
return len;
|
||||
src0 += 1, src1 += 1, len += 1;
|
||||
if (*src0 != *src1)
|
||||
return len;
|
||||
src0 += 1, src1 += 1, len += 1;
|
||||
if (*src0 != *src1)
|
||||
return len;
|
||||
src0 += 1, src1 += 1, len += 1;
|
||||
} while (len < 256);
|
||||
|
||||
/* All 256 bytes matched. */
return 256;
|
||||
}
|
||||
|
||||
/* 16-bit integer comparison.
 *
 * Compares src0 and src1 two bytes at a time (via zng_memcmp_2) and returns
 * the number of leading bytes that are equal, capped at 256. The loop body is
 * unrolled four times, so `len` is tested against 256 after every 8 bytes.
 * When a 2-byte pair differs, the first byte of the pair is compared on its
 * own to decide whether the mismatch is at offset 0 or offset 1 of the pair.
 */
|
||||
static inline uint32_t compare256_16(const uint8_t *src0, const uint8_t *src1) {
|
||||
uint32_t len = 0;
|
||||
|
||||
do {
|
||||
if (zng_memcmp_2(src0, src1) != 0)
|
||||
/* (*src0 == *src1) is 1 when only the second byte of the pair differs. */
return len + (*src0 == *src1);
|
||||
src0 += 2, src1 += 2, len += 2;
|
||||
|
||||
if (zng_memcmp_2(src0, src1) != 0)
|
||||
return len + (*src0 == *src1);
|
||||
src0 += 2, src1 += 2, len += 2;
|
||||
|
||||
if (zng_memcmp_2(src0, src1) != 0)
|
||||
return len + (*src0 == *src1);
|
||||
src0 += 2, src1 += 2, len += 2;
|
||||
|
||||
if (zng_memcmp_2(src0, src1) != 0)
|
||||
return len + (*src0 == *src1);
|
||||
src0 += 2, src1 += 2, len += 2;
|
||||
} while (len < 256);
|
||||
|
||||
/* All 256 bytes matched. */
return 256;
|
||||
}
|
||||
|
||||
#ifdef HAVE_BUILTIN_CTZ
|
||||
/* 32-bit integer comparison.
 *
 * Reads one 32-bit value from each input per iteration (via zng_memread_4,
 * presumably an alignment-safe 4-byte load -- confirm in zmemory.h) and returns
 * the number of leading bytes that are equal, capped at 256.
 * The XOR of the two values is non-zero exactly when they differ; the position
 * of the first set bit in that XOR locates the first differing byte.
 */
|
||||
static inline uint32_t compare256_32(const uint8_t *src0, const uint8_t *src1) {
|
||||
uint32_t len = 0;
|
||||
|
||||
do {
|
||||
uint32_t sv, mv, diff;
|
||||
|
||||
sv = zng_memread_4(src0);
|
||||
mv = zng_memread_4(src1);
|
||||
|
||||
diff = sv ^ mv;
|
||||
if (diff) {
|
||||
/* Little-endian: lowest-addressed byte is in the low bits, so count
 * trailing zeros; otherwise count leading zeros. Dividing the bit
 * index by 8 converts it to a byte offset within the 4-byte word. */
# if BYTE_ORDER == LITTLE_ENDIAN
|
||||
uint32_t match_byte = __builtin_ctz(diff) / 8;
|
||||
# else
|
||||
uint32_t match_byte = __builtin_clz(diff) / 8;
|
||||
# endif
|
||||
return len + match_byte;
|
||||
}
|
||||
|
||||
src0 += 4, src1 += 4, len += 4;
|
||||
} while (len < 256);
|
||||
|
||||
/* All 256 bytes matched. */
return 256;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_BUILTIN_CTZLL
|
||||
/* 64-bit integer comparison.
 *
 * Reads one 64-bit value from each input per iteration (via zng_memread_8,
 * presumably an alignment-safe 8-byte load -- confirm in zmemory.h) and returns
 * the number of leading bytes that are equal, capped at 256.
 * The XOR of the two values is non-zero exactly when they differ; the position
 * of the first set bit in that XOR locates the first differing byte.
 */
|
||||
static inline uint32_t compare256_64(const uint8_t *src0, const uint8_t *src1) {
|
||||
uint32_t len = 0;
|
||||
|
||||
do {
|
||||
uint64_t sv, mv, diff;
|
||||
|
||||
sv = zng_memread_8(src0);
|
||||
mv = zng_memread_8(src1);
|
||||
|
||||
diff = sv ^ mv;
|
||||
if (diff) {
|
||||
/* Little-endian: lowest-addressed byte is in the low bits, so count
 * trailing zeros; otherwise count leading zeros. Dividing the bit
 * index by 8 converts it to a byte offset within the 8-byte word. */
# if BYTE_ORDER == LITTLE_ENDIAN
|
||||
uint64_t match_byte = __builtin_ctzll(diff) / 8;
|
||||
# else
|
||||
uint64_t match_byte = __builtin_clzll(diff) / 8;
|
||||
# endif
|
||||
/* match_byte is at most 7 here, so narrowing to uint32_t is lossless. */
return len + (uint32_t)match_byte;
|
||||
}
|
||||
|
||||
src0 += 8, src1 += 8, len += 8;
|
||||
} while (len < 256);
|
||||
|
||||
/* All 256 bytes matched. */
return 256;
|
||||
}
|
||||
#endif
|
@ -28,15 +28,6 @@ void inflate_fast_c(PREFIX3(stream) *strm, uint32_t start);
|
||||
uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len);
|
||||
|
||||
uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1);
|
||||
#if OPTIMAL_CMP >= 32
|
||||
uint32_t compare256_16(const uint8_t *src0, const uint8_t *src1);
|
||||
# ifdef HAVE_BUILTIN_CTZ
|
||||
uint32_t compare256_32(const uint8_t *src0, const uint8_t *src1);
|
||||
# endif
|
||||
# if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64
|
||||
uint32_t compare256_64(const uint8_t *src0, const uint8_t *src1);
|
||||
# endif
|
||||
#endif
|
||||
|
||||
typedef void (*slide_hash_func)(deflate_state *s);
|
||||
|
||||
@ -44,41 +35,6 @@ void slide_hash_c(deflate_state *s);
|
||||
|
||||
uint32_t longest_match_c(deflate_state *const s, Pos cur_match);
|
||||
uint32_t longest_match_slow_c(deflate_state *const s, Pos cur_match);
|
||||
#if OPTIMAL_CMP >= 32
|
||||
uint32_t longest_match_16(deflate_state *const s, Pos cur_match);
|
||||
uint32_t longest_match_slow_16(deflate_state *const s, Pos cur_match);
|
||||
# ifdef HAVE_BUILTIN_CTZ
|
||||
uint32_t longest_match_32(deflate_state *const s, Pos cur_match);
|
||||
uint32_t longest_match_slow_32(deflate_state *const s, Pos cur_match);
|
||||
# endif
|
||||
# if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64
|
||||
uint32_t longest_match_64(deflate_state *const s, Pos cur_match);
|
||||
uint32_t longest_match_slow_64(deflate_state *const s, Pos cur_match);
|
||||
# endif
|
||||
#endif
|
||||
|
||||
|
||||
// Select generic implementation for longest_match, longest_match_slow and compare256 functions.
|
||||
#if OPTIMAL_CMP >= 32
|
||||
# if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64
|
||||
# define longest_match_generic longest_match_64
|
||||
# define longest_match_slow_generic longest_match_slow_64
|
||||
# define compare256_generic compare256_64
|
||||
# elif defined(HAVE_BUILTIN_CTZ)
|
||||
# define longest_match_generic longest_match_32
|
||||
# define longest_match_slow_generic longest_match_slow_32
|
||||
# define compare256_generic compare256_32
|
||||
# else
|
||||
# define longest_match_generic longest_match_16
|
||||
# define longest_match_slow_generic longest_match_slow_16
|
||||
# define compare256_generic compare256_16
|
||||
# endif
|
||||
#else
|
||||
# define longest_match_generic longest_match_c
|
||||
# define longest_match_slow_generic longest_match_slow_c
|
||||
# define compare256_generic compare256_c
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef DISABLE_RUNTIME_CPU_DETECTION
|
||||
// Generic code
|
||||
@ -93,9 +49,9 @@ uint32_t longest_match_slow_c(deflate_state *const s, Pos cur_match);
|
||||
# define native_crc32_fold_reset crc32_fold_reset_c
|
||||
# define native_inflate_fast inflate_fast_c
|
||||
# define native_slide_hash slide_hash_c
|
||||
# define native_longest_match longest_match_generic
|
||||
# define native_longest_match_slow longest_match_slow_generic
|
||||
# define native_compare256 compare256_generic
|
||||
# define native_longest_match longest_match_c
|
||||
# define native_longest_match_slow longest_match_slow_c
|
||||
# define native_compare256 compare256_c
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
@ -6,12 +6,11 @@
|
||||
#include "zbuild.h"
|
||||
#include "zmemory.h"
|
||||
#include "fallback_builtins.h"
|
||||
#include "zendian.h"
|
||||
|
||||
typedef uint32_t (*compare256_rle_func)(const uint8_t* src0, const uint8_t* src1);
|
||||
|
||||
/* ALIGNED, byte comparison */
|
||||
static inline uint32_t compare256_rle_c(const uint8_t *src0, const uint8_t *src1) {
|
||||
/* 8-bit integer comparison */
|
||||
static inline uint32_t compare256_rle_8(const uint8_t *src0, const uint8_t *src1) {
|
||||
uint32_t len = 0;
|
||||
|
||||
do {
|
||||
@ -44,8 +43,7 @@ static inline uint32_t compare256_rle_c(const uint8_t *src0, const uint8_t *src1
|
||||
return 256;
|
||||
}
|
||||
|
||||
#if OPTIMAL_CMP >= 32
|
||||
/* 16-bit unaligned integer comparison */
|
||||
/* 16-bit integer comparison */
|
||||
static inline uint32_t compare256_rle_16(const uint8_t *src0, const uint8_t *src1) {
|
||||
uint32_t len = 0;
|
||||
uint16_t src0_cmp;
|
||||
@ -71,7 +69,7 @@ static inline uint32_t compare256_rle_16(const uint8_t *src0, const uint8_t *src
|
||||
}
|
||||
|
||||
#ifdef HAVE_BUILTIN_CTZ
|
||||
/* 32-bit unaligned integer comparison */
|
||||
/* 32-bit integer comparison */
|
||||
static inline uint32_t compare256_rle_32(const uint8_t *src0, const uint8_t *src1) {
|
||||
uint32_t sv, len = 0;
|
||||
uint16_t src0_cmp;
|
||||
@ -99,11 +97,10 @@ static inline uint32_t compare256_rle_32(const uint8_t *src0, const uint8_t *src
|
||||
|
||||
return 256;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64
|
||||
/* 64-bit unaligned integer comparison */
|
||||
#ifdef HAVE_BUILTIN_CTZLL
|
||||
/* 64-bit integer comparison */
|
||||
static inline uint32_t compare256_rle_64(const uint8_t *src0, const uint8_t *src1) {
|
||||
uint32_t src0_cmp32, len = 0;
|
||||
uint16_t src0_cmp;
|
||||
@ -133,8 +130,4 @@ static inline uint32_t compare256_rle_64(const uint8_t *src0, const uint8_t *src
|
||||
|
||||
return 256;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
|
@ -5,21 +5,19 @@
|
||||
*/
|
||||
|
||||
#include "zbuild.h"
|
||||
#include "compare256_rle.h"
|
||||
#include "deflate.h"
|
||||
#include "deflate_p.h"
|
||||
#include "functable.h"
|
||||
#include "compare256_rle.h"
|
||||
|
||||
#if OPTIMAL_CMP >= 32
|
||||
# if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64
|
||||
# define compare256_rle compare256_rle_64
|
||||
# elif defined(HAVE_BUILTIN_CTZ)
|
||||
# define compare256_rle compare256_rle_32
|
||||
# else
|
||||
# define compare256_rle compare256_rle_16
|
||||
# endif
|
||||
#if OPTIMAL_CMP == 8
|
||||
# define compare256_rle compare256_rle_8
|
||||
#elif defined(HAVE_BUILTIN_CTZLL)
|
||||
# define compare256_rle compare256_rle_64
|
||||
#elif defined(HAVE_BUILTIN_CTZ)
|
||||
# define compare256_rle compare256_rle_32
|
||||
#else
|
||||
# define compare256_rle compare256_rle_c
|
||||
# define compare256_rle compare256_rle_16
|
||||
#endif
|
||||
|
||||
/* ===========================================================================
|
||||
|
@ -61,9 +61,9 @@ static void init_functable(void) {
|
||||
ft.crc32_fold_reset = &crc32_fold_reset_c;
|
||||
ft.inflate_fast = &inflate_fast_c;
|
||||
ft.slide_hash = &slide_hash_c;
|
||||
ft.longest_match = &longest_match_generic;
|
||||
ft.longest_match_slow = &longest_match_slow_generic;
|
||||
ft.compare256 = &compare256_generic;
|
||||
ft.longest_match = &longest_match_c;
|
||||
ft.longest_match_slow = &longest_match_slow_c;
|
||||
ft.compare256 = &compare256_c;
|
||||
|
||||
// Select arch-optimized functions
|
||||
|
||||
|
45
match_tpl.h
45
match_tpl.h
@ -22,6 +22,9 @@
|
||||
* IN assertions: cur_match is the head of the hash chain for the current
|
||||
* string (strstart) and its distance is <= MAX_DIST, and prev_length >=1
|
||||
* OUT assertion: the match length is not greater than s->lookahead
|
||||
*
|
||||
* The LONGEST_MATCH_SLOW variant spends more time to attempt to find longer
|
||||
* matches once a match has already been found.
|
||||
*/
|
||||
Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) {
|
||||
unsigned int strstart = s->strstart;
|
||||
@ -40,15 +43,8 @@ Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) {
|
||||
uint32_t chain_length, nice_match, best_len, offset;
|
||||
uint32_t lookahead = s->lookahead;
|
||||
Pos match_offset = 0;
|
||||
#if OPTIMAL_CMP >= 64
|
||||
uint64_t scan_start;
|
||||
uint64_t scan_end;
|
||||
#elif OPTIMAL_CMP >= 32
|
||||
uint32_t scan_start;
|
||||
uint32_t scan_end;
|
||||
#else
|
||||
uint8_t scan_end[8];
|
||||
#endif
|
||||
|
||||
#define GOTO_NEXT_CHAIN \
|
||||
if (--chain_length && (cur_match = prev[cur_match & wmask]) > limit) \
|
||||
@ -64,26 +60,14 @@ Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) {
|
||||
* to find the next best match length.
|
||||
*/
|
||||
offset = best_len-1;
|
||||
#if OPTIMAL_CMP >= 32
|
||||
if (best_len >= sizeof(uint32_t)) {
|
||||
offset -= 2;
|
||||
#if OPTIMAL_CMP >= 64
|
||||
if (best_len >= sizeof(uint64_t))
|
||||
offset -= 4;
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
#if OPTIMAL_CMP >= 64
|
||||
scan_start = zng_memread_8(scan);
|
||||
scan_end = zng_memread_8(scan+offset);
|
||||
#elif OPTIMAL_CMP >= 32
|
||||
scan_start = zng_memread_4(scan);
|
||||
scan_end = zng_memread_4(scan+offset);
|
||||
#else
|
||||
scan_end[0] = *(scan+offset);
|
||||
scan_end[1] = *(scan+offset+1);
|
||||
#endif
|
||||
mbase_end = (mbase_start+offset);
|
||||
|
||||
/* Do not waste too much time if we already have a good match */
|
||||
@ -143,7 +127,6 @@ Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) {
|
||||
* that depend on those values. However the length of the match is limited to the
|
||||
* lookahead, so the output of deflate is not affected by the uninitialized values.
|
||||
*/
|
||||
#if OPTIMAL_CMP >= 32
|
||||
if (best_len < sizeof(uint32_t)) {
|
||||
for (;;) {
|
||||
if (zng_memcmp_2(mbase_end+cur_match, &scan_end) == 0 &&
|
||||
@ -151,7 +134,6 @@ Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) {
|
||||
break;
|
||||
GOTO_NEXT_CHAIN;
|
||||
}
|
||||
# if OPTIMAL_CMP >= 64
|
||||
} else if (best_len >= sizeof(uint64_t)) {
|
||||
for (;;) {
|
||||
if (zng_memcmp_8(mbase_end+cur_match, &scan_end) == 0 &&
|
||||
@ -159,7 +141,6 @@ Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) {
|
||||
break;
|
||||
GOTO_NEXT_CHAIN;
|
||||
}
|
||||
# endif
|
||||
} else {
|
||||
for (;;) {
|
||||
if (zng_memcmp_4(mbase_end+cur_match, &scan_end) == 0 &&
|
||||
@ -168,14 +149,6 @@ Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) {
|
||||
GOTO_NEXT_CHAIN;
|
||||
}
|
||||
}
|
||||
#else
|
||||
for (;;) {
|
||||
if (mbase_end[cur_match] == scan_end[0] && mbase_end[cur_match+1] == scan_end[1] &&
|
||||
mbase_start[cur_match] == scan[0] && mbase_start[cur_match+1] == scan[1])
|
||||
break;
|
||||
GOTO_NEXT_CHAIN;
|
||||
}
|
||||
#endif
|
||||
uint32_t len = COMPARE256(scan+2, mbase_start+cur_match+2) + 2;
|
||||
Assert(scan+len <= window+(unsigned)(s->window_size-1), "wild scan");
|
||||
|
||||
@ -191,24 +164,13 @@ Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) {
|
||||
return best_len;
|
||||
|
||||
offset = best_len-1;
|
||||
#if OPTIMAL_CMP >= 32
|
||||
if (best_len >= sizeof(uint32_t)) {
|
||||
offset -= 2;
|
||||
#if OPTIMAL_CMP >= 64
|
||||
if (best_len >= sizeof(uint64_t))
|
||||
offset -= 4;
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
#if OPTIMAL_CMP >= 64
|
||||
scan_end = zng_memread_8(scan+offset);
|
||||
#elif OPTIMAL_CMP >= 32
|
||||
scan_end = zng_memread_4(scan+offset);
|
||||
#else
|
||||
scan_end[0] = *(scan+offset);
|
||||
scan_end[1] = *(scan+offset+1);
|
||||
#endif
|
||||
|
||||
#ifdef LONGEST_MATCH_SLOW
|
||||
/* Look for a better string offset */
|
||||
@ -286,4 +248,3 @@ break_matching:
|
||||
|
||||
#undef LONGEST_MATCH_SLOW
|
||||
#undef LONGEST_MATCH
|
||||
#undef COMPARE256
|
||||
|
@ -12,6 +12,7 @@ extern "C" {
|
||||
# include "zutil_p.h"
|
||||
# include "arch_functions.h"
|
||||
# include "../test_cpu_features.h"
|
||||
# include "arch/generic/compare256_p.h"
|
||||
}
|
||||
|
||||
#define MAX_COMPARE_SIZE (256)
|
||||
@ -60,21 +61,19 @@ public:
|
||||
} \
|
||||
BENCHMARK_REGISTER_F(compare256, name)->Range(1, MAX_COMPARE_SIZE);
|
||||
|
||||
BENCHMARK_COMPARE256(c, compare256_c, 1);
|
||||
|
||||
#ifdef DISABLE_RUNTIME_CPU_DETECTION
|
||||
BENCHMARK_COMPARE256(native, native_compare256, 1);
|
||||
#else
|
||||
|
||||
#if BYTE_ORDER == LITTLE_ENDIAN && OPTIMAL_CMP >= 32
|
||||
BENCHMARK_COMPARE256(8, compare256_8, 1);
|
||||
BENCHMARK_COMPARE256(16, compare256_16, 1);
|
||||
# if defined(HAVE_BUILTIN_CTZ)
|
||||
#if defined(HAVE_BUILTIN_CTZ)
|
||||
BENCHMARK_COMPARE256(32, compare256_32, 1);
|
||||
# endif
|
||||
# if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64
|
||||
BENCHMARK_COMPARE256(64, compare256_64, 1);
|
||||
# endif
|
||||
#endif
|
||||
#if defined(HAVE_BUILTIN_CTZLL)
|
||||
BENCHMARK_COMPARE256(64, compare256_64, 1);
|
||||
#endif
|
||||
|
||||
#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
|
||||
BENCHMARK_COMPARE256(sse2, compare256_sse2, test_cpu_features.x86.has_sse2);
|
||||
#endif
|
||||
|
@ -59,14 +59,11 @@ public:
|
||||
} \
|
||||
BENCHMARK_REGISTER_F(compare256_rle, name)->Range(1, MAX_COMPARE_SIZE);
|
||||
|
||||
BENCHMARK_COMPARE256_RLE(c, compare256_rle_c, 1);
|
||||
|
||||
#if BYTE_ORDER == LITTLE_ENDIAN && OPTIMAL_CMP >= 32
|
||||
BENCHMARK_COMPARE256_RLE(8, compare256_rle_8, 1);
|
||||
BENCHMARK_COMPARE256_RLE(16, compare256_rle_16, 1);
|
||||
# if defined(HAVE_BUILTIN_CTZ)
|
||||
#if defined(HAVE_BUILTIN_CTZ)
|
||||
BENCHMARK_COMPARE256_RLE(32, compare256_rle_32, 1);
|
||||
# endif
|
||||
# if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64
|
||||
BENCHMARK_COMPARE256_RLE(64, compare256_rle_64, 1);
|
||||
# endif
|
||||
#endif
|
||||
#if defined(HAVE_BUILTIN_CTZLL)
|
||||
BENCHMARK_COMPARE256_RLE(64, compare256_rle_64, 1);
|
||||
#endif
|
||||
|
@ -12,6 +12,7 @@ extern "C" {
|
||||
# include "zutil.h"
|
||||
# include "arch_functions.h"
|
||||
# include "test_cpu_features.h"
|
||||
# include "arch/generic/compare256_p.h"
|
||||
}
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
@ -59,20 +60,17 @@ static inline void compare256_match_check(compare256_func compare256) {
|
||||
compare256_match_check(func); \
|
||||
}
|
||||
|
||||
TEST_COMPARE256(c, compare256_c, 1)
|
||||
|
||||
#ifdef DISABLE_RUNTIME_CPU_DETECTION
|
||||
TEST_COMPARE256(native, native_compare256, 1)
|
||||
#else
|
||||
|
||||
#if BYTE_ORDER == LITTLE_ENDIAN && OPTIMAL_CMP >= 32
|
||||
TEST_COMPARE256(8, compare256_8, 1)
|
||||
TEST_COMPARE256(16, compare256_16, 1)
|
||||
# if defined(HAVE_BUILTIN_CTZ)
|
||||
#if defined(HAVE_BUILTIN_CTZ)
|
||||
TEST_COMPARE256(32, compare256_32, 1)
|
||||
# endif
|
||||
# if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64
|
||||
#endif
|
||||
#if defined(HAVE_BUILTIN_CTZLL)
|
||||
TEST_COMPARE256(64, compare256_64, 1)
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
|
||||
|
@ -50,14 +50,11 @@ static inline void compare256_rle_match_check(compare256_rle_func compare256_rle
|
||||
compare256_rle_match_check(func); \
|
||||
}
|
||||
|
||||
TEST_COMPARE256_RLE(c, compare256_rle_c, 1)
|
||||
|
||||
#if BYTE_ORDER == LITTLE_ENDIAN && OPTIMAL_CMP >= 32
|
||||
TEST_COMPARE256_RLE(8, compare256_rle_8, 1)
|
||||
TEST_COMPARE256_RLE(16, compare256_rle_16, 1)
|
||||
# if defined(HAVE_BUILTIN_CTZ)
|
||||
#if defined(HAVE_BUILTIN_CTZ)
|
||||
TEST_COMPARE256_RLE(32, compare256_rle_32, 1)
|
||||
# endif
|
||||
# if defined(HAVE_BUILTIN_CTZLL) && OPTIMAL_CMP >= 64
|
||||
TEST_COMPARE256_RLE(64, compare256_rle_64, 1)
|
||||
# endif
|
||||
#endif
|
||||
#if defined(HAVE_BUILTIN_CTZLL)
|
||||
TEST_COMPARE256_RLE(64, compare256_rle_64, 1)
|
||||
#endif
|
||||
|
53
zbuild.h
53
zbuild.h
@ -243,36 +243,43 @@
|
||||
# define Tracecv(c, x)
|
||||
#endif
|
||||
|
||||
#if defined(__x86_64__) || defined(_M_X64) || defined(__amd64__) || defined(_M_AMD64)
|
||||
# define OPTIMAL_CMP 64
|
||||
#elif defined(__i386__) || defined(__i486__) || defined(__i586__) || \
|
||||
defined(__i686__) || defined(_X86_) || defined(_M_IX86)
|
||||
# define OPTIMAL_CMP 32
|
||||
#elif defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
|
||||
# if defined(__ARM_FEATURE_UNALIGNED) || defined(_WIN32)
|
||||
/* OPTIMAL_CMP values determine the comparison width:
|
||||
* 64: Best for 64-bit architectures with unaligned access
|
||||
* 32: Best for 32-bit architectures with unaligned access
|
||||
* 16: Safe default for unknown architectures
|
||||
* 8: Safe fallback for architectures without unaligned access
|
||||
* Note: "unaligned access" here refers to CPU support. It lets the compiler or
|
||||
* separate unaligned intrinsics perform safe unaligned accesses without
|
||||
* using unaligned C pointers, which are known to have undefined behavior.
|
||||
*/
|
||||
#if !defined(OPTIMAL_CMP)
|
||||
# if defined(__x86_64__) || defined(_M_X64) || defined(__amd64__) || defined(_M_AMD64)
|
||||
# define OPTIMAL_CMP 64
|
||||
# else
|
||||
# define OPTIMAL_CMP 8
|
||||
# endif
|
||||
#elif defined(__arm__) || defined(_M_ARM)
|
||||
# if defined(__ARM_FEATURE_UNALIGNED) || defined(_WIN32)
|
||||
# elif defined(__i386__) || defined(__i486__) || defined(__i586__) || \
|
||||
defined(__i686__) || defined(_X86_) || defined(_M_IX86)
|
||||
# define OPTIMAL_CMP 32
|
||||
# elif defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
|
||||
# if defined(__ARM_FEATURE_UNALIGNED) || defined(_WIN32)
|
||||
# define OPTIMAL_CMP 64
|
||||
# else
|
||||
# define OPTIMAL_CMP 8
|
||||
# endif
|
||||
# elif defined(__arm__) || defined(_M_ARM)
|
||||
# if defined(__ARM_FEATURE_UNALIGNED) || defined(_WIN32)
|
||||
# define OPTIMAL_CMP 32
|
||||
# else
|
||||
# define OPTIMAL_CMP 8
|
||||
# endif
|
||||
# elif defined(__powerpc64__) || defined(__ppc64__)
|
||||
# define OPTIMAL_CMP 64
|
||||
# elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__)
|
||||
# define OPTIMAL_CMP 32
|
||||
# else
|
||||
# define OPTIMAL_CMP 8
|
||||
# endif
|
||||
#elif defined(__powerpc64__) || defined(__ppc64__)
|
||||
# define OPTIMAL_CMP 64
|
||||
#elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__)
|
||||
# define OPTIMAL_CMP 32
|
||||
#endif
|
||||
#if defined(NO_UNALIGNED)
|
||||
# undef OPTIMAL_CMP
|
||||
#endif
|
||||
#if !defined(OPTIMAL_CMP)
|
||||
# define OPTIMAL_CMP 8
|
||||
# define OPTIMAL_CMP 16
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(__has_feature)
|
||||
# if __has_feature(address_sanitizer)
|
||||
# define Z_ADDRESS_SANITIZER 1
|
||||
|
@ -73,7 +73,7 @@ static inline void zng_memwrite_8(void *ptr, uint64_t val) {
|
||||
calls to unaligned comparisons when unaligned access is supported. Use memcmp only when
|
||||
unaligned support is not available to avoid an extra call to memcpy. */
|
||||
static inline int32_t zng_memcmp_2(const void *src0, const void *src1) {
|
||||
#if defined(HAVE_MAY_ALIAS) || OPTIMAL_CMP >= 16
|
||||
#if defined(HAVE_MAY_ALIAS)
|
||||
return zng_memread_2(src0) != zng_memread_2(src1);
|
||||
#else
|
||||
return memcmp(src0, src1, 2);
|
||||
@ -81,7 +81,7 @@ static inline int32_t zng_memcmp_2(const void *src0, const void *src1) {
|
||||
}
|
||||
|
||||
static inline int32_t zng_memcmp_4(const void *src0, const void *src1) {
|
||||
#if defined(HAVE_MAY_ALIAS) || OPTIMAL_CMP >= 32
|
||||
#if defined(HAVE_MAY_ALIAS)
|
||||
return zng_memread_4(src0) != zng_memread_4(src1);
|
||||
#else
|
||||
return memcmp(src0, src1, 4);
|
||||
@ -89,7 +89,7 @@ static inline int32_t zng_memcmp_4(const void *src0, const void *src1) {
|
||||
}
|
||||
|
||||
static inline int32_t zng_memcmp_8(const void *src0, const void *src1) {
|
||||
#if defined(HAVE_MAY_ALIAS) || OPTIMAL_CMP >= 64
|
||||
#if defined(HAVE_MAY_ALIAS)
|
||||
return zng_memread_8(src0) != zng_memread_8(src1);
|
||||
#else
|
||||
return memcmp(src0, src1, 8);
|
||||
|
Loading…
Reference in New Issue
Block a user