Mirror of https://github.com/GerbilSoft/zlib-ng.git

This is ~25-30% faster than the SSE2 variant on a Core 2 Quad. The main reason for this has to do with the fact that, while far fewer shifts are incurred, an entirely separate stack buffer has to be managed, one that is the size of the L1 cache on most CPUs. This was one of the main reasons the 32k specialized function was slower for the scalar counterpart despite auto-vectorizing. The auto-vectorized loop was setting up the stack buffer at unaligned offsets, which is detrimental to performance pre-Nehalem. Additionally, we were losing a fair bit of time to the zero initialization, which we now do more selectively. There are a ton of loads and stores happening, and we are almost certainly bound on the fill buffer + store forwarding.

An SSE2 version of this code is probably possible by simply replacing the shifts with unpacks with zero and the palignrs with shufpds; I'm just not sure it would be all that worth it. We gate on SSE4.1 not because we use an instruction specific to 4.1, but because that feature level marks the Wolfdale generation, when palignr became a lot faster.
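
To make the suggested substitutions concrete, here is a small, self-contained sketch (not from zlib-ng; the helper names are made up for illustration) showing the two instruction-level equivalences mentioned above for the 8-byte case: palignr can be replaced by the SSE2 shufpd, and whole-register byte shifts can be rewritten as unpacks with a zero register. It demonstrates only the equivalences, not the Chorba kernel itself; build with something like gcc -mssse3 so the palignr reference is available for comparison.

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <emmintrin.h>  /* SSE2 */
#include <tmmintrin.h>  /* SSSE3, for _mm_alignr_epi8 (palignr) */

/* Concatenate-and-extract at an 8-byte offset:
 * palignr(hi, lo, 8) yields { high qword of lo, low qword of hi }.
 * The SSE2 shufpd with immediate 1 selects exactly the same two qwords. */
static __m128i concat8_ssse3(__m128i hi, __m128i lo) {
    return _mm_alignr_epi8(hi, lo, 8);
}

static __m128i concat8_sse2(__m128i hi, __m128i lo) {
    return _mm_castpd_si128(
        _mm_shuffle_pd(_mm_castsi128_pd(lo), _mm_castsi128_pd(hi), 1));
}

/* Whole-register byte shifts expressed as unpacks with a zero register:
 * slli_si128(x, 8) == unpacklo_epi64(zero, x)
 * srli_si128(x, 8) == unpackhi_epi64(x, zero) */
static __m128i shl8_unpack(__m128i x) {
    return _mm_unpacklo_epi64(_mm_setzero_si128(), x);
}

static __m128i shr8_unpack(__m128i x) {
    return _mm_unpackhi_epi64(x, _mm_setzero_si128());
}

int main(void) {
    uint8_t a[16], b[16];
    for (int i = 0; i < 16; i++) { a[i] = (uint8_t)i; b[i] = (uint8_t)(0x80 + i); }

    __m128i lo = _mm_loadu_si128((const __m128i *)a);
    __m128i hi = _mm_loadu_si128((const __m128i *)b);

    uint8_t r1[16], r2[16];
    _mm_storeu_si128((__m128i *)r1, concat8_ssse3(hi, lo));
    _mm_storeu_si128((__m128i *)r2, concat8_sse2(hi, lo));
    printf("palignr == shufpd:      %s\n", memcmp(r1, r2, 16) == 0 ? "yes" : "no");

    _mm_storeu_si128((__m128i *)r1, _mm_slli_si128(lo, 8));
    _mm_storeu_si128((__m128i *)r2, shl8_unpack(lo));
    printf("slli_si128 == unpacklo: %s\n", memcmp(r1, r2, 16) == 0 ? "yes" : "no");

    _mm_storeu_si128((__m128i *)r1, _mm_srli_si128(lo, 8));
    _mm_storeu_si128((__m128i *)r2, shr8_unpack(lo));
    printf("srli_si128 == unpackhi: %s\n", memcmp(r1, r2, 16) == 0 ? "yes" : "no");
    return 0;
}

Note that shufpd only works at 8-byte granularity, so this substitution applies only where the palignr offsets happen to be multiples of 8; arbitrary byte offsets would instead need a pair of byte shifts plus an OR on SSE2.
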
/* x86_functions.h -- x86 implementations for arch-specific functions.
 * Copyright (C) 2013 Intel Corporation Jim Kukunas
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifndef X86_FUNCTIONS_H_
#define X86_FUNCTIONS_H_

/* So great news, your compiler is broken and causes stack smashing. Rather than
 * notching out its compilation we'll just remove the assignment in the functable.
 * Further context:
 * https://developercommunity.visualstudio.com/t/Stack-corruption-with-v142-toolchain-whe/10853479 */
#if defined(_MSC_VER) && !defined(_M_AMD64) && _MSC_VER >= 1920 && _MSC_VER <= 1929
#  define NO_CHORBA_SSE
#endif

#ifdef X86_SSE2
uint32_t chunksize_sse2(void);
uint8_t* chunkmemset_safe_sse2(uint8_t *out, uint8_t *from, unsigned len, unsigned left);

#  ifdef HAVE_BUILTIN_CTZ
    uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1);
    uint32_t longest_match_sse2(deflate_state *const s, Pos cur_match);
    uint32_t longest_match_slow_sse2(deflate_state *const s, Pos cur_match);
    void slide_hash_sse2(deflate_state *s);
#  endif
void inflate_fast_sse2(PREFIX3(stream)* strm, uint32_t start);
#  if !defined(WITHOUT_CHORBA)
    uint32_t crc32_chorba_sse2(uint32_t crc32, const uint8_t *buf, size_t len);
#  endif
#endif

#ifdef X86_SSSE3
uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len);
uint8_t* chunkmemset_safe_ssse3(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
void inflate_fast_ssse3(PREFIX3(stream) *strm, uint32_t start);
#endif

#ifdef X86_SSE41
#  if !defined(WITHOUT_CHORBA)
    uint32_t crc32_chorba_sse41(uint32_t crc32, const uint8_t *buf, size_t len);
#  endif
#endif

#ifdef X86_SSE42
uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif

#ifdef X86_AVX2
uint32_t adler32_avx2(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
uint32_t chunksize_avx2(void);
uint8_t* chunkmemset_safe_avx2(uint8_t *out, uint8_t *from, unsigned len, unsigned left);

#  ifdef HAVE_BUILTIN_CTZ
    uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1);
    uint32_t longest_match_avx2(deflate_state *const s, Pos cur_match);
    uint32_t longest_match_slow_avx2(deflate_state *const s, Pos cur_match);
    void slide_hash_avx2(deflate_state *s);
#  endif
void inflate_fast_avx2(PREFIX3(stream)* strm, uint32_t start);
#endif
#ifdef X86_AVX512
uint32_t adler32_avx512(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
uint32_t chunksize_avx512(void);
uint8_t* chunkmemset_safe_avx512(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
void inflate_fast_avx512(PREFIX3(stream)* strm, uint32_t start);
#  ifdef HAVE_BUILTIN_CTZLL
    uint32_t compare256_avx512(const uint8_t *src0, const uint8_t *src1);
    uint32_t longest_match_avx512(deflate_state *const s, Pos cur_match);
    uint32_t longest_match_slow_avx512(deflate_state *const s, Pos cur_match);
#  endif
#endif
#ifdef X86_AVX512VNNI
uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif

#ifdef X86_PCLMULQDQ_CRC
uint32_t crc32_fold_pclmulqdq_reset(crc32_fold *crc);
void crc32_fold_pclmulqdq_copy(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
void crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
uint32_t crc32_fold_pclmulqdq_final(crc32_fold *crc);
uint32_t crc32_pclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
#endif
#ifdef X86_VPCLMULQDQ_CRC
uint32_t crc32_fold_vpclmulqdq_reset(crc32_fold *crc);
void crc32_fold_vpclmulqdq_copy(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
void crc32_fold_vpclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
uint32_t crc32_fold_vpclmulqdq_final(crc32_fold *crc);
uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
#endif

#ifdef DISABLE_RUNTIME_CPU_DETECTION
// X86 - SSE2
#  if (defined(X86_SSE2) && defined(__SSE2__)) || defined(__x86_64__) || defined(_M_X64) || defined(X86_NOCHECK_SSE2)
#    undef native_chunkmemset_safe
#    define native_chunkmemset_safe chunkmemset_safe_sse2
#    undef native_chunksize
#    define native_chunksize chunksize_sse2
#    undef native_inflate_fast
#    define native_inflate_fast inflate_fast_sse2
#    undef native_slide_hash
#    define native_slide_hash slide_hash_sse2
#    ifdef HAVE_BUILTIN_CTZ
#      undef native_compare256
#      define native_compare256 compare256_sse2
#      undef native_longest_match
#      define native_longest_match longest_match_sse2
#      undef native_longest_match_slow
#      define native_longest_match_slow longest_match_slow_sse2
#      if !defined(WITHOUT_CHORBA) && !defined(NO_CHORBA_SSE)
#        undef native_crc32
#        define native_crc32 crc32_chorba_sse2
#      endif
#    endif
#  endif
// X86 - SSSE3
#  if defined(X86_SSSE3) && defined(__SSSE3__)
#    undef native_adler32
#    define native_adler32 adler32_ssse3
#    undef native_chunkmemset_safe
#    define native_chunkmemset_safe chunkmemset_safe_ssse3
#    undef native_inflate_fast
#    define native_inflate_fast inflate_fast_ssse3
#  endif
#  if !defined(WITHOUT_CHORBA) && defined(X86_SSE41) && defined(__SSE4_1__) && !defined(NO_CHORBA_SSE)
#    undef native_crc32
#    define native_crc32 crc32_chorba_sse41
#  endif
// X86 - SSE4.2
#  if defined(X86_SSE42) && defined(__SSE4_2__)
#    undef native_adler32_fold_copy
#    define native_adler32_fold_copy adler32_fold_copy_sse42
#  endif

// X86 - PCLMUL
#  if defined(X86_PCLMULQDQ_CRC) && defined(__PCLMUL__)
#    undef native_crc32
#    define native_crc32 crc32_pclmulqdq
#    undef native_crc32_fold
#    define native_crc32_fold crc32_fold_pclmulqdq
#    undef native_crc32_fold_copy
#    define native_crc32_fold_copy crc32_fold_pclmulqdq_copy
#    undef native_crc32_fold_final
#    define native_crc32_fold_final crc32_fold_pclmulqdq_final
#    undef native_crc32_fold_reset
#    define native_crc32_fold_reset crc32_fold_pclmulqdq_reset
#  endif
// X86 - AVX
#  if defined(X86_AVX2) && defined(__AVX2__)
#    undef native_adler32
#    define native_adler32 adler32_avx2
#    undef native_adler32_fold_copy
#    define native_adler32_fold_copy adler32_fold_copy_avx2
#    undef native_chunkmemset_safe
#    define native_chunkmemset_safe chunkmemset_safe_avx2
#    undef native_chunksize
#    define native_chunksize chunksize_avx2
#    undef native_inflate_fast
#    define native_inflate_fast inflate_fast_avx2
#    undef native_slide_hash
#    define native_slide_hash slide_hash_avx2
#    ifdef HAVE_BUILTIN_CTZ
#      undef native_compare256
#      define native_compare256 compare256_avx2
#      undef native_longest_match
#      define native_longest_match longest_match_avx2
#      undef native_longest_match_slow
#      define native_longest_match_slow longest_match_slow_avx2
#    endif
#  endif

// X86 - AVX512 (F,DQ,BW,VL)
#  if defined(X86_AVX512) && defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__)
#    undef native_adler32
#    define native_adler32 adler32_avx512
#    undef native_adler32_fold_copy
#    define native_adler32_fold_copy adler32_fold_copy_avx512
#    undef native_chunkmemset_safe
#    define native_chunkmemset_safe chunkmemset_safe_avx512
#    undef native_chunksize
#    define native_chunksize chunksize_avx512
#    undef native_inflate_fast
#    define native_inflate_fast inflate_fast_avx512
#    ifdef HAVE_BUILTIN_CTZLL
#      undef native_compare256
#      define native_compare256 compare256_avx512
#      undef native_longest_match
#      define native_longest_match longest_match_avx512
#      undef native_longest_match_slow
#      define native_longest_match_slow longest_match_slow_avx512
#    endif
// X86 - AVX512 (VNNI)
#    if defined(X86_AVX512VNNI) && defined(__AVX512VNNI__)
#      undef native_adler32
#      define native_adler32 adler32_avx512_vnni
#      undef native_adler32_fold_copy
#      define native_adler32_fold_copy adler32_fold_copy_avx512_vnni
#    endif
// X86 - VPCLMULQDQ
#    if defined(__PCLMUL__) && defined(__AVX512F__) && defined(__VPCLMULQDQ__)
#      undef native_crc32
#      define native_crc32 crc32_vpclmulqdq
#      undef native_crc32_fold
#      define native_crc32_fold crc32_fold_vpclmulqdq
#      undef native_crc32_fold_copy
#      define native_crc32_fold_copy crc32_fold_vpclmulqdq_copy
#      undef native_crc32_fold_final
#      define native_crc32_fold_final crc32_fold_vpclmulqdq_final
#      undef native_crc32_fold_reset
#      define native_crc32_fold_reset crc32_fold_vpclmulqdq_reset
#    endif
#  endif
#endif

#endif /* X86_FUNCTIONS_H_ */
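
As a footnote on the DISABLE_RUNTIME_CPU_DETECTION block above: each `#  if defined(X86_...) && defined(__...__)` ladder re-points the generic native_* names at the most capable implementation the compiler is allowed to assume, so calls bind at compile time instead of going through the runtime functable. The standalone sketch below mirrors that #undef/#define pattern with made-up names (fastpath_crc32, crc32_generic, crc32_simd); it illustrates the technique only and is not zlib-ng's actual functable wiring.

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>
#include <inttypes.h>

/* Baseline implementation, always available (plain reflected CRC-32). */
static uint32_t crc32_generic(uint32_t crc, const uint8_t *buf, size_t len) {
    for (size_t i = 0; i < len; i++) {
        crc ^= buf[i];
        for (int k = 0; k < 8; k++)
            crc = (crc >> 1) ^ (0xEDB88320u & (0u - (crc & 1u)));
    }
    return crc;
}

/* Stand-in for a vectorized variant; here it simply defers to the generic one. */
static uint32_t crc32_simd(uint32_t crc, const uint8_t *buf, size_t len) {
    return crc32_generic(crc, buf, len);
}

/* Default binding, then conditionally re-point it, mirroring the
 * native_* #undef/#define ladder in x86_functions.h. */
#define fastpath_crc32 crc32_generic

#if defined(__SSE2__) || defined(__x86_64__) || defined(_M_X64)
#  undef fastpath_crc32
#  define fastpath_crc32 crc32_simd
#endif

int main(void) {
    const uint8_t msg[] = "hello";
    /* Resolves at compile time to whichever implementation was selected above. */
    uint32_t crc = fastpath_crc32(0xFFFFFFFFu, msg, sizeof(msg) - 1) ^ 0xFFFFFFFFu;
    printf("crc32 = 0x%08" PRIx32 "\n", crc);
    return 0;
}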