diff --git a/CMakeLists.txt b/CMakeLists.txt
index fe83ceb9..92dc2d4d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -981,8 +981,10 @@ if(WITH_OPTIM)
             add_definitions(-DX86_AVX512)
             list(APPEND AVX512_SRCS ${ARCHDIR}/adler32_avx512.c)
             add_feature_info(AVX512_ADLER32 1 "Support AVX512-accelerated adler32, using \"${AVX512FLAG}\"")
-            list(APPEND ZLIB_ARCH_SRCS ${AVX512_SRCS})
+            list(APPEND AVX512_SRCS ${ARCHDIR}/chunkset_avx512.c)
+            add_feature_info(AVX512_CHUNKSET 1 "Support AVX512 optimized chunkset, using \"${AVX512FLAG}\"")
             list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/adler32_avx512_p.h)
+            list(APPEND ZLIB_ARCH_SRCS ${AVX512_SRCS})
             set_property(SOURCE ${AVX512_SRCS} PROPERTY COMPILE_FLAGS "${AVX512FLAG} ${NOLTOFLAG}")
         else()
             set(WITH_AVX512 OFF)
diff --git a/arch/x86/Makefile.in b/arch/x86/Makefile.in
index c13cd179..a012e61e 100644
--- a/arch/x86/Makefile.in
+++ b/arch/x86/Makefile.in
@@ -8,8 +8,8 @@ SFLAGS=
 INCLUDES=
 SUFFIX=
 
-AVX512FLAG=-mavx512f -mavx512dq -mavx512vl -mavx512bw
-AVX512VNNIFLAG=-mavx512vnni
+AVX512FLAG=-mavx512f -mavx512dq -mavx512vl -mavx512bw -mbmi2
+AVX512VNNIFLAG=-mavx512vnni -mbmi2
 AVX2FLAG=-mavx2
 SSE2FLAG=-msse2
 SSSE3FLAG=-mssse3
@@ -31,6 +31,7 @@ all: \
 	adler32_sse42.o adler32_sse42.lo \
 	adler32_ssse3.o adler32_ssse3.lo \
 	chunkset_avx2.o chunkset_avx2.lo \
+	chunkset_avx512.o chunkset_avx512.lo \
 	chunkset_sse2.o chunkset_sse2.lo \
 	chunkset_ssse3.o chunkset_ssse3.lo \
 	compare256_avx2.o compare256_avx2.lo \
@@ -52,6 +53,12 @@ chunkset_avx2.o:
 chunkset_avx2.lo:
 	$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx2.c
 
+chunkset_avx512.o:
+	$(CC) $(CFLAGS) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx512.c
+
+chunkset_avx512.lo:
+	$(CC) $(SFLAGS) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx512.c
+
 chunkset_sse2.o:
 	$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse2.c
 
diff --git a/arch/x86/avx2_tables.h b/arch/x86/avx2_tables.h
new file mode 100644
index 00000000..50759993
--- /dev/null
+++ b/arch/x86/avx2_tables.h
@@ -0,0 +1,44 @@
+#ifndef _AVX2_TABLES_H
+#define _AVX2_TABLES_H
+
+#include "../generic/chunk_permute_table.h"
+
+/* Populate don't cares so that this is a direct lookup (with some indirection into the permute table), because dist can
+ * never be 0 - 2, we'll start with an offset, subtracting 3 from the input */
+static const lut_rem_pair perm_idx_lut[29] = {
+    { 0, 2},                /* 3 */
+    { 0, 0},                /* don't care */
+    { 1 * 32, 2},           /* 5 */
+    { 2 * 32, 2},           /* 6 */
+    { 3 * 32, 4},           /* 7 */
+    { 0 * 32, 0},           /* don't care */
+    { 4 * 32, 5},           /* 9 */
+    { 5 * 32, 22},          /* 10 */
+    { 6 * 32, 21},          /* 11 */
+    { 7 * 32, 20},          /* 12 */
+    { 8 * 32, 6},           /* 13 */
+    { 9 * 32, 4},           /* 14 */
+    {10 * 32, 2},           /* 15 */
+    { 0 * 32, 0},           /* don't care */
+    {11 * 32, 15},          /* 17 */
+    {11 * 32 + 16, 14},     /* 18 */
+    {11 * 32 + 16 * 2, 13}, /* 19 */
+    {11 * 32 + 16 * 3, 12}, /* 20 */
+    {11 * 32 + 16 * 4, 11}, /* 21 */
+    {11 * 32 + 16 * 5, 10}, /* 22 */
+    {11 * 32 + 16 * 6, 9},  /* 23 */
+    {11 * 32 + 16 * 7, 8},  /* 24 */
+    {11 * 32 + 16 * 8, 7},  /* 25 */
+    {11 * 32 + 16 * 9, 6},  /* 26 */
+    {11 * 32 + 16 * 10, 5}, /* 27 */
+    {11 * 32 + 16 * 11, 4}, /* 28 */
+    {11 * 32 + 16 * 12, 3}, /* 29 */
+    {11 * 32 + 16 * 13, 2}, /* 30 */
+    {11 * 32 + 16 * 14, 1}  /* 31 */
+};
+
+static const uint16_t half_rem_vals[13] = {
+    1, 0, 1, 4, 2, 0, 7, 6, 5, 4, 3, 2, 1
+};
+
+#endif
diff --git a/arch/x86/chunkset_avx2.c b/arch/x86/chunkset_avx2.c
index 8cc17103..8f29e5b2 100644
--- a/arch/x86/chunkset_avx2.c
+++ b/arch/x86/chunkset_avx2.c
@@ -4,8 +4,8 @@
 #include "zbuild.h"
 
 #ifdef X86_AVX2
+#include "avx2_tables.h"
 #include <immintrin.h>
-#include "../generic/chunk_permute_table.h"
 #include "x86_intrins.h"
 
 typedef __m256i chunk_t;
@@ -19,44 +19,6 @@ typedef __m128i halfchunk_t;
 #define HAVE_CHUNK_MAG
 #define HAVE_HALF_CHUNK
 
-/* Populate don't cares so that this is a direct lookup (with some indirection into the permute table), because dist can
- * never be 0 - 2, we'll start with an offset, subtracting 3 from the input */
-static const lut_rem_pair perm_idx_lut[29] = {
-    { 0, 2},                /* 3 */
-    { 0, 0},                /* don't care */
-    { 1 * 32, 2},           /* 5 */
-    { 2 * 32, 2},           /* 6 */
-    { 3 * 32, 4},           /* 7 */
-    { 0 * 32, 0},           /* don't care */
-    { 4 * 32, 5},           /* 9 */
-    { 5 * 32, 22},          /* 10 */
-    { 6 * 32, 21},          /* 11 */
-    { 7 * 32, 20},          /* 12 */
-    { 8 * 32, 6},           /* 13 */
-    { 9 * 32, 4},           /* 14 */
-    {10 * 32, 2},           /* 15 */
-    { 0 * 32, 0},           /* don't care */
-    {11 * 32, 15},          /* 17 */
-    {11 * 32 + 16, 14},     /* 18 */
-    {11 * 32 + 16 * 2, 13}, /* 19 */
-    {11 * 32 + 16 * 3, 12}, /* 20 */
-    {11 * 32 + 16 * 4, 11}, /* 21 */
-    {11 * 32 + 16 * 5, 10}, /* 22 */
-    {11 * 32 + 16 * 6, 9},  /* 23 */
-    {11 * 32 + 16 * 7, 8},  /* 24 */
-    {11 * 32 + 16 * 8, 7},  /* 25 */
-    {11 * 32 + 16 * 9, 6},  /* 26 */
-    {11 * 32 + 16 * 10, 5}, /* 27 */
-    {11 * 32 + 16 * 11, 4}, /* 28 */
-    {11 * 32 + 16 * 12, 3}, /* 29 */
-    {11 * 32 + 16 * 13, 2}, /* 30 */
-    {11 * 32 + 16 * 14, 1}  /* 31 */
-};
-
-static const uint16_t half_rem_vals[13] = {
-    1, 0, 1, 4, 2, 0, 7, 6, 5, 4, 3, 2, 1
-};
-
 static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
     int16_t tmp;
     memcpy(&tmp, from, sizeof(tmp));
diff --git a/arch/x86/chunkset_avx512.c b/arch/x86/chunkset_avx512.c
new file mode 100644
index 00000000..551df029
--- /dev/null
+++ b/arch/x86/chunkset_avx512.c
@@ -0,0 +1,189 @@
+/* chunkset_avx512.c -- AVX512 inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#include "zbuild.h"
+
+#ifdef X86_AVX512
+
+#include "avx2_tables.h"
+#include <immintrin.h>
+#include "x86_intrins.h"
+
+typedef __m256i chunk_t;
+typedef __m128i halfchunk_t;
+typedef __mmask32 mask_t;
+typedef __mmask16 halfmask_t;
+
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+#define HAVE_CHUNKMEMSET_16
+#define HAVE_CHUNKMEMSET_1
+#define HAVE_CHUNK_MAG
+#define HAVE_HALF_CHUNK
+#define HAVE_MASKED_READWRITE
+#define HAVE_CHUNKCOPY
+#define HAVE_HALFCHUNKCOPY
+
+static inline halfmask_t gen_half_mask(unsigned len) {
+    return (halfmask_t)_bzhi_u32(0xFFFF, len);
+}
+
+static inline mask_t gen_mask(unsigned len) {
+    return (mask_t)_bzhi_u32(0xFFFFFFFF, len);
+}
+
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+    int16_t tmp;
+    memcpy(&tmp, from, sizeof(tmp));
+    *chunk = _mm256_set1_epi16(tmp);
+}
+
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+    int32_t tmp;
+    memcpy(&tmp, from, sizeof(tmp));
+    *chunk = _mm256_set1_epi32(tmp);
+}
+
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+    int64_t tmp;
+    memcpy(&tmp, from, sizeof(tmp));
+    *chunk = _mm256_set1_epi64x(tmp);
+}
+
+static inline void chunkmemset_16(uint8_t *from, chunk_t *chunk) {
+    *chunk = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)from));
+}
+
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+    *chunk = _mm256_loadu_si256((__m256i *)s);
+}
+
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+    _mm256_storeu_si256((__m256i *)out, *chunk);
+}
+
+static inline void storechunk_mask(uint8_t *out, mask_t mask, chunk_t *chunk) {
+    _mm256_mask_storeu_epi8(out, mask, *chunk);
+}
+
+static inline uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
+    Assert(len > 0, "chunkcopy should never have a length 0");
+
+    unsigned rem = len % sizeof(chunk_t);
+    mask_t rem_mask = gen_mask(rem);
+
+    /* Since this is only ever called if dist >= a chunk, we don't need a masked load */
+    chunk_t chunk;
+    loadchunk(from, &chunk);
+    _mm256_mask_storeu_epi8(out, rem_mask, chunk);
+    out += rem;
+    from += rem;
+    len -= rem;
+
+    while (len > 0) {
+        loadchunk(from, &chunk);
+        storechunk(out, &chunk);
+        out += sizeof(chunk_t);
+        from += sizeof(chunk_t);
+        len -= sizeof(chunk_t);
+    }
+
+    return out;
+}
+
+static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
+    lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
+    __m256i ret_vec;
+    *chunk_rem = lut_rem.remval;
+
+    /* See the AVX2 implementation for more detailed comments. This is that + some masked
+     * loads to avoid an out of bounds read on the heap */
+
+    if (dist < 16) {
+        const __m256i permute_xform =
+            _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                             16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16);
+        __m256i perm_vec = _mm256_load_si256((__m256i*)(permute_table+lut_rem.idx));
+        halfmask_t load_mask = gen_half_mask(dist);
+        __m128i ret_vec0 = _mm_maskz_loadu_epi8(load_mask, buf);
+        perm_vec = _mm256_add_epi8(perm_vec, permute_xform);
+        ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
+        ret_vec = _mm256_shuffle_epi8(ret_vec, perm_vec);
+    } else {
+        halfmask_t load_mask = gen_half_mask(dist - 16);
+        __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
+        __m128i ret_vec1 = _mm_maskz_loadu_epi8(load_mask, (__m128i*)(buf + 16));
+        __m128i perm_vec1 = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
+        halfmask_t xlane_mask = _mm_cmp_epi8_mask(perm_vec1, _mm_set1_epi8(15), _MM_CMPINT_LE);
+        __m128i latter_half = _mm_mask_shuffle_epi8(ret_vec1, xlane_mask, ret_vec0, perm_vec1);
+        ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), latter_half, 1);
+    }
+
+    return ret_vec;
+}
+
+static inline void loadhalfchunk(uint8_t const *s, halfchunk_t *chunk) {
+    *chunk = _mm_loadu_si128((__m128i *)s);
+}
+
+static inline void storehalfchunk(uint8_t *out, halfchunk_t *chunk) {
+    _mm_storeu_si128((__m128i *)out, *chunk);
+}
+
+static inline chunk_t halfchunk2whole(halfchunk_t *chunk) {
+    /* We zero extend mostly to appease some memory sanitizers. These bytes are ultimately
+     * unlikely to be actually written or read from */
+    return _mm256_zextsi128_si256(*chunk);
+}
+
+static inline halfchunk_t GET_HALFCHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
+    lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
+    __m128i perm_vec, ret_vec;
+    halfmask_t load_mask = gen_half_mask(dist);
+    ret_vec = _mm_maskz_loadu_epi8(load_mask, buf);
+    *chunk_rem = half_rem_vals[dist - 3];
+
+    perm_vec = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
+    ret_vec = _mm_shuffle_epi8(ret_vec, perm_vec);
+
+    return ret_vec;
+}
+
+static inline uint8_t* HALFCHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
+    Assert(len > 0, "chunkcopy should never have a length 0");
+
+    unsigned rem = len % sizeof(halfchunk_t);
+    halfmask_t rem_mask = gen_half_mask(rem);
+
+    /* Since this is only ever called if dist >= a chunk, we don't need a masked load */
+    halfchunk_t chunk;
+    loadhalfchunk(from, &chunk);
+    _mm_mask_storeu_epi8(out, rem_mask, chunk);
+    out += rem;
+    from += rem;
+    len -= rem;
+
+    while (len > 0) {
+        loadhalfchunk(from, &chunk);
+        storehalfchunk(out, &chunk);
+        out += sizeof(halfchunk_t);
+        from += sizeof(halfchunk_t);
+        len -= sizeof(halfchunk_t);
+    }
+
+    return out;
+}
+
+#define CHUNKSIZE        chunksize_avx512
+#define CHUNKUNROLL      chunkunroll_avx512
+#define CHUNKMEMSET      chunkmemset_avx512
+#define CHUNKMEMSET_SAFE chunkmemset_safe_avx512
+
+#include "chunkset_tpl.h"
+
+#define INFLATE_FAST inflate_fast_avx512
+
+#include "inffast_tpl.h"
+
+#endif
diff --git a/arch/x86/x86_features.c b/arch/x86/x86_features.c
index 58cb4df3..9491a007 100644
--- a/arch/x86/x86_features.c
+++ b/arch/x86/x86_features.c
@@ -97,6 +97,8 @@ void Z_INTERNAL x86_check_features(struct x86_cpu_features *features) {
             features->has_avx2 = ebx & 0x20;
         }
 
+        features->has_bmi2 = ebx & 0x8;
+
         // check AVX512 bits if the OS supports saving ZMM registers
         if (features->has_os_save_zmm) {
             features->has_avx512f = ebx & 0x00010000;
@@ -108,7 +110,7 @@ void Z_INTERNAL x86_check_features(struct x86_cpu_features *features) {
             features->has_avx512vl = ebx & 0x80000000;
         }
         features->has_avx512_common = features->has_avx512f && features->has_avx512dq && features->has_avx512bw \
-                                      && features->has_avx512vl;
+                                      && features->has_avx512vl && features->has_bmi2;
         features->has_avx512vnni = ecx & 0x800;
     }
 }
diff --git a/arch/x86/x86_features.h b/arch/x86/x86_features.h
index 6daa5e38..3901ad75 100644
--- a/arch/x86/x86_features.h
+++ b/arch/x86/x86_features.h
@@ -14,6 +14,7 @@ struct x86_cpu_features {
     int has_avx512vl;
     int has_avx512_common; // Enabled when AVX512(F,DQ,BW,VL) are all enabled.
     int has_avx512vnni;
+    int has_bmi2;
     int has_sse2;
     int has_ssse3;
     int has_sse42;
diff --git a/arch/x86/x86_functions.h b/arch/x86/x86_functions.h
index 5f8fcf63..fc62daea 100644
--- a/arch/x86/x86_functions.h
+++ b/arch/x86/x86_functions.h
@@ -46,6 +46,9 @@ uint8_t* chunkmemset_safe_avx2(uint8_t *out, uint8_t *from, unsigned len, unsign
 #ifdef X86_AVX512
 uint32_t adler32_avx512(uint32_t adler, const uint8_t *buf, size_t len);
 uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+uint32_t chunksize_avx512(void);
+uint8_t* chunkmemset_safe_avx512(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
+void inflate_fast_avx512(PREFIX3(stream)* strm, uint32_t start);
 #endif
 #ifdef X86_AVX512VNNI
 uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *buf, size_t len);
@@ -146,6 +149,12 @@ uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
 #    define native_adler32 adler32_avx512
 #    undef native_adler32_fold_copy
 #    define native_adler32_fold_copy adler32_fold_copy_avx512
+#    undef native_chunkmemset_safe
+#    define native_chunkmemset_safe chunkmemset_safe_avx512
+#    undef native_chunksize
+#    define native_chunksize chunksize_avx512
+#    undef native_inflate_fast
+#    define native_inflate_fast inflate_fast_avx512
 // X86 - AVX512 (VNNI)
 #  if defined(X86_AVX512VNNI) && defined(__AVX512VNNI__)
 #    undef native_adler32
diff --git a/chunkset_tpl.h b/chunkset_tpl.h
index fc9f755e..5af1fbe8 100644
--- a/chunkset_tpl.h
+++ b/chunkset_tpl.h
@@ -4,7 +4,6 @@
 
 #include "zbuild.h"
 #include <stdlib.h>
-#include
 
 /* Returns the chunk size */
 Z_INTERNAL uint32_t CHUNKSIZE(void) {
@@ -88,7 +87,7 @@
 }
 #endif
 
-#ifdef HAVE_HALF_CHUNK
+#if defined(HAVE_HALF_CHUNK) && !defined(HAVE_HALFCHUNKCOPY)
 static inline uint8_t* HALFCHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
     halfchunk_t chunk;
     int32_t align = ((len - 1) % sizeof(halfchunk_t)) + 1;
@@ -126,6 +125,15 @@ static inline uint8_t* CHUNKMEMSET(uint8_t *out, uint8_t *from, unsigned len) {
      * always needed to be handled here or if we're just now seeing it because we are
      * dispatching to this function, more */
     if (sdist < 0 && dist < len) {
+#ifdef HAVE_MASKED_READWRITE
+        /* We can still handle this case if we can mitigate over writing _and_ we
+         * fit the entirety of the copy length with one load */
+        if (len <= sizeof(chunk_t)) {
+            /* Tempting to add a goto to the block below but hopefully most compilers
+             * collapse these identical code segments as one label to jump to */
+            return CHUNKCOPY(out, from, len);
+        }
+#endif
         /* Here the memmove semantics match perfectly, as when this happens we are
          * effectively sliding down the contents of memory by dist bytes */
         memmove(out, from, len);
@@ -139,7 +147,7 @@ static inline uint8_t* CHUNKMEMSET(uint8_t *out, uint8_t *from, unsigned len) {
         return CHUNKCOPY(out, from, len);
     }
 
-    /* Only AVX2 as there's 128 bit vectors and 256 bit. We allow for shorter vector
+    /* Only AVX2+ as there's 128 bit vectors and 256 bit. We allow for shorter vector
      * lengths because they serve to allow more cases to fall into chunkcopy, as the
      * distance of the shorter length is still deemed a safe distance. We rewrite this
      * here rather than calling the ssse3 variant directly now because doing so required
@@ -154,11 +162,10 @@ static inline uint8_t* CHUNKMEMSET(uint8_t *out, uint8_t *from, unsigned len) {
         if ((dist % 2) != 0 || dist == 6) {
             halfchunk_t halfchunk_load = GET_HALFCHUNK_MAG(from, &chunk_mod, (unsigned)dist);
-            adv_amount = sizeof(halfchunk_t) - chunk_mod;
 
             if (len == sizeof(halfchunk_t)) {
                 storehalfchunk(out, &halfchunk_load);
-                len -= adv_amount;
-                out += adv_amount;
+                len -= sizeof(halfchunk_t);
+                out += sizeof(halfchunk_t);
             }
 
             chunk_load = halfchunk2whole(&halfchunk_load);
@@ -212,7 +219,11 @@ static inline uint8_t* CHUNKMEMSET(uint8_t *out, uint8_t *from, unsigned len) {
 rem_bytes:
 #endif
     if (len) {
+#ifndef HAVE_MASKED_READWRITE
         memcpy(out, &chunk_load, len);
+#else
+        storechunk_mask(out, gen_mask(len), &chunk_load);
+#endif
         out += len;
     }
 
@@ -237,6 +248,8 @@ Z_INTERNAL uint8_t* CHUNKMEMSET_SAFE(uint8_t *out, uint8_t *from, unsigned len,
         --left;
     }
 #endif
+
+#ifndef HAVE_MASKED_READWRITE
     if (UNLIKELY(left < sizeof(chunk_t))) {
         while (len > 0) {
             *out++ = *from++;
@@ -245,6 +258,7 @@ Z_INTERNAL uint8_t* CHUNKMEMSET_SAFE(uint8_t *out, uint8_t *from, unsigned len,
 
         return out;
     }
+#endif
 
     if (len)
         out = CHUNKMEMSET(out, from, len);
@@ -252,14 +266,15 @@ Z_INTERNAL uint8_t* CHUNKMEMSET_SAFE(uint8_t *out, uint8_t *from, unsigned len,
     return out;
 }
 
-static inline uint8_t *CHUNKCOPY_SAFE(uint8_t *out, uint8_t *from, unsigned len, uint8_t *safe)
+static inline uint8_t *CHUNKCOPY_SAFE(uint8_t *out, uint8_t *from, uint64_t len, uint8_t *safe)
 {
     if (out == from)
         return out + len;
 
     uint64_t safelen = (safe - out);
-    len = MIN(len, (unsigned)safelen);
+    len = MIN(len, safelen);
 
+#ifndef HAVE_MASKED_READWRITE
     uint64_t from_dist = (uint64_t)llabs(safe - from);
     if (UNLIKELY(from_dist < sizeof(chunk_t) || safelen < sizeof(chunk_t))) {
         while (len--) {
@@ -268,6 +283,7 @@ static inline uint8_t *CHUNKCOPY_SAFE(uint8_t *out, uint8_t *from, unsigned len,
 
         return out;
     }
+#endif
 
-    return CHUNKMEMSET(out, from, len);
+    return CHUNKMEMSET(out, from, (unsigned)len);
 }
diff --git a/cmake/detect-intrinsics.cmake b/cmake/detect-intrinsics.cmake
index 1906f215..b8eabe8e 100644
--- a/cmake/detect-intrinsics.cmake
+++ b/cmake/detect-intrinsics.cmake
@@ -76,14 +76,14 @@ macro(check_avx512_intrinsics)
     if(NOT NATIVEFLAG)
         if(CMAKE_C_COMPILER_ID MATCHES "Intel")
             if(CMAKE_HOST_UNIX OR APPLE)
-                set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl")
+                set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mbmi2")
             else()
                 set(AVX512FLAG "/arch:AVX512")
             endif()
         elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
             # For CPUs that can benefit from AVX512, it seems GCC generates suboptimal
             # instruction scheduling unless you specify a reasonable -mtune= target
-            set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl")
+            set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mbmi2")
             if(NOT MSVC)
                 check_c_compiler_flag("-mtune=cascadelake" HAVE_CASCADE_LAKE)
                 if(HAVE_CASCADE_LAKE)
@@ -114,12 +114,12 @@ macro(check_avx512vnni_intrinsics)
     if(NOT NATIVEFLAG)
         if(CMAKE_C_COMPILER_ID MATCHES "Intel")
            if(CMAKE_HOST_UNIX OR APPLE OR CMAKE_C_COMPILER_ID MATCHES "IntelLLVM")
-                set(AVX512VNNIFLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni")
+                set(AVX512VNNIFLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mbmi2")
            else()
                set(AVX512VNNIFLAG "/arch:AVX512")
            endif()
        elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
-            set(AVX512VNNIFLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni")
+            set(AVX512VNNIFLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mbmi2")
            if(NOT MSVC)
                check_c_compiler_flag("-mtune=cascadelake" HAVE_CASCADE_LAKE)
                if(HAVE_CASCADE_LAKE)
diff --git a/configure b/configure
index 04d962e1..738e5f92 100755
--- a/configure
+++ b/configure
@@ -106,7 +106,7 @@ floatabi=
 forcesse2=0
 # For CPUs that can benefit from AVX512, it seems GCC generates suboptimal
 # instruction scheduling unless you specify a reasonable -mtune= target
-avx512flag="-mavx512f -mavx512dq -mavx512bw -mavx512vl"
+avx512flag="-mavx512f -mavx512dq -mavx512bw -mavx512vl -mbmi2"
 avx512vnniflag="${avx512flag} -mavx512vnni"
 avx2flag="-mavx2"
 sse2flag="-msse2"
@@ -1589,8 +1589,8 @@ case "${ARCH}" in
         if test ${HAVE_AVX512_INTRIN} -eq 1; then
             CFLAGS="${CFLAGS} -DX86_AVX512"
             SFLAGS="${SFLAGS} -DX86_AVX512"
-            ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_avx512.o"
-            ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_avx512.lo"
+            ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_avx512.o chunkset_avx512.o"
+            ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_avx512.lo chunkset_avx512.lo"
         fi
 
         check_mtune_cascadelake_compiler_flag
diff --git a/functable.c b/functable.c
index 832a57e7..c8b11b5f 100644
--- a/functable.c
+++ b/functable.c
@@ -129,6 +129,9 @@ static void init_functable(void) {
     if (cf.x86.has_avx512_common) {
         ft.adler32 = &adler32_avx512;
         ft.adler32_fold_copy = &adler32_fold_copy_avx512;
+        ft.chunkmemset_safe = &chunkmemset_safe_avx512;
+        ft.chunksize = &chunksize_avx512;
+        ft.inflate_fast = &inflate_fast_avx512;
     }
 #endif
 #ifdef X86_AVX512VNNI
diff --git a/inffast_tpl.h b/inffast_tpl.h
index afa5e04e..2ec865db 100644
--- a/inffast_tpl.h
+++ b/inffast_tpl.h
@@ -254,14 +254,18 @@ void Z_INTERNAL INFLATE_FAST(PREFIX3(stream) *strm, uint32_t start) {
                     out = chunkcopy_safe(out, out - dist, len, safe);
                 }
             } else {
-                if (!extra_safe)
-                    out = CHUNKCOPY_SAFE(out, from, len, safe);
-                else
+#ifndef HAVE_MASKED_READWRITE
+                if (extra_safe)
                     out = chunkcopy_safe(out, from, len, safe);
+                else
+#endif
+                    out = CHUNKCOPY_SAFE(out, from, len, safe);
             }
+#ifndef HAVE_MASKED_READWRITE
         } else if (extra_safe) {
             /* Whole reference is in range of current output. */
             out = chunkcopy_safe(out, out - dist, len, safe);
+#endif
         } else {
             /* Whole reference is in range of current output.  No range checks are
                necessary because we start with room for at least 258 bytes of output,