SSE4.1 optimized chorba

This is ~25-30% faster than the SSE2 variant on a Core 2 Quad. The main reason
is that, while this approach incurs far fewer shifts, it has to manage an
entirely separate stack buffer that is the size of the L1 cache on most CPUs.
This was one of the main reasons the 32k-specialized function was slower for
the scalar counterpart, despite auto-vectorizing: the auto-vectorized loop was
setting up the stack buffer at unaligned offsets, which is detrimental to
performance on pre-Nehalem CPUs. Additionally, we were losing a fair bit of
time to the zero initialization, which we now do more selectively.

There are a ton of loads and stores happening, and we are surely bound on the
fill buffers and store forwarding. An SSE2 version of this code is probably
possible by simply replacing the shifts with unpacks against zero and the
palignrs with shufpds; I'm just not sure it would be worth it. We gate on
SSE4.1 not because we use a specific SSE4.1 instruction, but because SSE4.1
marks the arrival of Wolfdale, where palignr became a lot faster.
Adam Stylinski 2025-03-10 21:17:25 -04:00 committed by Hans Kristian Rosbach
parent 5a232688e1
commit 46fc33f39d
11 changed files with 451 additions and 9 deletions

CMakeLists.txt

@@ -131,7 +131,8 @@ elseif(BASEARCH_X86_FOUND)
elseif(BASEARCH_X86_FOUND)
option(WITH_SSE2 "Build with SSE2" ON)
cmake_dependent_option(WITH_SSSE3 "Build with SSSE3" ON "WITH_SSE2" OFF)
cmake_dependent_option(WITH_SSE42 "Build with SSE42" ON "WITH_SSSE3" OFF)
cmake_dependent_option(WITH_SSE41 "Build with SSE41" ON "WITH_SSSE3" OFF)
cmake_dependent_option(WITH_SSE42 "Build with SSE42" ON "WITH_SSE41" OFF)
cmake_dependent_option(WITH_PCLMULQDQ "Build with PCLMULQDQ" ON "WITH_SSE42" OFF)
cmake_dependent_option(WITH_AVX2 "Build with AVX2" ON "WITH_SSE42" OFF)
cmake_dependent_option(WITH_AVX512 "Build with AVX512" ON "WITH_AVX2" OFF)
@@ -151,7 +152,7 @@ mark_as_advanced(FORCE
WITH_DFLTCC_INFLATE
WITH_CRC32_VX
WITH_AVX2 WITH_SSE2
WITH_SSSE3 WITH_SSE42
WITH_SSSE3 WITH_SSE41 WITH_SSE42
WITH_PCLMULQDQ
WITH_ALTIVEC
WITH_POWER8
@@ -1035,9 +1036,20 @@ if(WITH_OPTIM)
set(WITH_SSSE3 OFF)
endif()
endif()
if(WITH_SSE41)
check_sse41_intrinsics()
if(HAVE_SSE41_INTRIN AND WITH_SSSE3)
add_definitions(-DX86_SSE41)
set(SSE41_SRCS ${ARCHDIR}/chorba_sse41.c)
list(APPEND ZLIB_ARCH_SRCS ${SSE41_SRCS})
set_property(SOURCE ${SSE41_SRCS} PROPERTY COMPILE_FLAGS "${SSE41FLAG} ${NOLTOFLAG}")
else()
set(WITH_SSE41 OFF)
endif()
endif()
if(WITH_SSE42)
check_sse42_intrinsics()
if(HAVE_SSE42_INTRIN AND WITH_SSSE3)
if(HAVE_SSE42_INTRIN AND WITH_SSE41)
add_definitions(-DX86_SSE42)
set(SSE42_SRCS ${ARCHDIR}/adler32_sse42.c)
add_feature_info(SSE42_CRC 1 "Support SSE4.2 optimized adler32 hash generation, using \"${SSE42FLAG}\"")
@@ -1526,6 +1538,7 @@ elseif(BASEARCH_X86_FOUND)
add_feature_info(WITH_AVX512VNNI WITH_AVX512VNNI "Build with AVX512 VNNI")
add_feature_info(WITH_SSE2 WITH_SSE2 "Build with SSE2")
add_feature_info(WITH_SSSE3 WITH_SSSE3 "Build with SSSE3")
add_feature_info(WITH_SSE41 WITH_SSE41 "Build with SSE41")
add_feature_info(WITH_SSE42 WITH_SSE42 "Build with SSE42")
add_feature_info(WITH_PCLMULQDQ WITH_PCLMULQDQ "Build with PCLMULQDQ")
add_feature_info(WITH_VPCLMULQDQ WITH_VPCLMULQDQ "Build with VPCLMULQDQ")

arch/x86/Makefile.in

@@ -13,6 +13,7 @@ AVX512VNNIFLAG=-mavx512vnni -mbmi2
AVX2FLAG=-mavx2 -mbmi2
SSE2FLAG=-msse2
SSSE3FLAG=-mssse3
SSE41FLAG=-msse4.1
SSE42FLAG=-msse4.2
PCLMULFLAG=-mpclmul
VPCLMULFLAG=-mvpclmulqdq
@@ -35,6 +36,7 @@ all: \
chunkset_sse2.o chunkset_sse2.lo \
chunkset_ssse3.o chunkset_ssse3.lo \
chorba_sse2.o chorba_sse2.lo \
chorba_sse41.o chorba_sse41.lo \
compare256_avx2.o compare256_avx2.lo \
compare256_avx512.o compare256_avx512.lo \
compare256_sse2.o compare256_sse2.lo \
@@ -79,6 +81,12 @@ chorba_sse2.o:
chorba_sse2.lo:
$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chorba_sse2.c
chorba_sse41.o:
$(CC) $(CFLAGS) $(SSE41FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chorba_sse41.c
chorba_sse41.lo:
$(CC) $(CFLAGS) $(SSE41FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chorba_sse41.c
compare256_avx2.o:
$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c

arch/x86/chorba_sse41.c (new file, 342 lines)

@@ -0,0 +1,342 @@
#if !defined(WITHOUT_CHORBA) && defined(X86_SSE41)
#include "zbuild.h"
#include "crc32_braid_p.h"
#include "crc32_braid_tbl.h"
#include "crc32.h"
#include <emmintrin.h>
#include <smmintrin.h>
#include "arch/x86/x86_intrins.h"
#include "arch/generic/generic_functions.h"
#include <assert.h>
extern uint32_t crc32_braid_base(uint32_t c, const uint8_t *buf, size_t len);
extern uint32_t chorba_small_nondestructive_sse2(uint32_t c, uint64_t *aligned_buf, size_t aligned_len);
#define READ_NEXT(in, off, a, b) do { \
a = _mm_load_si128((__m128i*)(in + off / sizeof(uint64_t))); \
b = _mm_load_si128((__m128i*)(in + off / sizeof(uint64_t) + 2)); \
} while (0);
#define NEXT_ROUND(invec, a, b, c, d) do { \
a = _mm_xor_si128(_mm_slli_epi64(invec, 17), _mm_slli_epi64(invec, 55)); \
b = _mm_xor_si128(_mm_xor_si128(_mm_srli_epi64(invec, 47), _mm_srli_epi64(invec, 9)), _mm_slli_epi64(invec, 19)); \
c = _mm_xor_si128(_mm_srli_epi64(invec, 45), _mm_slli_epi64(invec, 44)); \
d = _mm_srli_epi64(invec, 20); \
} while (0);
#define REALIGN_CHORBA(in0, in1, in2, in3, out0, out1, out2, out3, out4, shift) do { \
out0 = _mm_slli_si128(in0, shift); \
out1 = _mm_alignr_epi8(in1, in0, shift); \
out2 = _mm_alignr_epi8(in2, in1, shift); \
out3 = _mm_alignr_epi8(in3, in2, shift); \
out4 = _mm_srli_si128(in3, shift); \
} while (0)
#define STORE4(out0, out1, out2, out3, out) do { \
_mm_store_si128(out++, out0); \
_mm_store_si128(out++, out1); \
_mm_store_si128(out++, out2); \
_mm_store_si128(out++, out3); \
} while (0)
#define READ4(out0, out1, out2, out3, in) do { \
out0 = _mm_load_si128(in++); \
out1 = _mm_load_si128(in++); \
out2 = _mm_load_si128(in++); \
out3 = _mm_load_si128(in++); \
} while (0)
/* This is intentionally shifted one down to compensate for the deferred store from
* the last iteration */
#define READ4_WITHXOR(out0, out1, out2, out3, xor0, xor1, xor2, xor3, in) do { \
out0 = _mm_xor_si128(in[1], xor0); \
out1 = _mm_xor_si128(in[2], xor1); \
out2 = _mm_xor_si128(in[3], xor2); \
out3 = _mm_xor_si128(in[4], xor3); \
} while (0)
static Z_FORCEINLINE uint32_t crc32_chorba_32768_nondestructive_sse41(uint32_t crc, const uint64_t* buf, size_t len) {
const uint64_t* input = buf;
ALIGNED_(16) uint64_t bitbuffer[32768 / sizeof(uint64_t)];
__m128i *bitbuffer_v = (__m128i*)bitbuffer;
const uint8_t* bitbufferbytes = (const uint8_t*) bitbuffer;
__m128i z = _mm_setzero_si128();
__m128i *bitbuf128 = &bitbuffer_v[64];
__m128i *bitbuf144 = &bitbuffer_v[72];
__m128i *bitbuf182 = &bitbuffer_v[91];
__m128i *bitbuf210 = &bitbuffer_v[105];
__m128i *bitbuf300 = &bitbuffer_v[150];
__m128i *bitbuf0 = bitbuf128;
__m128i *inptr = (__m128i*)input;
/* We only need to zero out the bytes between the 128'th value and the 144th
* that are actually read */
__m128i *z_cursor = bitbuf128;
for (size_t i = 0; i < 2; ++i) {
STORE4(z, z, z, z, z_cursor);
}
/* We only need to zero out the bytes between the 144'th value and the 182nd that
* are actually read */
z_cursor = bitbuf144 + 8;
for (size_t i = 0; i < 11; ++i) {
_mm_store_si128(z_cursor++, z);
}
/* We only need to zero out the bytes between the 182nd value and the 210th that
* are actually read. */
z_cursor = bitbuf182;
for (size_t i = 0; i < 4; ++i) {
STORE4(z, z, z, z, z_cursor);
}
/* We need to mix this in */
__m128i init_crc = _mm_cvtsi64x_si128(crc);
crc = 0;
size_t i = 0;
/* Previous iteration runs carried over */
__m128i buf144 = z;
__m128i buf182 = z;
__m128i buf210 = z;
for(; i + 300*8+64 < len && i < 22 * 8; i += 64) {
__m128i in12, in34, in56, in78,
in_1, in23, in45, in67, in8_;
READ4(in12, in34, in56, in78, inptr);
if (i == 0) {
in12 = _mm_xor_si128(in12, init_crc);
}
REALIGN_CHORBA(in12, in34, in56, in78,
in_1, in23, in45, in67, in8_, 8);
__m128i a = _mm_xor_si128(buf144, in_1);
STORE4(a, in23, in45, in67, bitbuf144);
buf144 = in8_;
__m128i e = _mm_xor_si128(buf182, in_1);
STORE4(e, in23, in45, in67, bitbuf182);
buf182 = in8_;
__m128i m = _mm_xor_si128(buf210, in_1);
STORE4(m, in23, in45, in67, bitbuf210);
buf210 = in8_;
STORE4(in12, in34, in56, in78, bitbuf300);
}
for(; i + 300*8+64 < len && i < 32 * 8; i += 64) {
__m128i in12, in34, in56, in78,
in_1, in23, in45, in67, in8_;
READ4(in12, in34, in56, in78, inptr);
REALIGN_CHORBA(in12, in34, in56, in78,
in_1, in23, in45, in67, in8_, 8);
__m128i a = _mm_xor_si128(buf144, in_1);
STORE4(a, in23, in45, in67, bitbuf144);
buf144 = in8_;
__m128i e, f, g, h;
e = _mm_xor_si128(buf182, in_1);
READ4_WITHXOR(f, g, h, buf182, in23, in45, in67, in8_, bitbuf182);
STORE4(e, f, g, h, bitbuf182);
__m128i m = _mm_xor_si128(buf210, in_1);
STORE4(m, in23, in45, in67, bitbuf210);
buf210 = in8_;
STORE4(in12, in34, in56, in78, bitbuf300);
}
for(; i + 300*8+64 < len && i < 84 * 8; i += 64) {
__m128i in12, in34, in56, in78,
in_1, in23, in45, in67, in8_;
READ4(in12, in34, in56, in78, inptr);
REALIGN_CHORBA(in12, in34, in56, in78,
in_1, in23, in45, in67, in8_, 8);
__m128i a, b, c, d;
a = _mm_xor_si128(buf144, in_1);
READ4_WITHXOR(b, c, d, buf144, in23, in45, in67, in8_, bitbuf144);
STORE4(a, b, c, d, bitbuf144);
__m128i e, f, g, h;
e = _mm_xor_si128(buf182, in_1);
READ4_WITHXOR(f, g, h, buf182, in23, in45, in67, in8_, bitbuf182);
STORE4(e, f, g, h, bitbuf182);
__m128i m = _mm_xor_si128(buf210, in_1);
STORE4(m, in23, in45, in67, bitbuf210);
buf210 = in8_;
STORE4(in12, in34, in56, in78, bitbuf300);
}
for(; i + 300*8+64 < len; i += 64) {
__m128i in12, in34, in56, in78,
in_1, in23, in45, in67, in8_;
if (i < 128 * 8) {
READ4(in12, in34, in56, in78, inptr);
} else {
in12 = _mm_xor_si128(_mm_load_si128(inptr++), _mm_load_si128(bitbuf0++));
in34 = _mm_xor_si128(_mm_load_si128(inptr++), _mm_load_si128(bitbuf0++));
in56 = _mm_xor_si128(_mm_load_si128(inptr++), _mm_load_si128(bitbuf0++));
in78 = _mm_xor_si128(_mm_load_si128(inptr++), _mm_load_si128(bitbuf0++));
}
// [0, 145, 183, 211]
/* Pre Penryn CPUs the unpack should be faster */
REALIGN_CHORBA(in12, in34, in56, in78,
in_1, in23, in45, in67, in8_, 8);
__m128i a, b, c, d;
a = _mm_xor_si128(buf144, in_1);
READ4_WITHXOR(b, c, d, buf144, in23, in45, in67, in8_, bitbuf144);
STORE4(a, b, c, d, bitbuf144);
__m128i e, f, g, h;
e = _mm_xor_si128(buf182, in_1);
READ4_WITHXOR(f, g, h, buf182, in23, in45, in67, in8_, bitbuf182);
STORE4(e, f, g, h, bitbuf182);
__m128i n, o, p;
__m128i m = _mm_xor_si128(buf210, in_1);
/* Couldn't tell you why but despite knowing that this is always false,
* removing this branch with GCC makes things significantly slower. Some
* loop bodies must be being joined or something */
if (i < 84 * 8) {
n = in23;
o = in45;
p = in67;
buf210 = in8_;
} else {
READ4_WITHXOR(n, o, p, buf210, in23, in45, in67, in8_, bitbuf210);
}
STORE4(m, n, o, p, bitbuf210);
STORE4(in12, in34, in56, in78, bitbuf300);
}
/* Second half of stores bubbled out */
_mm_store_si128(bitbuf144, buf144);
_mm_store_si128(bitbuf182, buf182);
_mm_store_si128(bitbuf210, buf210);
/* We also have to zero out the tail */
size_t left_to_z = len - (300*8 + i);
__m128i *bitbuf_tail = (__m128i*)(bitbuffer + 300 + i/8);
while (left_to_z >= 64) {
STORE4(z, z, z, z, bitbuf_tail);
left_to_z -= 64;
}
while (left_to_z >= 16) {
_mm_store_si128(bitbuf_tail++, z);
left_to_z -= 16;
}
uint8_t *tail_bytes = (uint8_t*)bitbuf_tail;
while (left_to_z--) {
*tail_bytes++ = 0;
}
ALIGNED_(16) uint64_t final[9] = {0};
__m128i next12, next34, next56;
next12 = z;
next34 = z;
next56 = z;
for(; (i + 72 < len); i += 32) {
__m128i in1in2, in3in4;
__m128i in1in2_, in3in4_;
__m128i ab1, ab2, ab3, ab4;
__m128i cd1, cd2, cd3, cd4;
READ_NEXT(input, i, in1in2, in3in4);
READ_NEXT(bitbuffer, i, in1in2_, in3in4_);
in1in2 = _mm_xor_si128(_mm_xor_si128(in1in2, in1in2_), next12);
in3in4 = _mm_xor_si128(in3in4, in3in4_);
NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);
__m128i a2_ = _mm_slli_si128(ab2, 8);
__m128i ab1_next34 = _mm_xor_si128(next34, ab1);
in3in4 = _mm_xor_si128(in3in4, ab1_next34);
in3in4 = _mm_xor_si128(a2_, in3in4);
NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);
__m128i b2c2 = _mm_alignr_epi8(cd2, ab2, 8);
__m128i a4_ = _mm_slli_si128(ab4, 8);
a4_ = _mm_xor_si128(b2c2, a4_);
next12 = _mm_xor_si128(ab3, a4_);
next12 = _mm_xor_si128(next12, cd1);
__m128i d2_ = _mm_srli_si128(cd2, 8);
__m128i b4c4 = _mm_alignr_epi8(cd4, ab4, 8);
next12 = _mm_xor_si128(next12, next56);
next34 = _mm_xor_si128(cd3, _mm_xor_si128(b4c4, d2_));
next56 = _mm_srli_si128(cd4, 8);
}
memcpy(final, input+(i / sizeof(uint64_t)), len-i);
__m128i *final128 = (__m128i*)final;
_mm_store_si128(final128, _mm_xor_si128(_mm_load_si128(final128), next12));
++final128;
_mm_store_si128(final128, _mm_xor_si128(_mm_load_si128(final128), next34));
++final128;
_mm_store_si128(final128, _mm_xor_si128(_mm_load_si128(final128), next56));
uint8_t* final_bytes = (uint8_t*) final;
for(size_t j = 0; j < (len-i); j++) {
crc = crc_table[(crc ^ final_bytes[j] ^ bitbufferbytes[(j+i)]) & 0xff] ^ (crc >> 8);
}
return crc;
}
Z_INTERNAL uint32_t crc32_chorba_sse41(uint32_t crc, const uint8_t *buf, size_t len) {
uint32_t c;
uint64_t* aligned_buf;
size_t aligned_len;
c = (~crc) & 0xffffffff;
uintptr_t algn_diff = ((uintptr_t)16 - ((uintptr_t)buf & 15)) & 15;
if (algn_diff < len) {
if (algn_diff) {
c = crc32_braid_internal(c, buf, algn_diff);
}
aligned_buf = (uint64_t*) (buf + algn_diff);
aligned_len = len - algn_diff;
if(aligned_len > CHORBA_LARGE_THRESHOLD) {
c = crc32_chorba_118960_nondestructive(c, (z_word_t*) aligned_buf, aligned_len);
} else if (aligned_len > CHORBA_MEDIUM_LOWER_THRESHOLD &&
aligned_len <= CHORBA_MEDIUM_UPPER_THRESHOLD) {
c = crc32_chorba_32768_nondestructive_sse41(c, aligned_buf, aligned_len);
} else if (aligned_len > CHORBA_SMALL_THRESHOLD_64BIT) {
c = chorba_small_nondestructive_sse2(c, aligned_buf, aligned_len);
} else {
c = crc32_braid_internal(c, (uint8_t*) aligned_buf, aligned_len);
}
}
else {
c = crc32_braid_internal(c, buf, len);
}
/* Return the CRC, post-conditioned. */
return c ^ 0xffffffff;
}
#endif

arch/x86/x86_features.c

@@ -85,6 +85,7 @@ void Z_INTERNAL x86_check_features(struct x86_cpu_features *features) {
features->has_sse2 = edx & 0x4000000;
features->has_ssse3 = ecx & 0x200;
features->has_sse41 = ecx & 0x80000;
features->has_sse42 = ecx & 0x100000;
features->has_pclmulqdq = ecx & 0x2;

arch/x86/x86_features.h

@@ -17,6 +17,7 @@ struct x86_cpu_features {
int has_bmi2;
int has_sse2;
int has_ssse3;
int has_sse41;
int has_sse42;
int has_pclmulqdq;
int has_vpclmulqdq;

arch/x86/x86_functions.h

@@ -11,7 +11,7 @@
* Further context:
* https://developercommunity.visualstudio.com/t/Stack-corruption-with-v142-toolchain-whe/10853479 */
#if defined(_MSC_VER) && !defined(_M_AMD64) && _MSC_VER >= 1920 && _MSC_VER <= 1929
#define NO_CHORBA_SSE2
#define NO_CHORBA_SSE
#endif
#ifdef X86_SSE2
@@ -36,6 +36,12 @@ uint8_t* chunkmemset_safe_ssse3(uint8_t *out, uint8_t *from, unsigned len, unsig
void inflate_fast_ssse3(PREFIX3(stream) *strm, uint32_t start);
#endif
#ifdef X86_SSE41
# if !defined(WITHOUT_CHORBA)
uint32_t crc32_chorba_sse41(uint32_t crc32, const uint8_t *buf, size_t len);
# endif
#endif
#ifdef X86_SSE42
uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif
@@ -104,7 +110,7 @@ uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
# define native_longest_match longest_match_sse2
# undef native_longest_match_slow
# define native_longest_match_slow longest_match_slow_sse2
# if !defined(WITHOUT_CHORBA) && !defined(NO_CHORBA_SSE2)
# if !defined(WITHOUT_CHORBA) && !defined(NO_CHORBA_SSE)
# undef native_crc32
# define native_crc32 crc32_chorba_sse2
# endif
@@ -119,6 +125,10 @@ uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
# undef native_inflate_fast
# define native_inflate_fast inflate_fast_ssse3
# endif
# if !defined(WITHOUT_CHORBA) && defined(X86_SSE41) && defined(__SSE4_1__) && !defined(NO_CHORBA_SSE)
# undef native_crc32
# define native_crc32 crc32_chorba_sse41
# endif
// X86 - SSE4.2
# if defined(X86_SSE42) && defined(__SSE4_2__)
# undef native_adler32_fold_copy

cmake/detect-intrinsics.cmake

@@ -565,6 +565,29 @@ macro(check_ssse3_intrinsics)
)
endmacro()
macro(check_sse41_intrinsics)
if(NOT NATIVEFLAG)
if(CMAKE_C_COMPILER_ID MATCHES "Intel")
if(CMAKE_HOST_UNIX OR APPLE)
set(SSE41FLAG "-msse4.1")
else()
set(SSE41FLAG "/arch:SSE4.1")
endif()
elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
set(SSE41FLAG "-msse4.1")
endif()
endif()
# Check whether compiler supports SSE4.1 intrinsics
set(CMAKE_REQUIRED_FLAGS "${SSE41FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
check_c_source_compiles(
"#include <smmintrin.h>
__m128i f(__m128i a, __m128i b) { return _mm_min_epi32(a, b); }
int main(void) { return 0; }"
HAVE_SSE41_INTRIN
)
set(CMAKE_REQUIRED_FLAGS)
endmacro()
macro(check_sse42_intrinsics)
if(NOT NATIVEFLAG)
if(CMAKE_C_COMPILER_ID MATCHES "Intel")

configure

@@ -111,6 +111,7 @@ avx512vnniflag="${avx512flag} -mavx512vnni"
avx2flag="-mavx2 -mbmi2"
sse2flag="-msse2"
ssse3flag="-mssse3"
sse41flag="-msse4.1"
sse42flag="-msse4.2"
pclmulflag="-mpclmul"
vpclmulflag="-mvpclmulqdq -mavx512f"
@@ -1590,6 +1591,22 @@ EOF
fi
}
check_sse41_intrinsics() {
# Check whether compiler supports SSE4.1 intrinsics
cat > $test.c << EOF
#include <smmintrin.h>
__m128i f(__m128i a, __m128i b) { return _mm_min_epi32(a, b); }
int main(void) { return 0; }
EOF
if try ${CC} ${CFLAGS} ${sse41flag} $test.c; then
echo "Checking for SSE4.1 intrinsics ... Yes." | tee -a configure.log
HAVE_SSE41_INTRIN=1
else
echo "Checking for SSE4.1 intrinsics ... No." | tee -a configure.log
HAVE_SSE41_INTRIN=0
fi
}
check_sse42_intrinsics() {
# Check whether compiler supports SSE4.2 intrinsics
cat > $test.c << EOF
@@ -1717,6 +1734,15 @@ case "${ARCH}" in
ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_ssse3.lo chunkset_ssse3.lo"
fi
check_sse41_intrinsics
if test ${HAVE_SSE41_INTRIN} -eq 1; then
CFLAGS="${CFLAGS} -DX86_SSE41"
SFLAGS="${SFLAGS} -DX86_SSE41"
ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} chorba_sse41.o"
ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} chorba_sse41.lo"
fi
check_sse42_intrinsics
if test ${HAVE_SSE42_INTRIN} -eq 1; then
@@ -2263,6 +2289,7 @@ sed < $SRCDIR/$ARCHDIR/Makefile.in "
/^AVX512VNNIFLAG *=/s#=.*#=$avx512vnniflag#
/^SSE2FLAG *=/s#=.*#=$sse2flag#
/^SSSE3FLAG *=/s#=.*#=$ssse3flag#
/^SSE41FLAG *=/s#=.*#=$sse41flag#
/^SSE42FLAG *=/s#=.*#=$sse42flag#
/^PCLMULFLAG *=/s#=.*#=$pclmulflag#
/^VPCLMULFLAG *=/s#=.*#=$vpclmulflag#

functable.c

@@ -75,7 +75,7 @@ static void init_functable(void) {
{
ft.chunkmemset_safe = &chunkmemset_safe_sse2;
ft.chunksize = &chunksize_sse2;
#if !defined(WITHOUT_CHORBA) && !defined(NO_CHORBA_SSE2)
#if !defined(WITHOUT_CHORBA) && !defined(NO_CHORBA_SSE)
ft.crc32 = &crc32_chorba_sse2;
#endif
ft.inflate_fast = &inflate_fast_sse2;
@@ -95,6 +95,16 @@ static void init_functable(void) {
ft.inflate_fast = &inflate_fast_ssse3;
}
#endif
// X86 - SSE4.1
#ifdef X86_SSE41
if (cf.x86.has_sse41) {
#if !defined(WITHOUT_CHORBA) && !defined(NO_CHORBA_SSE)
ft.crc32 = &crc32_chorba_sse41;
#endif
}
#endif
// X86 - SSE4.2
#ifdef X86_SSE42
if (cf.x86.has_sse42) {

test/benchmarks/benchmark_crc32.cc

@@ -69,8 +69,11 @@ BENCHMARK_CRC32(native, native_crc32, 1);
#else
#ifndef WITHOUT_CHORBA
# if defined(X86_SSE2) && !defined(NO_CHORBA_SSE2)
# if defined(X86_SSE2) && !defined(NO_CHORBA_SSE)
BENCHMARK_CRC32(chorba_sse2, crc32_chorba_sse2, test_cpu_features.x86.has_sse2);
# if defined(X86_SSE41) && !defined(NO_CHORBA_SSE)
BENCHMARK_CRC32(chorba_sse41, crc32_chorba_sse41, test_cpu_features.x86.has_sse41);
# endif
# endif
#endif

test/test_crc32.cc

@@ -193,7 +193,8 @@ static const crc32_test tests[] = {
"h{bcmdC+a;t+Cf{6Y_dFq-{X4Yu&7uNfVDh?q&_u.UWJU],-GiH7ADzb7-V.Q%4=+v!$L9W+T=bP]$_:]Vyg}A.ygD.r;h-D]m%&"
"h{bcmdC+a;t+Cf{6Y_dFq-{X4Yu&7uNfVDh?q&_u.UWJU],-GiH7ADzb7-V.Q%4=+v!$L9W+T=bP]$_:]Vyg}A.ygD.r;h-D]m%&"
"h{bcmdC+a;t+Cf{6Y_dFq-{X4Yu&7uNfVDh?q&_u.UWJU],-GiH7ADzb7-V.Q%4=+v!$L9W+T=bP]$_:]Vyg}A.ygD.r;h-D]m%&", 600, 0x888AFA5B},
{0x0, buf32k, 32768, 0x217726B2}
{0x0, buf32k, 32768, 0x217726B2},
{0x0, buf32k, 16384, 0xE81722F0}
};
class crc32_variant : public ::testing::TestWithParam<crc32_test> {
@@ -281,8 +282,11 @@ TEST_CRC32(pclmulqdq, crc32_pclmulqdq, test_cpu_features.x86.has_pclmulqdq)
#ifdef X86_VPCLMULQDQ_CRC
TEST_CRC32(vpclmulqdq, crc32_vpclmulqdq, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx512_common && test_cpu_features.x86.has_vpclmulqdq))
#endif
#if !defined(WITHOUT_CHORBA) && defined(X86_SSE2) && !defined(NO_CHORBA_SSE2)
#if !defined(WITHOUT_CHORBA) && defined(X86_SSE2) && !defined(NO_CHORBA_SSE)
TEST_CRC32(chorba_sse2, crc32_chorba_sse2, test_cpu_features.x86.has_sse2)
#endif
#if !defined(WITHOUT_CHORBA) && defined(X86_SSE41) && !defined(NO_CHORBA_SSE)
TEST_CRC32(chorba_sse41, crc32_chorba_sse41, test_cpu_features.x86.has_sse41)
#endif
#endif