SSE4.1 optimized chorba

This is ~25-30% faster than the SSE2 variant on a Core 2 Quad. The main reason
is that, while this approach incurs far fewer shifts, it has to manage an
entirely separate stack buffer that is the size of the L1 cache on most CPUs.
This was one of the main reasons the 32k-specialized function was slower for
the scalar counterpart, despite auto-vectorizing: the auto-vectorized loop was
setting up the stack buffer at unaligned offsets, which is detrimental to
performance on pre-Nehalem CPUs. Additionally, we were losing a fair bit of
time to the zero initialization, which we now do more selectively.

There are a ton of loads and stores happening, and we are surely bound on the
fill buffers and store forwarding. An SSE2 version of this code is probably
possible by simply replacing the shifts with unpacks against zero and the
palignrs with shufpds; I'm just not sure it would be worth it. We gate on
SSE4.1 not because we use a specific SSE4.1 instruction, but because SSE4.1
marks the arrival of Wolfdale, where palignr became a lot faster.
Adam Stylinski 2025-03-10 21:17:25 -04:00 committed by Hans Kristian Rosbach
parent 5a232688e1
commit 46fc33f39d
11 changed files with 451 additions and 9 deletions

CMakeLists.txt

@@ -131,7 +131,8 @@ elseif(BASEARCH_X86_FOUND)
elseif(BASEARCH_X86_FOUND)
option(WITH_SSE2 "Build with SSE2" ON)
cmake_dependent_option(WITH_SSSE3 "Build with SSSE3" ON "WITH_SSE2" OFF)
cmake_dependent_option(WITH_SSE42 "Build with SSE42" ON "WITH_SSSE3" OFF)
cmake_dependent_option(WITH_SSE41 "Build with SSE41" ON "WITH_SSSE3" OFF)
cmake_dependent_option(WITH_SSE42 "Build with SSE42" ON "WITH_SSE41" OFF)
cmake_dependent_option(WITH_PCLMULQDQ "Build with PCLMULQDQ" ON "WITH_SSE42" OFF)
cmake_dependent_option(WITH_AVX2 "Build with AVX2" ON "WITH_SSE42" OFF)
cmake_dependent_option(WITH_AVX512 "Build with AVX512" ON "WITH_AVX2" OFF)
@@ -151,7 +152,7 @@ mark_as_advanced(FORCE
WITH_DFLTCC_INFLATE
WITH_CRC32_VX
WITH_AVX2 WITH_SSE2
WITH_SSSE3 WITH_SSE42
WITH_SSSE3 WITH_SSE41 WITH_SSE42
WITH_PCLMULQDQ
WITH_ALTIVEC
WITH_POWER8
@@ -1035,9 +1036,20 @@ if(WITH_OPTIM)
set(WITH_SSSE3 OFF)
endif()
endif()
if(WITH_SSE41)
check_sse41_intrinsics()
if(HAVE_SSE41_INTRIN AND WITH_SSSE3)
add_definitions(-DX86_SSE41)
set(SSE41_SRCS ${ARCHDIR}/chorba_sse41.c)
list(APPEND ZLIB_ARCH_SRCS ${SSE41_SRCS})
set_property(SOURCE ${SSE41_SRCS} PROPERTY COMPILE_FLAGS "${SSE41FLAG} ${NOLTOFLAG}")
else()
set(WITH_SSE41 OFF)
endif()
endif()
if(WITH_SSE42)
check_sse42_intrinsics()
if(HAVE_SSE42_INTRIN AND WITH_SSSE3)
if(HAVE_SSE42_INTRIN AND WITH_SSE41)
add_definitions(-DX86_SSE42)
set(SSE42_SRCS ${ARCHDIR}/adler32_sse42.c)
add_feature_info(SSE42_CRC 1 "Support SSE4.2 optimized adler32 hash generation, using \"${SSE42FLAG}\"")
@@ -1526,6 +1538,7 @@ elseif(BASEARCH_X86_FOUND)
add_feature_info(WITH_AVX512VNNI WITH_AVX512VNNI "Build with AVX512 VNNI")
add_feature_info(WITH_SSE2 WITH_SSE2 "Build with SSE2")
add_feature_info(WITH_SSSE3 WITH_SSSE3 "Build with SSSE3")
add_feature_info(WITH_SSE41 WITH_SSE41 "Build with SSE41")
add_feature_info(WITH_SSE42 WITH_SSE42 "Build with SSE42")
add_feature_info(WITH_PCLMULQDQ WITH_PCLMULQDQ "Build with PCLMULQDQ")
add_feature_info(WITH_VPCLMULQDQ WITH_VPCLMULQDQ "Build with VPCLMULQDQ")

arch/x86/Makefile.in

@@ -13,6 +13,7 @@ AVX512VNNIFLAG=-mavx512vnni -mbmi2
AVX2FLAG=-mavx2 -mbmi2
SSE2FLAG=-msse2
SSSE3FLAG=-mssse3
SSE41FLAG=-msse4.1
SSE42FLAG=-msse4.2
PCLMULFLAG=-mpclmul
VPCLMULFLAG=-mvpclmulqdq
@@ -35,6 +36,7 @@ all: \
chunkset_sse2.o chunkset_sse2.lo \
chunkset_ssse3.o chunkset_ssse3.lo \
chorba_sse2.o chorba_sse2.lo \
chorba_sse41.o chorba_sse41.lo \
compare256_avx2.o compare256_avx2.lo \
compare256_avx512.o compare256_avx512.lo \
compare256_sse2.o compare256_sse2.lo \
@@ -79,6 +81,12 @@ chorba_sse2.o:
chorba_sse2.lo:
$(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chorba_sse2.c
chorba_sse41.o:
$(CC) $(CFLAGS) $(SSE41FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chorba_sse41.c
chorba_sse41.lo:
$(CC) $(CFLAGS) $(SSE41FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chorba_sse41.c
compare256_avx2.o:
$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c

arch/x86/chorba_sse41.c (new file, 342 lines)

@@ -0,0 +1,342 @@
#if !defined(WITHOUT_CHORBA) && defined(X86_SSE41)
#include "zbuild.h"
#include "crc32_braid_p.h"
#include "crc32_braid_tbl.h"
#include "crc32.h"
#include <emmintrin.h>
#include <smmintrin.h>
#include "arch/x86/x86_intrins.h"
#include "arch/generic/generic_functions.h"
#include <assert.h>
extern uint32_t crc32_braid_base(uint32_t c, const uint8_t *buf, size_t len);
extern uint32_t chorba_small_nondestructive_sse2(uint32_t c, uint64_t *aligned_buf, size_t aligned_len);
#define READ_NEXT(in, off, a, b) do { \
a = _mm_load_si128((__m128i*)(in + off / sizeof(uint64_t))); \
b = _mm_load_si128((__m128i*)(in + off / sizeof(uint64_t) + 2)); \
} while (0);
#define NEXT_ROUND(invec, a, b, c, d) do { \
a = _mm_xor_si128(_mm_slli_epi64(invec, 17), _mm_slli_epi64(invec, 55)); \
b = _mm_xor_si128(_mm_xor_si128(_mm_srli_epi64(invec, 47), _mm_srli_epi64(invec, 9)), _mm_slli_epi64(invec, 19)); \
c = _mm_xor_si128(_mm_srli_epi64(invec, 45), _mm_slli_epi64(invec, 44)); \
d = _mm_srli_epi64(invec, 20); \
} while (0);
#define REALIGN_CHORBA(in0, in1, in2, in3, out0, out1, out2, out3, out4, shift) do { \
out0 = _mm_slli_si128(in0, shift); \
out1 = _mm_alignr_epi8(in1, in0, shift); \
out2 = _mm_alignr_epi8(in2, in1, shift); \
out3 = _mm_alignr_epi8(in3, in2, shift); \
out4 = _mm_srli_si128(in3, shift); \
} while (0)
#define STORE4(out0, out1, out2, out3, out) do { \
_mm_store_si128(out++, out0); \
_mm_store_si128(out++, out1); \
_mm_store_si128(out++, out2); \
_mm_store_si128(out++, out3); \
} while (0)
#define READ4(out0, out1, out2, out3, in) do { \
out0 = _mm_load_si128(in++); \
out1 = _mm_load_si128(in++); \
out2 = _mm_load_si128(in++); \
out3 = _mm_load_si128(in++); \
} while (0)
/* This is intentionally shifted one down to compensate for the deferred store from
* the last iteration */
#define READ4_WITHXOR(out0, out1, out2, out3, xor0, xor1, xor2, xor3, in) do { \
out0 = _mm_xor_si128(in[1], xor0); \
out1 = _mm_xor_si128(in[2], xor1); \
out2 = _mm_xor_si128(in[3], xor2); \
out3 = _mm_xor_si128(in[4], xor3); \
} while (0)
static Z_FORCEINLINE uint32_t crc32_chorba_32768_nondestructive_sse41(uint32_t crc, const uint64_t* buf, size_t len) {
const uint64_t* input = buf;
ALIGNED_(16) uint64_t bitbuffer[32768 / sizeof(uint64_t)];
__m128i *bitbuffer_v = (__m128i*)bitbuffer;
const uint8_t* bitbufferbytes = (const uint8_t*) bitbuffer;
__m128i z = _mm_setzero_si128();
__m128i *bitbuf128 = &bitbuffer_v[64];
__m128i *bitbuf144 = &bitbuffer_v[72];
__m128i *bitbuf182 = &bitbuffer_v[91];
__m128i *bitbuf210 = &bitbuffer_v[105];
__m128i *bitbuf300 = &bitbuffer_v[150];
__m128i *bitbuf0 = bitbuf128;
__m128i *inptr = (__m128i*)input;
/* We only need to zero out the bytes between the 128'th value and the 144th
* that are actually read */
__m128i *z_cursor = bitbuf128;
for (size_t i = 0; i < 2; ++i) {
STORE4(z, z, z, z, z_cursor);
}
/* We only need to zero out the bytes between the 144'th value and the 182nd that
* are actually read */
z_cursor = bitbuf144 + 8;
for (size_t i = 0; i < 11; ++i) {
_mm_store_si128(z_cursor++, z);
}
/* We only need to zero out the bytes between the 182nd value and the 210th that
* are actually read. */
z_cursor = bitbuf182;
for (size_t i = 0; i < 4; ++i) {
STORE4(z, z, z, z, z_cursor);
}
/* We need to mix this in */
__m128i init_crc = _mm_cvtsi64x_si128(crc);
crc = 0;
size_t i = 0;
/* Previous iteration runs carried over */
__m128i buf144 = z;
__m128i buf182 = z;
__m128i buf210 = z;
for(; i + 300*8+64 < len && i < 22 * 8; i += 64) {
__m128i in12, in34, in56, in78,
in_1, in23, in45, in67, in8_;
READ4(in12, in34, in56, in78, inptr);
if (i == 0) {
in12 = _mm_xor_si128(in12, init_crc);
}
REALIGN_CHORBA(in12, in34, in56, in78,
in_1, in23, in45, in67, in8_, 8);
__m128i a = _mm_xor_si128(buf144, in_1);
STORE4(a, in23, in45, in67, bitbuf144);
buf144 = in8_;
__m128i e = _mm_xor_si128(buf182, in_1);
STORE4(e, in23, in45, in67, bitbuf182);
buf182 = in8_;
__m128i m = _mm_xor_si128(buf210, in_1);
STORE4(m, in23, in45, in67, bitbuf210);
buf210 = in8_;
STORE4(in12, in34, in56, in78, bitbuf300);
}
for(; i + 300*8+64 < len && i < 32 * 8; i += 64) {
__m128i in12, in34, in56, in78,
in_1, in23, in45, in67, in8_;
READ4(in12, in34, in56, in78, inptr);
REALIGN_CHORBA(in12, in34, in56, in78,
in_1, in23, in45, in67, in8_, 8);
__m128i a = _mm_xor_si128(buf144, in_1);
STORE4(a, in23, in45, in67, bitbuf144);
buf144 = in8_;
__m128i e, f, g, h;
e = _mm_xor_si128(buf182, in_1);
READ4_WITHXOR(f, g, h, buf182, in23, in45, in67, in8_, bitbuf182);
STORE4(e, f, g, h, bitbuf182);
__m128i m = _mm_xor_si128(buf210, in_1);
STORE4(m, in23, in45, in67, bitbuf210);
buf210 = in8_;
STORE4(in12, in34, in56, in78, bitbuf300);
}
for(; i + 300*8+64 < len && i < 84 * 8; i += 64) {
__m128i in12, in34, in56, in78,
in_1, in23, in45, in67, in8_;
READ4(in12, in34, in56, in78, inptr);
REALIGN_CHORBA(in12, in34, in56, in78,
in_1, in23, in45, in67, in8_, 8);
__m128i a, b, c, d;
a = _mm_xor_si128(buf144, in_1);
READ4_WITHXOR(b, c, d, buf144, in23, in45, in67, in8_, bitbuf144);
STORE4(a, b, c, d, bitbuf144);
__m128i e, f, g, h;
e = _mm_xor_si128(buf182, in_1);
READ4_WITHXOR(f, g, h, buf182, in23, in45, in67, in8_, bitbuf182);
STORE4(e, f, g, h, bitbuf182);
__m128i m = _mm_xor_si128(buf210, in_1);
STORE4(m, in23, in45, in67, bitbuf210);
buf210 = in8_;
STORE4(in12, in34, in56, in78, bitbuf300);
}
for(; i + 300*8+64 < len; i += 64) {
__m128i in12, in34, in56, in78,
in_1, in23, in45, in67, in8_;
if (i < 128 * 8) {
READ4(in12, in34, in56, in78, inptr);
} else {
in12 = _mm_xor_si128(_mm_load_si128(inptr++), _mm_load_si128(bitbuf0++));
in34 = _mm_xor_si128(_mm_load_si128(inptr++), _mm_load_si128(bitbuf0++));
in56 = _mm_xor_si128(_mm_load_si128(inptr++), _mm_load_si128(bitbuf0++));
in78 = _mm_xor_si128(_mm_load_si128(inptr++), _mm_load_si128(bitbuf0++));
}
// [0, 145, 183, 211]
/* Pre Penryn CPUs the unpack should be faster */
REALIGN_CHORBA(in12, in34, in56, in78,
in_1, in23, in45, in67, in8_, 8);
__m128i a, b, c, d;
a = _mm_xor_si128(buf144, in_1);
READ4_WITHXOR(b, c, d, buf144, in23, in45, in67, in8_, bitbuf144);
STORE4(a, b, c, d, bitbuf144);
__m128i e, f, g, h;
e = _mm_xor_si128(buf182, in_1);
READ4_WITHXOR(f, g, h, buf182, in23, in45, in67, in8_, bitbuf182);
STORE4(e, f, g, h, bitbuf182);
__m128i n, o, p;
__m128i m = _mm_xor_si128(buf210, in_1);
/* Couldn't tell you why but despite knowing that this is always false,
* removing this branch with GCC makes things significantly slower. Some
* loop bodies must be being joined or something */
if (i < 84 * 8) {
n = in23;
o = in45;
p = in67;
buf210 = in8_;
} else {
READ4_WITHXOR(n, o, p, buf210, in23, in45, in67, in8_, bitbuf210);
}
STORE4(m, n, o, p, bitbuf210);
STORE4(in12, in34, in56, in78, bitbuf300);
}
/* Second half of stores bubbled out */
_mm_store_si128(bitbuf144, buf144);
_mm_store_si128(bitbuf182, buf182);
_mm_store_si128(bitbuf210, buf210);
/* We also have to zero out the tail */
size_t left_to_z = len - (300*8 + i);
__m128i *bitbuf_tail = (__m128i*)(bitbuffer + 300 + i/8);
while (left_to_z >= 64) {
STORE4(z, z, z, z, bitbuf_tail);
left_to_z -= 64;
}
while (left_to_z >= 16) {
_mm_store_si128(bitbuf_tail++, z);
left_to_z -= 16;
}
uint8_t *tail_bytes = (uint8_t*)bitbuf_tail;
while (left_to_z--) {
*tail_bytes++ = 0;
}
ALIGNED_(16) uint64_t final[9] = {0};
__m128i next12, next34, next56;
next12 = z;
next34 = z;
next56 = z;
for(; (i + 72 < len); i += 32) {
__m128i in1in2, in3in4;
__m128i in1in2_, in3in4_;
__m128i ab1, ab2, ab3, ab4;
__m128i cd1, cd2, cd3, cd4;
READ_NEXT(input, i, in1in2, in3in4);
READ_NEXT(bitbuffer, i, in1in2_, in3in4_);
in1in2 = _mm_xor_si128(_mm_xor_si128(in1in2, in1in2_), next12);
in3in4 = _mm_xor_si128(in3in4, in3in4_);
NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);
__m128i a2_ = _mm_slli_si128(ab2, 8);
__m128i ab1_next34 = _mm_xor_si128(next34, ab1);
in3in4 = _mm_xor_si128(in3in4, ab1_next34);
in3in4 = _mm_xor_si128(a2_, in3in4);
NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);
__m128i b2c2 = _mm_alignr_epi8(cd2, ab2, 8);
__m128i a4_ = _mm_slli_si128(ab4, 8);
a4_ = _mm_xor_si128(b2c2, a4_);
next12 = _mm_xor_si128(ab3, a4_);
next12 = _mm_xor_si128(next12, cd1);
__m128i d2_ = _mm_srli_si128(cd2, 8);
__m128i b4c4 = _mm_alignr_epi8(cd4, ab4, 8);
next12 = _mm_xor_si128(next12, next56);
next34 = _mm_xor_si128(cd3, _mm_xor_si128(b4c4, d2_));
next56 = _mm_srli_si128(cd4, 8);
}
memcpy(final, input+(i / sizeof(uint64_t)), len-i);
__m128i *final128 = (__m128i*)final;
_mm_store_si128(final128, _mm_xor_si128(_mm_load_si128(final128), next12));
++final128;
_mm_store_si128(final128, _mm_xor_si128(_mm_load_si128(final128), next34));
++final128;
_mm_store_si128(final128, _mm_xor_si128(_mm_load_si128(final128), next56));
uint8_t* final_bytes = (uint8_t*) final;
for(size_t j = 0; j < (len-i); j++) {
crc = crc_table[(crc ^ final_bytes[j] ^ bitbufferbytes[(j+i)]) & 0xff] ^ (crc >> 8);
}
return crc;
}
Z_INTERNAL uint32_t crc32_chorba_sse41(uint32_t crc, const uint8_t *buf, size_t len) {
uint32_t c;
uint64_t* aligned_buf;
size_t aligned_len;
c = (~crc) & 0xffffffff;
uintptr_t algn_diff = ((uintptr_t)16 - ((uintptr_t)buf & 15)) & 15;
if (algn_diff < len) {
if (algn_diff) {
c = crc32_braid_internal(c, buf, algn_diff);
}
aligned_buf = (uint64_t*) (buf + algn_diff);
aligned_len = len - algn_diff;
if(aligned_len > CHORBA_LARGE_THRESHOLD) {
c = crc32_chorba_118960_nondestructive(c, (z_word_t*) aligned_buf, aligned_len);
} else if (aligned_len > CHORBA_MEDIUM_LOWER_THRESHOLD &&
aligned_len <= CHORBA_MEDIUM_UPPER_THRESHOLD) {
c = crc32_chorba_32768_nondestructive_sse41(c, aligned_buf, aligned_len);
} else if (aligned_len > CHORBA_SMALL_THRESHOLD_64BIT) {
c = chorba_small_nondestructive_sse2(c, aligned_buf, aligned_len);
} else {
c = crc32_braid_internal(c, (uint8_t*) aligned_buf, aligned_len);
}
}
else {
c = crc32_braid_internal(c, buf, len);
}
/* Return the CRC, post-conditioned. */
return c ^ 0xffffffff;
}
#endif

arch/x86/x86_features.c

@@ -85,6 +85,7 @@ void Z_INTERNAL x86_check_features(struct x86_cpu_features *features) {
features->has_sse2 = edx & 0x4000000;
features->has_ssse3 = ecx & 0x200;
features->has_sse41 = ecx & 0x80000;
features->has_sse42 = ecx & 0x100000;
features->has_pclmulqdq = ecx & 0x2;

arch/x86/x86_features.h

@@ -17,6 +17,7 @@ struct x86_cpu_features {
int has_bmi2;
int has_sse2;
int has_ssse3;
int has_sse41;
int has_sse42;
int has_pclmulqdq;
int has_vpclmulqdq;

arch/x86/x86_functions.h

@@ -11,7 +11,7 @@
* Further context:
* https://developercommunity.visualstudio.com/t/Stack-corruption-with-v142-toolchain-whe/10853479 */
#if defined(_MSC_VER) && !defined(_M_AMD64) && _MSC_VER >= 1920 && _MSC_VER <= 1929
#define NO_CHORBA_SSE2
#define NO_CHORBA_SSE
#endif
#ifdef X86_SSE2
@@ -36,6 +36,12 @@ uint8_t* chunkmemset_safe_ssse3(uint8_t *out, uint8_t *from, unsigned len, unsig
void inflate_fast_ssse3(PREFIX3(stream) *strm, uint32_t start);
#endif
#ifdef X86_SSE41
# if !defined(WITHOUT_CHORBA)
uint32_t crc32_chorba_sse41(uint32_t crc32, const uint8_t *buf, size_t len);
# endif
#endif
#ifdef X86_SSE42
uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif
@@ -104,7 +110,7 @@ uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
# define native_longest_match longest_match_sse2
# undef native_longest_match_slow
# define native_longest_match_slow longest_match_slow_sse2
# if !defined(WITHOUT_CHORBA) && !defined(NO_CHORBA_SSE2)
# if !defined(WITHOUT_CHORBA) && !defined(NO_CHORBA_SSE)
# undef native_crc32
# define native_crc32 crc32_chorba_sse2
# endif
@@ -119,6 +125,10 @@ uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
# undef native_inflate_fast
# define native_inflate_fast inflate_fast_ssse3
# endif
# if !defined(WITHOUT_CHORBA) && defined(X86_SSE41) && defined(__SSE4_1__) && !defined(NO_CHORBA_SSE)
# undef native_crc32
# define native_crc32 crc32_chorba_sse41
# endif
// X86 - SSE4.2
# if defined(X86_SSE42) && defined(__SSE4_2__)
# undef native_adler32_fold_copy

cmake/detect-intrinsics.cmake

@@ -565,6 +565,29 @@ macro(check_ssse3_intrinsics)
)
endmacro()
macro(check_sse41_intrinsics)
if(NOT NATIVEFLAG)
if(CMAKE_C_COMPILER_ID MATCHES "Intel")
if(CMAKE_HOST_UNIX OR APPLE)
set(SSE41FLAG "-msse4.1")
else()
set(SSE41FLAG "/arch:SSE4.1")
endif()
elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
set(SSE41FLAG "-msse4.1")
endif()
endif()
# Check whether compiler supports SSE4.1 intrinsics
set(CMAKE_REQUIRED_FLAGS "${SSE41FLAG} ${NATIVEFLAG} ${ZNOLTOFLAG}")
check_c_source_compiles(
"#include <smmintrin.h>
__m128i f(__m128i a, __m128i b) { return _mm_min_epi32(a, b); }
int main(void) { return 0; }"
HAVE_SSE41_INTRIN
)
set(CMAKE_REQUIRED_FLAGS)
endmacro()
macro(check_sse42_intrinsics)
if(NOT NATIVEFLAG)
if(CMAKE_C_COMPILER_ID MATCHES "Intel")

configure

@@ -111,6 +111,7 @@ avx512vnniflag="${avx512flag} -mavx512vnni"
avx2flag="-mavx2 -mbmi2"
sse2flag="-msse2"
ssse3flag="-mssse3"
sse41flag="-msse4.1"
sse42flag="-msse4.2"
pclmulflag="-mpclmul"
vpclmulflag="-mvpclmulqdq -mavx512f"
@@ -1590,6 +1591,22 @@ EOF
fi
}
check_sse41_intrinsics() {
# Check whether compiler supports SSE4.1 intrinsics
cat > $test.c << EOF
#include <smmintrin.h>
__m128i f(__m128i a, __m128i b) { return _mm_min_epi32(a, b); }
int main(void) { return 0; }
EOF
if try ${CC} ${CFLAGS} ${sse41flag} $test.c; then
echo "Checking for SSE4.1 intrinsics ... Yes." | tee -a configure.log
HAVE_SSE41_INTRIN=1
else
echo "Checking for SSE4.1 intrinsics ... No." | tee -a configure.log
HAVE_SSE41_INTRIN=0
fi
}
check_sse42_intrinsics() {
# Check whether compiler supports SSE4.2 intrinsics
cat > $test.c << EOF
@@ -1717,6 +1734,15 @@ case "${ARCH}" in
ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_ssse3.lo chunkset_ssse3.lo"
fi
check_sse41_intrinsics
if test ${HAVE_SSE41_INTRIN} -eq 1; then
CFLAGS="${CFLAGS} -DX86_SSE41"
SFLAGS="${SFLAGS} -DX86_SSE41"
ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} chorba_sse41.o"
ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} chorba_sse41.lo"
fi
check_sse42_intrinsics
if test ${HAVE_SSE42_INTRIN} -eq 1; then
@@ -2263,6 +2289,7 @@ sed < $SRCDIR/$ARCHDIR/Makefile.in "
/^AVX512VNNIFLAG *=/s#=.*#=$avx512vnniflag#
/^SSE2FLAG *=/s#=.*#=$sse2flag#
/^SSSE3FLAG *=/s#=.*#=$ssse3flag#
/^SSE41FLAG *=/s#=.*#=$sse41flag#
/^SSE42FLAG *=/s#=.*#=$sse42flag#
/^PCLMULFLAG *=/s#=.*#=$pclmulflag#
/^VPCLMULFLAG *=/s#=.*#=$vpclmulflag#

functable.c

@@ -75,7 +75,7 @@ static void init_functable(void) {
{
ft.chunkmemset_safe = &chunkmemset_safe_sse2;
ft.chunksize = &chunksize_sse2;
#if !defined(WITHOUT_CHORBA) && !defined(NO_CHORBA_SSE2)
#if !defined(WITHOUT_CHORBA) && !defined(NO_CHORBA_SSE)
ft.crc32 = &crc32_chorba_sse2;
#endif
ft.inflate_fast = &inflate_fast_sse2;
@@ -95,6 +95,16 @@ static void init_functable(void) {
ft.inflate_fast = &inflate_fast_ssse3;
}
#endif
// X86 - SSE4.1
#ifdef X86_SSE41
if (cf.x86.has_sse41) {
#if !defined(WITHOUT_CHORBA) && !defined(NO_CHORBA_SSE)
ft.crc32 = &crc32_chorba_sse41;
#endif
}
#endif
// X86 - SSE4.2
#ifdef X86_SSE42
if (cf.x86.has_sse42) {

test/benchmarks/benchmark_crc32.cc

@@ -69,8 +69,11 @@ BENCHMARK_CRC32(native, native_crc32, 1);
#else
#ifndef WITHOUT_CHORBA
# if defined(X86_SSE2) && !defined(NO_CHORBA_SSE2)
# if defined(X86_SSE2) && !defined(NO_CHORBA_SSE)
BENCHMARK_CRC32(chorba_sse2, crc32_chorba_sse2, test_cpu_features.x86.has_sse2);
# if defined(X86_SSE41) && !defined(NO_CHORBA_SSE)
BENCHMARK_CRC32(chorba_sse41, crc32_chorba_sse41, test_cpu_features.x86.has_sse41);
# endif
# endif
#endif

test/test_crc32.cc

@@ -193,7 +193,8 @@ static const crc32_test tests[] = {
"h{bcmdC+a;t+Cf{6Y_dFq-{X4Yu&7uNfVDh?q&_u.UWJU],-GiH7ADzb7-V.Q%4=+v!$L9W+T=bP]$_:]Vyg}A.ygD.r;h-D]m%&"
"h{bcmdC+a;t+Cf{6Y_dFq-{X4Yu&7uNfVDh?q&_u.UWJU],-GiH7ADzb7-V.Q%4=+v!$L9W+T=bP]$_:]Vyg}A.ygD.r;h-D]m%&"
"h{bcmdC+a;t+Cf{6Y_dFq-{X4Yu&7uNfVDh?q&_u.UWJU],-GiH7ADzb7-V.Q%4=+v!$L9W+T=bP]$_:]Vyg}A.ygD.r;h-D]m%&", 600, 0x888AFA5B},
{0x0, buf32k, 32768, 0x217726B2}
{0x0, buf32k, 32768, 0x217726B2},
{0x0, buf32k, 16384, 0xE81722F0}
};
class crc32_variant : public ::testing::TestWithParam<crc32_test> {
@@ -281,8 +282,11 @@ TEST_CRC32(pclmulqdq, crc32_pclmulqdq, test_cpu_features.x86.has_pclmulqdq)
#ifdef X86_VPCLMULQDQ_CRC
TEST_CRC32(vpclmulqdq, crc32_vpclmulqdq, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx512_common && test_cpu_features.x86.has_vpclmulqdq))
#endif
#if !defined(WITHOUT_CHORBA) && defined(X86_SSE2) && !defined(NO_CHORBA_SSE2)
#if !defined(WITHOUT_CHORBA) && defined(X86_SSE2) && !defined(NO_CHORBA_SSE)
TEST_CRC32(chorba_sse2, crc32_chorba_sse2, test_cpu_features.x86.has_sse2)
#endif
#if !defined(WITHOUT_CHORBA) && defined(X86_SSE41) && !defined(NO_CHORBA_SSE)
TEST_CRC32(chorba_sse41, crc32_chorba_sse41, test_cpu_features.x86.has_sse41)
#endif
#endif