Make chunkset_avx2 half chunk aware

This gives us appreciable gains on a number of fronts. The first is that
we're inlining a pretty hot function that was regularly getting dispatched
to. Another is that we're able to do a safe lagged copy of a smaller
distance, so CHUNKCOPY gets its teeth back here for smaller sizes without
having to dispatch to another function.

We're also now doing two overlapping writes at once and letting the CPU
do its store forwarding. This was an enhancement @dougallj had suggested
a while back.
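
As a rough sketch of that idea (based on the chunkset_tpl.h change below, using
its names; adv_amount is the number of pattern-aligned bytes covered by each
32 byte store):

    /* Two overlapping stores per iteration; the stores overlap by chunk_mod
     * bytes and the CPU's store buffer/forwarding absorbs the overlap. */
    adv_amount = sizeof(chunk_t) - chunk_mod;
    while (len >= (2 * sizeof(chunk_t))) {
        storechunk(out, &chunk_load);
        storechunk(out + adv_amount, &chunk_load);
        out += 2 * adv_amount;
        len -= 2 * adv_amount;
    }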

Additionally, the "half chunk mag" here is fundamentally less
complicated because it doesn't require synthesizing cross-lane permutes
with a blend operation, so we can optimistically do that first if the
length is small enough that a full 32 byte chunk doesn't make sense.
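
For illustration (mirroring the GET_HALFCHUNK_MAG helper added below, whose
names are reused here), the 16 byte magazine is a single in-lane shuffle from
the existing permute table:

    /* pshufb shuffles freely within 16 bytes, so one shuffle replicates the
     * pattern across the half chunk; no cross-lane permute + blend needed
     * as in the 32 byte path. */
    __m128i perm_vec = _mm_load_si128((__m128i *)(permute_table + lut_rem.idx));
    __m128i mag      = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)buf), perm_vec);
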
Author: Adam Stylinski, 2024-09-12 17:47:30 -04:00 (committed by Hans Kristian Rosbach)
parent b52e703417
commit e874b34e1a
5 changed files with 124 additions and 32 deletions

View File

@@ -9,8 +9,6 @@
typedef uint8x16_t chunk_t;
#define CHUNK_SIZE 16
#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8

View File

@@ -6,16 +6,17 @@
#ifdef X86_AVX2
#include <immintrin.h>
#include "../generic/chunk_permute_table.h"
#include "x86_intrins.h"
typedef __m256i chunk_t;
#define CHUNK_SIZE 32
typedef __m128i halfchunk_t;
#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
#define HAVE_CHUNKMEMSET_16
#define HAVE_CHUNK_MAG
#define HAVE_HALF_CHUNK
/* Populate don't cares so that this is a direct lookup (with some indirection into the permute table), because dist can
* never be 0 - 2, we'll start with an offset, subtracting 3 from the input */
@@ -51,6 +52,10 @@ static const lut_rem_pair perm_idx_lut[29] = {
{11 * 32 + 16 * 14, 1} /* 31 */
};
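/* Remainders of a 16 byte chunk (16 % dist) for dist 3..15, indexed by dist - 3 like the LUT above */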
static const uint16_t half_rem_vals[13] = {
1, 0, 1, 4, 2, 0, 7, 6, 5, 4, 3, 2, 1
};
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
int16_t tmp;
memcpy(&tmp, from, sizeof(tmp));
@@ -120,6 +125,51 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t
return ret_vec;
}
static inline void halfchunkmemset_2(uint8_t *from, halfchunk_t *chunk) {
int16_t tmp;
memcpy(&tmp, from, sizeof(tmp));
*chunk = _mm_set1_epi16(tmp);
}
static inline void halfchunkmemset_4(uint8_t *from, halfchunk_t *chunk) {
int32_t tmp;
memcpy(&tmp, from, sizeof(tmp));
*chunk = _mm_set1_epi32(tmp);
}
static inline void halfchunkmemset_8(uint8_t *from, halfchunk_t *chunk) {
int64_t tmp;
memcpy(&tmp, from, sizeof(tmp));
*chunk = _mm_set1_epi64x(tmp);
}
static inline void loadhalfchunk(uint8_t const *s, halfchunk_t *chunk) {
*chunk = _mm_loadu_si128((__m128i *)s);
}
static inline void storehalfchunk(uint8_t *out, halfchunk_t *chunk) {
_mm_storeu_si128((__m128i *)out, *chunk);
}
static inline chunk_t halfchunk2whole(halfchunk_t chunk) {
/* We zero extend mostly to appease some memory sanitizers. These bytes are ultimately
* unlikely to be actually written or read from */
return _mm256_zextsi128_si256(chunk);
}
static inline halfchunk_t GET_HALFCHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
__m128i perm_vec, ret_vec;
__msan_unpoison(buf + dist, 16 - dist);
ret_vec = _mm_loadu_si128((__m128i*)buf);
*chunk_rem = half_rem_vals[dist - 3];
perm_vec = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
ret_vec = _mm_shuffle_epi8(ret_vec, perm_vec);
return ret_vec;
}
#define CHUNKSIZE chunksize_avx2
#define CHUNKCOPY chunkcopy_avx2
#define CHUNKUNROLL chunkunroll_avx2

View File

@@ -9,8 +9,6 @@
typedef __m128i chunk_t;
#define CHUNK_SIZE 16
#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8

View File

@@ -10,8 +10,6 @@
typedef __m128i chunk_t;
#define CHUNK_SIZE 16
#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8

View File

@@ -5,10 +5,6 @@
#include "zbuild.h"
#include <stdlib.h>
#if CHUNK_SIZE == 32 && defined(X86_SSSE3)
extern uint8_t* chunkmemset_ssse3(uint8_t *out, unsigned dist, unsigned len);
#endif
/* Returns the chunk size */
Z_INTERNAL uint32_t CHUNKSIZE(void) {
return sizeof(chunk_t);
@@ -91,20 +87,37 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t
}
#endif
#ifdef HAVE_HALF_CHUNK
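/* Copies len bytes in halfchunk_t sized stores, like CHUNKCOPY at half the width; it may write up to
 * sizeof(halfchunk_t) - 1 bytes beyond out + len, so the caller needs that much slack in the buffer. */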
static inline uint8_t* HALFCHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
halfchunk_t chunk;
int32_t align = ((len - 1) % sizeof(halfchunk_t)) + 1;
loadhalfchunk(from, &chunk);
storehalfchunk(out, &chunk);
out += align;
from += align;
len -= align;
while (len > 0) {
loadhalfchunk(from, &chunk);
storehalfchunk(out, &chunk);
out += sizeof(halfchunk_t);
from += sizeof(halfchunk_t);
len -= sizeof(halfchunk_t);
}
return out;
}
#endif
/* Copy DIST bytes from OUT - DIST into OUT + DIST * k, for 0 <= k < LEN/DIST.
Return OUT + LEN. */
Z_INTERNAL uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) {
static inline uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) {
/* Debug performance related issues when len < sizeof(uint64_t):
Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */
Assert(dist > 0, "chunkmemset cannot have a distance 0");
/* Only AVX2 */
#if CHUNK_SIZE == 32 && defined(X86_SSSE3)
if (len <= 16) {
return chunkmemset_ssse3(out, dist, len);
}
#endif
uint8_t *from = out - dist;
chunk_t chunk_load;
uint32_t chunk_mod = 0;
uint32_t adv_amount;
if (dist == 1) {
memset(out, *from, len);
@@ -113,10 +126,45 @@ Z_INTERNAL uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) {
return CHUNKCOPY(out, out - dist, len);
}
chunk_t chunk_load;
uint32_t chunk_mod = 0;
/* Only AVX2 as there's 128 bit vectors and 256 bit. We allow for shorter vector
* lengths because they serve to allow more cases to fall into chunkcopy, as the
* distance of the shorter length is still deemed a safe distance. We rewrite this
* here rather than calling the ssse3 variant directly now because doing so required
* dispatching to another function and broke inlining for this function entirely. We
* also can merge an assert and some remainder peeling behavior into the same code blocks,
* making the code a little smaller. */
#ifdef HAVE_HALF_CHUNK
if (len <= sizeof(halfchunk_t)) {
if (dist > sizeof(halfchunk_t)) {
return HALFCHUNKCOPY(out, out - dist, len);
}
halfchunk_t halfchunk_load;
if (dist == 2) {
halfchunkmemset_2(from, &halfchunk_load);
} else if (dist == 4) {
halfchunkmemset_4(from, &halfchunk_load);
} else if (dist == 8) {
halfchunkmemset_8(from, &halfchunk_load);
} else if (dist == 16) {
loadhalfchunk(from, &halfchunk_load);
} else {
halfchunk_load = GET_HALFCHUNK_MAG(from, &chunk_mod, dist);
}
adv_amount = sizeof(halfchunk_t) - chunk_mod;
while (len >= sizeof(halfchunk_t)) {
storehalfchunk(out, &halfchunk_load);
len -= adv_amount;
out += adv_amount;
}
chunk_load = halfchunk2whole(halfchunk_load);
goto rem_bytes;
}
#endif
/* TODO: possibly build up a permutation table for this if not an even modulus */
#ifdef HAVE_CHUNKMEMSET_2
if (dist == 2) {
chunkmemset_2(from, &chunk_load);
@@ -143,28 +191,28 @@ Z_INTERNAL uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) {
chunk_load = GET_CHUNK_MAG(from, &chunk_mod, dist);
}
/* If we're lucky enough and dist happens to be an even modulus of our vector length,
* we can do two stores per loop iteration, which for most ISAs, especially x86, is beneficial */
if (chunk_mod == 0) {
while (len >= (2 * sizeof(chunk_t))) {
storechunk(out, &chunk_load);
storechunk(out + sizeof(chunk_t), &chunk_load);
out += 2 * sizeof(chunk_t);
len -= 2 * sizeof(chunk_t);
}
adv_amount = sizeof(chunk_t) - chunk_mod;
while (len >= (2 * sizeof(chunk_t))) {
storechunk(out, &chunk_load);
storechunk(out + adv_amount, &chunk_load);
out += 2 * adv_amount;
len -= 2 * adv_amount;
}
/* If we don't have a "dist" length that divides evenly into a vector
* register, we can write the whole vector register but we need only
* advance by the amount of the whole string that fits in our chunk_t.
* If we do divide evenly into the vector length, adv_amount = chunk_t size*/
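/* e.g. dist == 5 with a 32 byte chunk gives chunk_mod == 2, so each store writes 32 bytes
 * but out only advances by 30: six whole copies of the 5 byte pattern per store. */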
uint32_t adv_amount = sizeof(chunk_t) - chunk_mod;
while (len >= sizeof(chunk_t)) {
storechunk(out, &chunk_load);
len -= adv_amount;
out += adv_amount;
}
#ifdef HAVE_HALF_CHUNK
rem_bytes:
#endif
if (len) {
memcpy(out, &chunk_load, len);
out += len;