diff --git a/arch/arm/chunkset_neon.c b/arch/arm/chunkset_neon.c index 0016f7f2..da9d7f95 100644 --- a/arch/arm/chunkset_neon.c +++ b/arch/arm/chunkset_neon.c @@ -9,8 +9,6 @@ typedef uint8x16_t chunk_t; -#define CHUNK_SIZE 16 - #define HAVE_CHUNKMEMSET_2 #define HAVE_CHUNKMEMSET_4 #define HAVE_CHUNKMEMSET_8 diff --git a/arch/x86/chunkset_avx2.c b/arch/x86/chunkset_avx2.c index 26bd004c..86cbaaa8 100644 --- a/arch/x86/chunkset_avx2.c +++ b/arch/x86/chunkset_avx2.c @@ -6,16 +6,17 @@ #ifdef X86_AVX2 #include #include "../generic/chunk_permute_table.h" +#include "x86_intrins.h" typedef __m256i chunk_t; - -#define CHUNK_SIZE 32 +typedef __m128i halfchunk_t; #define HAVE_CHUNKMEMSET_2 #define HAVE_CHUNKMEMSET_4 #define HAVE_CHUNKMEMSET_8 #define HAVE_CHUNKMEMSET_16 #define HAVE_CHUNK_MAG +#define HAVE_HALF_CHUNK /* Populate don't cares so that this is a direct lookup (with some indirection into the permute table), because dist can * never be 0 - 2, we'll start with an offset, subtracting 3 from the input */ @@ -51,6 +52,10 @@ static const lut_rem_pair perm_idx_lut[29] = { {11 * 32 + 16 * 14, 1} /* 31 */ }; +static const uint16_t half_rem_vals[13] = { + 1, 0, 1, 4, 2, 0, 7, 6, 5, 4, 3, 2, 1 +}; + static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) { int16_t tmp; memcpy(&tmp, from, sizeof(tmp)); @@ -120,6 +125,51 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t return ret_vec; } +static inline void halfchunkmemset_2(uint8_t *from, halfchunk_t *chunk) { + int16_t tmp; + memcpy(&tmp, from, sizeof(tmp)); + *chunk = _mm_set1_epi16(tmp); +} + +static inline void halfchunkmemset_4(uint8_t *from, halfchunk_t *chunk) { + int32_t tmp; + memcpy(&tmp, from, sizeof(tmp)); + *chunk = _mm_set1_epi32(tmp); +} + +static inline void halfchunkmemset_8(uint8_t *from, halfchunk_t *chunk) { + int64_t tmp; + memcpy(&tmp, from, sizeof(tmp)); + *chunk = _mm_set1_epi64x(tmp); +} + +static inline void loadhalfchunk(uint8_t const *s, halfchunk_t *chunk) { + *chunk = _mm_loadu_si128((__m128i *)s); +} + +static inline void storehalfchunk(uint8_t *out, halfchunk_t *chunk) { + _mm_storeu_si128((__m128i *)out, *chunk); +} + +static inline chunk_t halfchunk2whole(halfchunk_t chunk) { + /* We zero extend mostly to appease some memory sanitizers. These bytes are ultimately + * unlikely to be actually written or read from */ + return _mm256_zextsi128_si256(chunk); +} + +static inline halfchunk_t GET_HALFCHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) { + lut_rem_pair lut_rem = perm_idx_lut[dist - 3]; + __m128i perm_vec, ret_vec; + __msan_unpoison(buf + dist, 16 - dist); + ret_vec = _mm_loadu_si128((__m128i*)buf); + *chunk_rem = half_rem_vals[dist - 3]; + + perm_vec = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx)); + ret_vec = _mm_shuffle_epi8(ret_vec, perm_vec); + + return ret_vec; +} + #define CHUNKSIZE chunksize_avx2 #define CHUNKCOPY chunkcopy_avx2 #define CHUNKUNROLL chunkunroll_avx2 diff --git a/arch/x86/chunkset_sse2.c b/arch/x86/chunkset_sse2.c index c402c0ee..0b1593b5 100644 --- a/arch/x86/chunkset_sse2.c +++ b/arch/x86/chunkset_sse2.c @@ -9,8 +9,6 @@ typedef __m128i chunk_t; -#define CHUNK_SIZE 16 - #define HAVE_CHUNKMEMSET_2 #define HAVE_CHUNKMEMSET_4 #define HAVE_CHUNKMEMSET_8 diff --git a/arch/x86/chunkset_ssse3.c b/arch/x86/chunkset_ssse3.c index 722ecd3d..deedb6ce 100644 --- a/arch/x86/chunkset_ssse3.c +++ b/arch/x86/chunkset_ssse3.c @@ -10,8 +10,6 @@ typedef __m128i chunk_t; -#define CHUNK_SIZE 16 - #define HAVE_CHUNKMEMSET_2 #define HAVE_CHUNKMEMSET_4 #define HAVE_CHUNKMEMSET_8 diff --git a/chunkset_tpl.h b/chunkset_tpl.h index 64f2bbec..9330e804 100644 --- a/chunkset_tpl.h +++ b/chunkset_tpl.h @@ -5,10 +5,6 @@ #include "zbuild.h" #include -#if CHUNK_SIZE == 32 && defined(X86_SSSE3) -extern uint8_t* chunkmemset_ssse3(uint8_t *out, unsigned dist, unsigned len); -#endif - /* Returns the chunk size */ Z_INTERNAL uint32_t CHUNKSIZE(void) { return sizeof(chunk_t); @@ -91,20 +87,37 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t } #endif +#ifdef HAVE_HALF_CHUNK +static inline uint8_t* HALFCHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) { + halfchunk_t chunk; + int32_t align = ((len - 1) % sizeof(halfchunk_t)) + 1; + loadhalfchunk(from, &chunk); + storehalfchunk(out, &chunk); + out += align; + from += align; + len -= align; + while (len > 0) { + loadhalfchunk(from, &chunk); + storehalfchunk(out, &chunk); + out += sizeof(halfchunk_t); + from += sizeof(halfchunk_t); + len -= sizeof(halfchunk_t); + } + return out; +} +#endif + /* Copy DIST bytes from OUT - DIST into OUT + DIST * k, for 0 <= k < LEN/DIST. Return OUT + LEN. */ -Z_INTERNAL uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) { +static inline uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) { /* Debug performance related issues when len < sizeof(uint64_t): Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */ Assert(dist > 0, "chunkmemset cannot have a distance 0"); - /* Only AVX2 */ -#if CHUNK_SIZE == 32 && defined(X86_SSSE3) - if (len <= 16) { - return chunkmemset_ssse3(out, dist, len); - } -#endif uint8_t *from = out - dist; + chunk_t chunk_load; + uint32_t chunk_mod = 0; + uint32_t adv_amount; if (dist == 1) { memset(out, *from, len); @@ -113,10 +126,45 @@ Z_INTERNAL uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) { return CHUNKCOPY(out, out - dist, len); } - chunk_t chunk_load; - uint32_t chunk_mod = 0; + /* Only AVX2 as there's 128 bit vectors and 256 bit. We allow for shorter vector + * lengths because they serve to allow more cases to fall into chunkcopy, as the + * distance of the shorter length is still deemed a safe distance. We rewrite this + * here rather than calling the ssse3 variant directly now because doing so required + * dispatching to another function and broke inlining for this function entirely. We + * also can merge an assert and some remainder peeling behavior into the same code blocks, + * making the code a little smaller. */ +#ifdef HAVE_HALF_CHUNK + if (len <= sizeof(halfchunk_t)) { + if (dist > sizeof(halfchunk_t)) { + return HALFCHUNKCOPY(out, out - dist, len); + } + + halfchunk_t halfchunk_load; + + if (dist == 2) { + halfchunkmemset_2(from, &halfchunk_load); + } else if (dist == 4) { + halfchunkmemset_4(from, &halfchunk_load); + } else if (dist == 8) { + halfchunkmemset_8(from, &halfchunk_load); + } else if (dist == 16) { + loadhalfchunk(from, &halfchunk_load); + } else { + halfchunk_load = GET_HALFCHUNK_MAG(from, &chunk_mod, dist); + } + + adv_amount = sizeof(halfchunk_t) - chunk_mod; + while (len >= sizeof(halfchunk_t)) { + storehalfchunk(out, &halfchunk_load); + len -= adv_amount; + out += adv_amount; + } + + chunk_load = halfchunk2whole(halfchunk_load); + goto rem_bytes; + } +#endif - /* TODO: possibly build up a permutation table for this if not an even modulus */ #ifdef HAVE_CHUNKMEMSET_2 if (dist == 2) { chunkmemset_2(from, &chunk_load); @@ -143,28 +191,28 @@ Z_INTERNAL uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) { chunk_load = GET_CHUNK_MAG(from, &chunk_mod, dist); } - /* If we're lucky enough and dist happens to be an even modulus of our vector length, - * we can do two stores per loop iteration, which for most ISAs, especially x86, is beneficial */ - if (chunk_mod == 0) { - while (len >= (2 * sizeof(chunk_t))) { - storechunk(out, &chunk_load); - storechunk(out + sizeof(chunk_t), &chunk_load); - out += 2 * sizeof(chunk_t); - len -= 2 * sizeof(chunk_t); - } + adv_amount = sizeof(chunk_t) - chunk_mod; + + while (len >= (2 * sizeof(chunk_t))) { + storechunk(out, &chunk_load); + storechunk(out + adv_amount, &chunk_load); + out += 2 * adv_amount; + len -= 2 * adv_amount; } /* If we don't have a "dist" length that divides evenly into a vector * register, we can write the whole vector register but we need only * advance by the amount of the whole string that fits in our chunk_t. * If we do divide evenly into the vector length, adv_amount = chunk_t size*/ - uint32_t adv_amount = sizeof(chunk_t) - chunk_mod; while (len >= sizeof(chunk_t)) { storechunk(out, &chunk_load); len -= adv_amount; out += adv_amount; } +#ifdef HAVE_HALF_CHUNK +rem_bytes: +#endif if (len) { memcpy(out, &chunk_load, len); out += len;