Make chunkset_avx2 half chunk aware

This gives us appreciable gains on a number of fronts. The first is that
we're inlining a pretty hot function that was regularly getting dispatched
to. Another is that we're able to do a safe lagged copy of a smaller
distance, so CHUNKCOPY gets its teeth back here for smaller sizes without
having to dispatch to another function.

We're also now doing two overlapping writes at once and letting the CPU
do its store forwarding. This was an enhancement @dougallj had suggested
a while back.
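
As a rough sketch of that idea (based on the chunkset_tpl.h change below, using
its names; adv_amount is the number of pattern-aligned bytes covered by each
32 byte store):

    /* Two overlapping stores per iteration; the stores overlap by chunk_mod
     * bytes and the CPU's store buffer/forwarding absorbs the overlap. */
    adv_amount = sizeof(chunk_t) - chunk_mod;
    while (len >= (2 * sizeof(chunk_t))) {
        storechunk(out, &chunk_load);
        storechunk(out + adv_amount, &chunk_load);
        out += 2 * adv_amount;
        len -= 2 * adv_amount;
    }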

Additionally, the "half chunk mag" here is fundamentally less
complicated because it doesn't require synthesizing cross-lane permutes
with a blend operation, so we can optimistically do that first if the
length is small enough that a full 32 byte chunk doesn't make sense.
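
For illustration (mirroring the GET_HALFCHUNK_MAG helper added below, whose
names are reused here), the 16 byte magazine is a single in-lane shuffle from
the existing permute table:

    /* pshufb shuffles freely within 16 bytes, so one shuffle replicates the
     * pattern across the half chunk; no cross-lane permute + blend needed
     * as in the 32 byte path. */
    __m128i perm_vec = _mm_load_si128((__m128i *)(permute_table + lut_rem.idx));
    __m128i mag      = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)buf), perm_vec);
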
Author: Adam Stylinski, 2024-09-12 17:47:30 -04:00 (committed by Hans Kristian Rosbach)
parent b52e703417
commit e874b34e1a
5 changed files with 124 additions and 32 deletions

View File

@@ -9,8 +9,6 @@
typedef uint8x16_t chunk_t;
#define CHUNK_SIZE 16
#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8

View File

@@ -6,16 +6,17 @@
#ifdef X86_AVX2
#include <immintrin.h>
#include "../generic/chunk_permute_table.h"
#include "x86_intrins.h"
typedef __m256i chunk_t;
#define CHUNK_SIZE 32
typedef __m128i halfchunk_t;
#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
#define HAVE_CHUNKMEMSET_16
#define HAVE_CHUNK_MAG
#define HAVE_HALF_CHUNK
/* Populate don't cares so that this is a direct lookup (with some indirection into the permute table), because dist can
* never be 0 - 2, we'll start with an offset, subtracting 3 from the input */
@@ -51,6 +52,10 @@ static const lut_rem_pair perm_idx_lut[29] = {
{11 * 32 + 16 * 14, 1} /* 31 */
};
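/* Remainders of a 16 byte chunk (16 % dist) for dist 3..15, indexed by dist - 3 like the LUT above */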
static const uint16_t half_rem_vals[13] = {
1, 0, 1, 4, 2, 0, 7, 6, 5, 4, 3, 2, 1
};
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
int16_t tmp;
memcpy(&tmp, from, sizeof(tmp));
@@ -120,6 +125,51 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t
return ret_vec;
}
static inline void halfchunkmemset_2(uint8_t *from, halfchunk_t *chunk) {
int16_t tmp;
memcpy(&tmp, from, sizeof(tmp));
*chunk = _mm_set1_epi16(tmp);
}
static inline void halfchunkmemset_4(uint8_t *from, halfchunk_t *chunk) {
int32_t tmp;
memcpy(&tmp, from, sizeof(tmp));
*chunk = _mm_set1_epi32(tmp);
}
static inline void halfchunkmemset_8(uint8_t *from, halfchunk_t *chunk) {
int64_t tmp;
memcpy(&tmp, from, sizeof(tmp));
*chunk = _mm_set1_epi64x(tmp);
}
static inline void loadhalfchunk(uint8_t const *s, halfchunk_t *chunk) {
*chunk = _mm_loadu_si128((__m128i *)s);
}
static inline void storehalfchunk(uint8_t *out, halfchunk_t *chunk) {
_mm_storeu_si128((__m128i *)out, *chunk);
}
static inline chunk_t halfchunk2whole(halfchunk_t chunk) {
/* We zero extend mostly to appease some memory sanitizers. These bytes are ultimately
* unlikely to be actually written or read from */
return _mm256_zextsi128_si256(chunk);
}
static inline halfchunk_t GET_HALFCHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
__m128i perm_vec, ret_vec;
__msan_unpoison(buf + dist, 16 - dist);
ret_vec = _mm_loadu_si128((__m128i*)buf);
*chunk_rem = half_rem_vals[dist - 3];
perm_vec = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
ret_vec = _mm_shuffle_epi8(ret_vec, perm_vec);
return ret_vec;
}
#define CHUNKSIZE chunksize_avx2
#define CHUNKCOPY chunkcopy_avx2
#define CHUNKUNROLL chunkunroll_avx2

View File

@@ -9,8 +9,6 @@
typedef __m128i chunk_t;
#define CHUNK_SIZE 16
#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8

View File

@@ -10,8 +10,6 @@
typedef __m128i chunk_t;
#define CHUNK_SIZE 16
#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8

View File

@@ -5,10 +5,6 @@
#include "zbuild.h"
#include <stdlib.h>
#if CHUNK_SIZE == 32 && defined(X86_SSSE3)
extern uint8_t* chunkmemset_ssse3(uint8_t *out, unsigned dist, unsigned len);
#endif
/* Returns the chunk size */
Z_INTERNAL uint32_t CHUNKSIZE(void) {
return sizeof(chunk_t);
@@ -91,20 +87,37 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t
}
#endif
#ifdef HAVE_HALF_CHUNK
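/* Copies len bytes in halfchunk_t sized stores, like CHUNKCOPY at half the width; it may write up to
 * sizeof(halfchunk_t) - 1 bytes beyond out + len, so the caller needs that much slack in the buffer. */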
static inline uint8_t* HALFCHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
halfchunk_t chunk;
int32_t align = ((len - 1) % sizeof(halfchunk_t)) + 1;
loadhalfchunk(from, &chunk);
storehalfchunk(out, &chunk);
out += align;
from += align;
len -= align;
while (len > 0) {
loadhalfchunk(from, &chunk);
storehalfchunk(out, &chunk);
out += sizeof(halfchunk_t);
from += sizeof(halfchunk_t);
len -= sizeof(halfchunk_t);
}
return out;
}
#endif
/* Copy DIST bytes from OUT - DIST into OUT + DIST * k, for 0 <= k < LEN/DIST.
Return OUT + LEN. */
Z_INTERNAL uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) {
static inline uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) {
/* Debug performance related issues when len < sizeof(uint64_t):
Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */
Assert(dist > 0, "chunkmemset cannot have a distance 0");
/* Only AVX2 */
#if CHUNK_SIZE == 32 && defined(X86_SSSE3)
if (len <= 16) {
return chunkmemset_ssse3(out, dist, len);
}
#endif
uint8_t *from = out - dist;
chunk_t chunk_load;
uint32_t chunk_mod = 0;
uint32_t adv_amount;
if (dist == 1) {
memset(out, *from, len);
@@ -113,10 +126,45 @@ Z_INTERNAL uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) {
return CHUNKCOPY(out, out - dist, len);
}
chunk_t chunk_load;
uint32_t chunk_mod = 0;
/* Only AVX2 as there's 128 bit vectors and 256 bit. We allow for shorter vector
* lengths because they serve to allow more cases to fall into chunkcopy, as the
* distance of the shorter length is still deemed a safe distance. We rewrite this
* here rather than calling the ssse3 variant directly now because doing so required
* dispatching to another function and broke inlining for this function entirely. We
* also can merge an assert and some remainder peeling behavior into the same code blocks,
* making the code a little smaller. */
#ifdef HAVE_HALF_CHUNK
if (len <= sizeof(halfchunk_t)) {
if (dist > sizeof(halfchunk_t)) {
return HALFCHUNKCOPY(out, out - dist, len);
}
halfchunk_t halfchunk_load;
if (dist == 2) {
halfchunkmemset_2(from, &halfchunk_load);
} else if (dist == 4) {
halfchunkmemset_4(from, &halfchunk_load);
} else if (dist == 8) {
halfchunkmemset_8(from, &halfchunk_load);
} else if (dist == 16) {
loadhalfchunk(from, &halfchunk_load);
} else {
halfchunk_load = GET_HALFCHUNK_MAG(from, &chunk_mod, dist);
}
adv_amount = sizeof(halfchunk_t) - chunk_mod;
while (len >= sizeof(halfchunk_t)) {
storehalfchunk(out, &halfchunk_load);
len -= adv_amount;
out += adv_amount;
}
chunk_load = halfchunk2whole(halfchunk_load);
goto rem_bytes;
}
#endif
/* TODO: possibly build up a permutation table for this if not an even modulus */
#ifdef HAVE_CHUNKMEMSET_2
if (dist == 2) {
chunkmemset_2(from, &chunk_load);
@@ -143,28 +191,28 @@ Z_INTERNAL uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) {
chunk_load = GET_CHUNK_MAG(from, &chunk_mod, dist);
}
/* If we're lucky enough and dist happens to be an even modulus of our vector length,
* we can do two stores per loop iteration, which for most ISAs, especially x86, is beneficial */
if (chunk_mod == 0) {
while (len >= (2 * sizeof(chunk_t))) {
storechunk(out, &chunk_load);
storechunk(out + sizeof(chunk_t), &chunk_load);
out += 2 * sizeof(chunk_t);
len -= 2 * sizeof(chunk_t);
}
adv_amount = sizeof(chunk_t) - chunk_mod;
while (len >= (2 * sizeof(chunk_t))) {
storechunk(out, &chunk_load);
storechunk(out + adv_amount, &chunk_load);
out += 2 * adv_amount;
len -= 2 * adv_amount;
}
/* If we don't have a "dist" length that divides evenly into a vector
* register, we can write the whole vector register but we need only
* advance by the amount of the whole string that fits in our chunk_t.
* If we do divide evenly into the vector length, adv_amount = chunk_t size*/
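/* e.g. dist == 5 with a 32 byte chunk gives chunk_mod == 2, so each store writes 32 bytes
 * but out only advances by 30: six whole copies of the 5 byte pattern per store. */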
uint32_t adv_amount = sizeof(chunk_t) - chunk_mod;
while (len >= sizeof(chunk_t)) {
storechunk(out, &chunk_load);
len -= adv_amount;
out += adv_amount;
}
#ifdef HAVE_HALF_CHUNK
rem_bytes:
#endif
if (len) {
memcpy(out, &chunk_load, len);
out += len;