Mirror of https://github.com/GerbilSoft/zlib-ng.git, synced 2025-06-18 11:35:35 -04:00
Make chunkset_avx2 half chunk aware
This gives us appreciable gains on a number of fronts. The first is that we're now inlining a pretty hot function that was regularly being dispatched to. Another is that we're able to do a safe lagged copy at a smaller distance, so CHUNKCOPY gets its teeth back for smaller sizes without having to dispatch to another function. We're also now doing two overlapping writes at once and letting the CPU do its store forwarding, an enhancement @dougallj had suggested a while back. Additionally, the "half chunk mag" here is fundamentally less complicated because it doesn't require synthesizing cross-lane permutes with a blend operation, so we can optimistically do that first if len is small enough that a full 32 byte chunk doesn't make any sense.
This commit is contained in:
parent b52e703417
commit e874b34e1a
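For context, the source comment quoted further down states the contract of CHUNKMEMSET: copy DIST bytes from OUT - DIST into OUT + DIST * k for 0 <= k < LEN/DIST and return OUT + LEN. A rough scalar model of that contract (an illustrative sketch, not code from this commit) is the classic overlapping LZ77 match copy; the SIMD variants in the diff get their speed from writing whole vector registers, possibly a little past the logical end of the run, instead of one byte at a time.

/* Scalar reference model of the chunkmemset semantics (illustrative sketch only,
 * not part of the commit): replicate the `dist` bytes that precede `out` until
 * `len` bytes have been produced. Safe for any dist >= 1 because it copies
 * byte by byte, in order. */
static unsigned char *chunkmemset_scalar_ref(unsigned char *out, unsigned dist, unsigned len) {
    unsigned char *from = out - dist;
    while (len--)
        *out++ = *from++;
    return out;
}

The two overlapping stores per iteration that the commit message mentions are an optimization of this same loop: both stores write the same register contents, the pointer only advances by the amount that keeps the repeated pattern in phase, and the CPU's store-to-load forwarding hides the overlap.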
@@ -9,8 +9,6 @@

typedef uint8x16_t chunk_t;

#define CHUNK_SIZE 16

#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
@@ -6,16 +6,17 @@
#ifdef X86_AVX2
#include <immintrin.h>
#include "../generic/chunk_permute_table.h"
#include "x86_intrins.h"

typedef __m256i chunk_t;

#define CHUNK_SIZE 32
typedef __m128i halfchunk_t;

#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
#define HAVE_CHUNKMEMSET_16
#define HAVE_CHUNK_MAG
#define HAVE_HALF_CHUNK

/* Populate don't cares so that this is a direct lookup (with some indirection into the permute table), because dist can
 * never be 0 - 2, we'll start with an offset, subtracting 3 from the input */
@@ -51,6 +52,10 @@ static const lut_rem_pair perm_idx_lut[29] = {
    {11 * 32 + 16 * 14, 1} /* 31 */
};

static const uint16_t half_rem_vals[13] = {
    1, 0, 1, 4, 2, 0, 7, 6, 5, 4, 3, 2, 1
};

static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
    int16_t tmp;
    memcpy(&tmp, from, sizeof(tmp));
@@ -120,6 +125,51 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t
    return ret_vec;
}

static inline void halfchunkmemset_2(uint8_t *from, halfchunk_t *chunk) {
    int16_t tmp;
    memcpy(&tmp, from, sizeof(tmp));
    *chunk = _mm_set1_epi16(tmp);
}

static inline void halfchunkmemset_4(uint8_t *from, halfchunk_t *chunk) {
    int32_t tmp;
    memcpy(&tmp, from, sizeof(tmp));
    *chunk = _mm_set1_epi32(tmp);
}

static inline void halfchunkmemset_8(uint8_t *from, halfchunk_t *chunk) {
    int64_t tmp;
    memcpy(&tmp, from, sizeof(tmp));
    *chunk = _mm_set1_epi64x(tmp);
}

static inline void loadhalfchunk(uint8_t const *s, halfchunk_t *chunk) {
    *chunk = _mm_loadu_si128((__m128i *)s);
}

static inline void storehalfchunk(uint8_t *out, halfchunk_t *chunk) {
    _mm_storeu_si128((__m128i *)out, *chunk);
}

static inline chunk_t halfchunk2whole(halfchunk_t chunk) {
    /* We zero extend mostly to appease some memory sanitizers. These bytes are ultimately
     * unlikely to be actually written or read from */
    return _mm256_zextsi128_si256(chunk);
}

static inline halfchunk_t GET_HALFCHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
    lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
    __m128i perm_vec, ret_vec;
    __msan_unpoison(buf + dist, 16 - dist);
    ret_vec = _mm_loadu_si128((__m128i*)buf);
    *chunk_rem = half_rem_vals[dist - 3];

    perm_vec = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
    ret_vec = _mm_shuffle_epi8(ret_vec, perm_vec);

    return ret_vec;
}

#define CHUNKSIZE chunksize_avx2
#define CHUNKCOPY chunkcopy_avx2
#define CHUNKUNROLL chunkunroll_avx2
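GET_HALFCHUNK_MAG mirrors the existing 32-byte GET_CHUNK_MAG, but since it only has to fill a 128-bit register, a single _mm_shuffle_epi8 against the permute table suffices and no cross-lane blend is needed. Conceptually (a hedged scalar sketch inferred from the permute and remainder tables, not the intrinsics above), it produces a 16-byte "magazine" holding the dist-byte pattern repeated, plus the count of trailing bytes belonging to a partial repeat; the half_rem_vals entries are consistent with that remainder being 16 % dist for dist in 3..15.

#include <stdint.h>

/* Illustrative scalar model of what GET_HALFCHUNK_MAG produces (an assumption
 * drawn from the tables above, not the literal SIMD code): fill 16 bytes with
 * the dist-byte pattern and report how many bytes at the end are a partial
 * repeat. Valid for 3 <= dist <= 15, the range covered by half_rem_vals. */
static void halfchunk_mag_scalar_ref(const uint8_t *buf, uint32_t dist,
                                     uint8_t mag[16], uint32_t *chunk_rem) {
    for (uint32_t i = 0; i < 16; i++)
        mag[i] = buf[i % dist];   /* repeat the pattern across the register */
    *chunk_rem = 16 % dist;       /* matches half_rem_vals[dist - 3] above */
}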
@@ -9,8 +9,6 @@

typedef __m128i chunk_t;

#define CHUNK_SIZE 16

#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
@@ -10,8 +10,6 @@

typedef __m128i chunk_t;

#define CHUNK_SIZE 16

#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
@@ -5,10 +5,6 @@
#include "zbuild.h"
#include <stdlib.h>

#if CHUNK_SIZE == 32 && defined(X86_SSSE3)
extern uint8_t* chunkmemset_ssse3(uint8_t *out, unsigned dist, unsigned len);
#endif

/* Returns the chunk size */
Z_INTERNAL uint32_t CHUNKSIZE(void) {
    return sizeof(chunk_t);
@@ -91,20 +87,37 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t
}
#endif

#ifdef HAVE_HALF_CHUNK
static inline uint8_t* HALFCHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
    halfchunk_t chunk;
    int32_t align = ((len - 1) % sizeof(halfchunk_t)) + 1;
    loadhalfchunk(from, &chunk);
    storehalfchunk(out, &chunk);
    out += align;
    from += align;
    len -= align;
    while (len > 0) {
        loadhalfchunk(from, &chunk);
        storehalfchunk(out, &chunk);
        out += sizeof(halfchunk_t);
        from += sizeof(halfchunk_t);
        len -= sizeof(halfchunk_t);
    }
    return out;
}
#endif

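The align = ((len - 1) % sizeof(halfchunk_t)) + 1 expression in HALFCHUNKCOPY peels the odd remainder with the first store: the first 16-byte store always writes a full register, but the pointers advance only by that remainder, so everything left afterwards is an exact multiple of 16 bytes. A small worked example of the arithmetic (hypothetical numbers, for illustration only):

#include <stdio.h>

/* Worked example of the remainder-peeling arithmetic used by HALFCHUNKCOPY
 * (hypothetical len, illustration only). */
int main(void) {
    unsigned len = 23;                          /* bytes to copy */
    unsigned half = 16;                         /* sizeof(halfchunk_t) */
    unsigned align = ((len - 1) % half) + 1;    /* 7: advance after the first store */
    /* The first store writes bytes [0, 16), the pointers advance by 7, leaving 16
     * bytes, which the loop finishes with one more full-width store at offset 7. */
    printf("align = %u, remaining = %u (a multiple of %u)\n", align, len - align, half);
    return 0;
}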
/* Copy DIST bytes from OUT - DIST into OUT + DIST * k, for 0 <= k < LEN/DIST.
   Return OUT + LEN. */
Z_INTERNAL uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) {
static inline uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) {
    /* Debug performance related issues when len < sizeof(uint64_t):
       Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */
    Assert(dist > 0, "chunkmemset cannot have a distance 0");
    /* Only AVX2 */
#if CHUNK_SIZE == 32 && defined(X86_SSSE3)
    if (len <= 16) {
        return chunkmemset_ssse3(out, dist, len);
    }
#endif

    uint8_t *from = out - dist;
    chunk_t chunk_load;
    uint32_t chunk_mod = 0;
    uint32_t adv_amount;

    if (dist == 1) {
        memset(out, *from, len);
@@ -113,10 +126,45 @@ Z_INTERNAL uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) {
        return CHUNKCOPY(out, out - dist, len);
    }

    chunk_t chunk_load;
    uint32_t chunk_mod = 0;
    /* Only AVX2 as there's 128 bit vectors and 256 bit. We allow for shorter vector
     * lengths because they serve to allow more cases to fall into chunkcopy, as the
     * distance of the shorter length is still deemed a safe distance. We rewrite this
     * here rather than calling the ssse3 variant directly now because doing so required
     * dispatching to another function and broke inlining for this function entirely. We
     * also can merge an assert and some remainder peeling behavior into the same code blocks,
     * making the code a little smaller. */
#ifdef HAVE_HALF_CHUNK
    if (len <= sizeof(halfchunk_t)) {
        if (dist > sizeof(halfchunk_t)) {
            return HALFCHUNKCOPY(out, out - dist, len);
        }

        halfchunk_t halfchunk_load;

        if (dist == 2) {
            halfchunkmemset_2(from, &halfchunk_load);
        } else if (dist == 4) {
            halfchunkmemset_4(from, &halfchunk_load);
        } else if (dist == 8) {
            halfchunkmemset_8(from, &halfchunk_load);
        } else if (dist == 16) {
            loadhalfchunk(from, &halfchunk_load);
        } else {
            halfchunk_load = GET_HALFCHUNK_MAG(from, &chunk_mod, dist);
        }

        adv_amount = sizeof(halfchunk_t) - chunk_mod;
        while (len >= sizeof(halfchunk_t)) {
            storehalfchunk(out, &halfchunk_load);
            len -= adv_amount;
            out += adv_amount;
        }

        chunk_load = halfchunk2whole(halfchunk_load);
        goto rem_bytes;
    }
#endif

    /* TODO: possibly build up a permutation table for this if not an even modulus */
#ifdef HAVE_CHUNKMEMSET_2
    if (dist == 2) {
        chunkmemset_2(from, &chunk_load);
@@ -143,28 +191,28 @@ Z_INTERNAL uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) {
        chunk_load = GET_CHUNK_MAG(from, &chunk_mod, dist);
    }

    /* If we're lucky enough and dist happens to be an even modulus of our vector length,
     * we can do two stores per loop iteration, which for most ISAs, especially x86, is beneficial */
    if (chunk_mod == 0) {
        while (len >= (2 * sizeof(chunk_t))) {
            storechunk(out, &chunk_load);
            storechunk(out + sizeof(chunk_t), &chunk_load);
            out += 2 * sizeof(chunk_t);
            len -= 2 * sizeof(chunk_t);
        }
    adv_amount = sizeof(chunk_t) - chunk_mod;

    while (len >= (2 * sizeof(chunk_t))) {
        storechunk(out, &chunk_load);
        storechunk(out + adv_amount, &chunk_load);
        out += 2 * adv_amount;
        len -= 2 * adv_amount;
    }

    /* If we don't have a "dist" length that divides evenly into a vector
     * register, we can write the whole vector register but we need only
     * advance by the amount of the whole string that fits in our chunk_t.
     * If we do divide evenly into the vector length, adv_amount = chunk_t size */
    uint32_t adv_amount = sizeof(chunk_t) - chunk_mod;
    while (len >= sizeof(chunk_t)) {
        storechunk(out, &chunk_load);
        len -= adv_amount;
        out += adv_amount;
    }

#ifdef HAVE_HALF_CHUNK
rem_bytes:
#endif
    if (len) {
        memcpy(out, &chunk_load, len);
        out += len;
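The main store loop above is the piece @dougallj's suggestion targets: the register is always written in full, but when chunk_mod is non-zero the output pointer only advances by adv_amount = sizeof(chunk_t) - chunk_mod, so the replicated pattern stays in phase, and issuing two of those overlapping stores per iteration lets store-to-load forwarding absorb the overlap. A condensed model of that control flow (a sketch using memcpy in place of storechunk, not the template code itself):

#include <stdint.h>
#include <string.h>

#define CHUNK 32   /* stand-in for sizeof(chunk_t) in the AVX2 build */

/* Sketch of the double-store loop shape (illustrative only). chunk_load is the
 * pre-built "magazine" of the repeated pattern with chunk_mod partial bytes at
 * its end (0 <= chunk_mod < CHUNK); callers must guarantee, as the real
 * CHUNKMEMSET does, that writing a little past `len` is allowed. */
static uint8_t *double_store_sketch(uint8_t *out, const uint8_t chunk_load[CHUNK],
                                    uint32_t chunk_mod, unsigned len) {
    uint32_t adv_amount = CHUNK - chunk_mod;
    while (len >= 2 * CHUNK) {
        memcpy(out, chunk_load, CHUNK);              /* full-width store */
        memcpy(out + adv_amount, chunk_load, CHUNK); /* overlapping second store */
        out += 2 * adv_amount;
        len -= 2 * adv_amount;
    }
    while (len >= CHUNK) {                           /* single-store tail */
        memcpy(out, chunk_load, CHUNK);
        out += adv_amount;
        len -= adv_amount;
    }
    return out;                                      /* < CHUNK bytes left for the memcpy tail */
}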