Try to simplify the inflate loop by collapsing most cases to chunksets

This commit is contained in:
Adam Stylinski 2024-09-23 18:26:04 -04:00 committed by Hans Kristian Rosbach
parent e874b34e1a
commit 94aacd8bd6
12 changed files with 92 additions and 85 deletions

View File

@ -8,7 +8,7 @@
#ifdef ARM_NEON #ifdef ARM_NEON
uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len); uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t chunksize_neon(void); uint32_t chunksize_neon(void);
uint8_t* chunkmemset_safe_neon(uint8_t *out, unsigned dist, unsigned len, unsigned left); uint8_t* chunkmemset_safe_neon(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
# ifdef HAVE_BUILTIN_CTZLL # ifdef HAVE_BUILTIN_CTZLL
uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1); uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1);

View File

@ -22,7 +22,7 @@ typedef uint32_t (*crc32_func)(uint32_t crc32, const uint8_t *buf, size_t len);
uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len); uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t chunksize_c(void); uint32_t chunksize_c(void);
uint8_t* chunkmemset_safe_c(uint8_t *out, unsigned dist, unsigned len, unsigned left); uint8_t* chunkmemset_safe_c(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
void inflate_fast_c(PREFIX3(stream) *strm, uint32_t start); void inflate_fast_c(PREFIX3(stream) *strm, uint32_t start);
uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len); uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len);

View File

@ -15,7 +15,7 @@ void slide_hash_vmx(deflate_state *s);
#ifdef POWER8_VSX #ifdef POWER8_VSX
uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len); uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t chunksize_power8(void); uint32_t chunksize_power8(void);
uint8_t* chunkmemset_safe_power8(uint8_t *out, unsigned dist, unsigned len, unsigned left); uint8_t* chunkmemset_safe_power8(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
uint32_t crc32_power8(uint32_t crc, const uint8_t *buf, size_t len); uint32_t crc32_power8(uint32_t crc, const uint8_t *buf, size_t len);
void slide_hash_power8(deflate_state *s); void slide_hash_power8(deflate_state *s);
void inflate_fast_power8(PREFIX3(stream) *strm, uint32_t start); void inflate_fast_power8(PREFIX3(stream) *strm, uint32_t start);

View File

@ -13,7 +13,7 @@
uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len); uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_fold_copy_rvv(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); uint32_t adler32_fold_copy_rvv(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
uint32_t chunksize_rvv(void); uint32_t chunksize_rvv(void);
uint8_t* chunkmemset_safe_rvv(uint8_t *out, unsigned dist, unsigned len, unsigned left); uint8_t* chunkmemset_safe_rvv(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
uint32_t compare256_rvv(const uint8_t *src0, const uint8_t *src1); uint32_t compare256_rvv(const uint8_t *src0, const uint8_t *src1);
uint32_t longest_match_rvv(deflate_state *const s, Pos cur_match); uint32_t longest_match_rvv(deflate_state *const s, Pos cur_match);

View File

@ -15,6 +15,7 @@ typedef __m128i halfchunk_t;
#define HAVE_CHUNKMEMSET_4 #define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8 #define HAVE_CHUNKMEMSET_8
#define HAVE_CHUNKMEMSET_16 #define HAVE_CHUNKMEMSET_16
#define HAVE_CHUNKMEMSET_1
#define HAVE_CHUNK_MAG #define HAVE_CHUNK_MAG
#define HAVE_HALF_CHUNK #define HAVE_HALF_CHUNK
@ -125,24 +126,6 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t
return ret_vec; return ret_vec;
} }
static inline void halfchunkmemset_2(uint8_t *from, halfchunk_t *chunk) {
int16_t tmp;
memcpy(&tmp, from, sizeof(tmp));
*chunk = _mm_set1_epi16(tmp);
}
static inline void halfchunkmemset_4(uint8_t *from, halfchunk_t *chunk) {
int32_t tmp;
memcpy(&tmp, from, sizeof(tmp));
*chunk = _mm_set1_epi32(tmp);
}
static inline void halfchunkmemset_8(uint8_t *from, halfchunk_t *chunk) {
int64_t tmp;
memcpy(&tmp, from, sizeof(tmp));
*chunk = _mm_set1_epi64x(tmp);
}
static inline void loadhalfchunk(uint8_t const *s, halfchunk_t *chunk) { static inline void loadhalfchunk(uint8_t const *s, halfchunk_t *chunk) {
*chunk = _mm_loadu_si128((__m128i *)s); *chunk = _mm_loadu_si128((__m128i *)s);
} }
@ -151,10 +134,10 @@ static inline void storehalfchunk(uint8_t *out, halfchunk_t *chunk) {
_mm_storeu_si128((__m128i *)out, *chunk); _mm_storeu_si128((__m128i *)out, *chunk);
} }
static inline chunk_t halfchunk2whole(halfchunk_t chunk) { static inline chunk_t halfchunk2whole(halfchunk_t *chunk) {
/* We zero extend mostly to appease some memory sanitizers. These bytes are ultimately /* We zero extend mostly to appease some memory sanitizers. These bytes are ultimately
* unlikely to be actually written or read from */ * unlikely to be actually written or read from */
return _mm256_zextsi128_si256(chunk); return _mm256_zextsi128_si256(*chunk);
} }
static inline halfchunk_t GET_HALFCHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) { static inline halfchunk_t GET_HALFCHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {

View File

@ -8,7 +8,7 @@
#ifdef X86_SSE2 #ifdef X86_SSE2
uint32_t chunksize_sse2(void); uint32_t chunksize_sse2(void);
uint8_t* chunkmemset_safe_sse2(uint8_t *out, unsigned dist, unsigned len, unsigned left); uint8_t* chunkmemset_safe_sse2(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
# ifdef HAVE_BUILTIN_CTZ # ifdef HAVE_BUILTIN_CTZ
uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1); uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1);
@ -21,7 +21,7 @@ uint8_t* chunkmemset_safe_sse2(uint8_t *out, unsigned dist, unsigned len, unsign
#ifdef X86_SSSE3 #ifdef X86_SSSE3
uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len); uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len);
uint8_t* chunkmemset_safe_ssse3(uint8_t *out, unsigned dist, unsigned len, unsigned left); uint8_t* chunkmemset_safe_ssse3(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
void inflate_fast_ssse3(PREFIX3(stream) *strm, uint32_t start); void inflate_fast_ssse3(PREFIX3(stream) *strm, uint32_t start);
#endif #endif
@ -33,7 +33,7 @@ uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *sr
uint32_t adler32_avx2(uint32_t adler, const uint8_t *buf, size_t len); uint32_t adler32_avx2(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
uint32_t chunksize_avx2(void); uint32_t chunksize_avx2(void);
uint8_t* chunkmemset_safe_avx2(uint8_t *out, unsigned dist, unsigned len, unsigned left); uint8_t* chunkmemset_safe_avx2(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
# ifdef HAVE_BUILTIN_CTZ # ifdef HAVE_BUILTIN_CTZ
uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1); uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1);

View File

@ -4,6 +4,7 @@
#include "zbuild.h" #include "zbuild.h"
#include <stdlib.h> #include <stdlib.h>
#include <stdio.h>
/* Returns the chunk size */ /* Returns the chunk size */
Z_INTERNAL uint32_t CHUNKSIZE(void) { Z_INTERNAL uint32_t CHUNKSIZE(void) {
@ -69,18 +70,18 @@ static inline uint8_t* CHUNKUNROLL(uint8_t *out, unsigned *dist, unsigned *len)
static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) { static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
/* This code takes string of length dist from "from" and repeats /* This code takes string of length dist from "from" and repeats
* it for as many times as can fit in a chunk_t (vector register) */ * it for as many times as can fit in a chunk_t (vector register) */
uint32_t cpy_dist; uint64_t cpy_dist;
uint32_t bytes_remaining = sizeof(chunk_t); uint64_t bytes_remaining = sizeof(chunk_t);
chunk_t chunk_load; chunk_t chunk_load;
uint8_t *cur_chunk = (uint8_t *)&chunk_load; uint8_t *cur_chunk = (uint8_t *)&chunk_load;
while (bytes_remaining) { while (bytes_remaining) {
cpy_dist = MIN(dist, bytes_remaining); cpy_dist = MIN(dist, bytes_remaining);
memcpy(cur_chunk, buf, cpy_dist); memcpy(cur_chunk, buf, (size_t)cpy_dist);
bytes_remaining -= cpy_dist; bytes_remaining -= cpy_dist;
cur_chunk += cpy_dist; cur_chunk += cpy_dist;
/* This allows us to bypass an expensive integer division since we're effectively /* This allows us to bypass an expensive integer division since we're effectively
* counting in this loop, anyway */ * counting in this loop, anyway */
*chunk_rem = cpy_dist; *chunk_rem = (uint32_t)cpy_dist;
} }
return chunk_load; return chunk_load;
@ -109,21 +110,33 @@ static inline uint8_t* HALFCHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned
/* Copy DIST bytes from OUT - DIST into OUT + DIST * k, for 0 <= k < LEN/DIST. /* Copy DIST bytes from OUT - DIST into OUT + DIST * k, for 0 <= k < LEN/DIST.
Return OUT + LEN. */ Return OUT + LEN. */
static inline uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) { static inline uint8_t* CHUNKMEMSET(uint8_t *out, uint8_t *from, unsigned len) {
/* Debug performance related issues when len < sizeof(uint64_t): /* Debug performance related issues when len < sizeof(uint64_t):
Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */ Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */
Assert(dist > 0, "chunkmemset cannot have a distance 0"); Assert(from != out, "chunkmemset cannot have a distance 0");
uint8_t *from = out - dist;
chunk_t chunk_load; chunk_t chunk_load;
uint32_t chunk_mod = 0; uint32_t chunk_mod = 0;
uint32_t adv_amount; uint32_t adv_amount;
int64_t sdist = out - from;
uint64_t dist = llabs(sdist);
/* We are supporting the case for when we are reading bytes from ahead in the buffer.
* We now have to handle this, though it wasn't _quite_ clear if this rare circumstance
* always needed to be handled here or if we're just now seeing it because we are
* dispatching to this function, more */
if (sdist < 0 && dist < len) {
/* Here the memmove semantics match perfectly, as when this happens we are
* effectively sliding down the contents of memory by dist bytes */
memmove(out, from, len);
return out + len;
}
if (dist == 1) { if (dist == 1) {
memset(out, *from, len); memset(out, *from, len);
return out + len; return out + len;
} else if (dist > sizeof(chunk_t)) { } else if (dist >= sizeof(chunk_t)) {
return CHUNKCOPY(out, out - dist, len); return CHUNKCOPY(out, from, len);
} }
/* Only AVX2 as there's 128 bit vectors and 256 bit. We allow for shorter vector /* Only AVX2 as there's 128 bit vectors and 256 bit. We allow for shorter vector
@ -135,33 +148,22 @@ static inline uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) {
* making the code a little smaller. */ * making the code a little smaller. */
#ifdef HAVE_HALF_CHUNK #ifdef HAVE_HALF_CHUNK
if (len <= sizeof(halfchunk_t)) { if (len <= sizeof(halfchunk_t)) {
if (dist > sizeof(halfchunk_t)) { if (dist >= sizeof(halfchunk_t))
return HALFCHUNKCOPY(out, out - dist, len); return HALFCHUNKCOPY(out, from, len);
if ((dist % 2) != 0 || dist == 6) {
halfchunk_t halfchunk_load = GET_HALFCHUNK_MAG(from, &chunk_mod, (unsigned)dist);
adv_amount = sizeof(halfchunk_t) - chunk_mod;
if (len == sizeof(halfchunk_t)) {
storehalfchunk(out, &halfchunk_load);
len -= adv_amount;
out += adv_amount;
}
chunk_load = halfchunk2whole(&halfchunk_load);
goto rem_bytes;
} }
halfchunk_t halfchunk_load;
if (dist == 2) {
halfchunkmemset_2(from, &halfchunk_load);
} else if (dist == 4) {
halfchunkmemset_4(from, &halfchunk_load);
} else if (dist == 8) {
halfchunkmemset_8(from, &halfchunk_load);
} else if (dist == 16) {
loadhalfchunk(from, &halfchunk_load);
} else {
halfchunk_load = GET_HALFCHUNK_MAG(from, &chunk_mod, dist);
}
adv_amount = sizeof(halfchunk_t) - chunk_mod;
while (len >= sizeof(halfchunk_t)) {
storehalfchunk(out, &halfchunk_load);
len -= adv_amount;
out += adv_amount;
}
chunk_load = halfchunk2whole(halfchunk_load);
goto rem_bytes;
} }
#endif #endif
@ -185,11 +187,7 @@ static inline uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) {
chunkmemset_16(from, &chunk_load); chunkmemset_16(from, &chunk_load);
} else } else
#endif #endif
if (dist == sizeof(chunk_t)) { chunk_load = GET_CHUNK_MAG(from, &chunk_mod, (unsigned)dist);
loadchunk(from, &chunk_load);
} else {
chunk_load = GET_CHUNK_MAG(from, &chunk_mod, dist);
}
adv_amount = sizeof(chunk_t) - chunk_mod; adv_amount = sizeof(chunk_t) - chunk_mod;
@ -221,7 +219,7 @@ rem_bytes:
return out; return out;
} }
Z_INTERNAL uint8_t* CHUNKMEMSET_SAFE(uint8_t *out, unsigned dist, unsigned len, unsigned left) { Z_INTERNAL uint8_t* CHUNKMEMSET_SAFE(uint8_t *out, uint8_t *from, unsigned len, unsigned left) {
#if !defined(UNALIGNED64_OK) #if !defined(UNALIGNED64_OK)
# if !defined(UNALIGNED_OK) # if !defined(UNALIGNED_OK)
static const uint32_t align_mask = 7; static const uint32_t align_mask = 7;
@ -231,7 +229,7 @@ Z_INTERNAL uint8_t* CHUNKMEMSET_SAFE(uint8_t *out, unsigned dist, unsigned len,
#endif #endif
len = MIN(len, left); len = MIN(len, left);
uint8_t *from = out - dist;
#if !defined(UNALIGNED64_OK) #if !defined(UNALIGNED64_OK)
while (((uintptr_t)out & align_mask) && (len > 0)) { while (((uintptr_t)out & align_mask) && (len > 0)) {
*out++ = *from++; *out++ = *from++;
@ -239,15 +237,37 @@ Z_INTERNAL uint8_t* CHUNKMEMSET_SAFE(uint8_t *out, unsigned dist, unsigned len,
--left; --left;
} }
#endif #endif
if (left < (unsigned)(3 * sizeof(chunk_t))) { if (UNLIKELY(left < sizeof(chunk_t))) {
while (len > 0) { while (len > 0) {
*out++ = *from++; *out++ = *from++;
--len; --len;
} }
return out; return out;
} }
if (len) if (len)
return CHUNKMEMSET(out, dist, len); out = CHUNKMEMSET(out, from, len);
return out; return out;
} }
static inline uint8_t *CHUNKCOPY_SAFE(uint8_t *out, uint8_t *from, unsigned len, uint8_t *safe)
{
if (out == from)
return out + len;
uint64_t safelen = (safe - out);
len = MIN(len, (unsigned)safelen);
uint64_t from_dist = (uint64_t)llabs(safe - from);
if (UNLIKELY(from_dist < sizeof(chunk_t) || safelen < sizeof(chunk_t))) {
while (len--) {
*out++ = *from++;
}
return out;
}
return CHUNKMEMSET(out, from, len);
}

View File

@ -273,9 +273,9 @@ static uint32_t adler32_fold_copy_stub(uint32_t adler, uint8_t* dst, const uint8
return functable.adler32_fold_copy(adler, dst, src, len); return functable.adler32_fold_copy(adler, dst, src, len);
} }
static uint8_t* chunkmemset_safe_stub(uint8_t* out, unsigned dist, unsigned len, unsigned left) { static uint8_t* chunkmemset_safe_stub(uint8_t* out, uint8_t *from, unsigned len, unsigned left) {
init_functable(); init_functable();
return functable.chunkmemset_safe(out, dist, len, left); return functable.chunkmemset_safe(out, from, len, left);
} }
static uint32_t chunksize_stub(void) { static uint32_t chunksize_stub(void) {

View File

@ -27,7 +27,7 @@ struct functable_s {
void (* force_init) (void); void (* force_init) (void);
uint32_t (* adler32) (uint32_t adler, const uint8_t *buf, size_t len); uint32_t (* adler32) (uint32_t adler, const uint8_t *buf, size_t len);
uint32_t (* adler32_fold_copy) (uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); uint32_t (* adler32_fold_copy) (uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
uint8_t* (* chunkmemset_safe) (uint8_t *out, unsigned dist, unsigned len, unsigned left); uint8_t* (* chunkmemset_safe) (uint8_t *out, uint8_t *from, unsigned len, unsigned left);
uint32_t (* chunksize) (void); uint32_t (* chunksize) (void);
uint32_t (* compare256) (const uint8_t *src0, const uint8_t *src1); uint32_t (* compare256) (const uint8_t *src0, const uint8_t *src1);
uint32_t (* crc32) (uint32_t crc, const uint8_t *buf, size_t len); uint32_t (* crc32) (uint32_t crc, const uint8_t *buf, size_t len);

View File

@ -235,7 +235,7 @@ void Z_INTERNAL INFLATE_FAST(PREFIX3(stream) *strm, uint32_t start) {
from += wsize - op; from += wsize - op;
if (op < len) { /* some from end of window */ if (op < len) { /* some from end of window */
len -= op; len -= op;
out = chunkcopy_safe(out, from, op, safe); out = CHUNKCOPY_SAFE(out, from, op, safe);
from = window; /* more from start of window */ from = window; /* more from start of window */
op = wnext; op = wnext;
/* This (rare) case can create a situation where /* This (rare) case can create a situation where
@ -245,19 +245,23 @@ void Z_INTERNAL INFLATE_FAST(PREFIX3(stream) *strm, uint32_t start) {
} }
if (op < len) { /* still need some from output */ if (op < len) { /* still need some from output */
len -= op; len -= op;
out = chunkcopy_safe(out, from, op, safe); if (!extra_safe) {
if (!extra_safe) out = CHUNKCOPY_SAFE(out, from, op, safe);
out = CHUNKUNROLL(out, &dist, &len); out = CHUNKUNROLL(out, &dist, &len);
out = chunkcopy_safe(out, out - dist, len, safe); out = CHUNKCOPY_SAFE(out, out - dist, len, safe);
} else {
out = chunkcopy_safe(out, from, op, safe);
out = chunkcopy_safe(out, out - dist, len, safe);
}
} else { } else {
out = chunkcopy_safe(out, from, len, safe); if (!extra_safe)
out = CHUNKCOPY_SAFE(out, from, len, safe);
else
out = chunkcopy_safe(out, from, len, safe);
} }
} else if (extra_safe) { } else if (extra_safe) {
/* Whole reference is in range of current output. */ /* Whole reference is in range of current output. */
if (dist >= len || dist >= state->chunksize)
out = chunkcopy_safe(out, out - dist, len, safe); out = chunkcopy_safe(out, out - dist, len, safe);
else
out = CHUNKMEMSET_SAFE(out, dist, len, (unsigned)((safe - out)));
} else { } else {
/* Whole reference is in range of current output. No range checks are /* Whole reference is in range of current output. No range checks are
necessary because we start with room for at least 258 bytes of output, necessary because we start with room for at least 258 bytes of output,
@ -267,7 +271,7 @@ void Z_INTERNAL INFLATE_FAST(PREFIX3(stream) *strm, uint32_t start) {
if (dist >= len || dist >= state->chunksize) if (dist >= len || dist >= state->chunksize)
out = CHUNKCOPY(out, out - dist, len); out = CHUNKCOPY(out, out - dist, len);
else else
out = CHUNKMEMSET(out, dist, len); out = CHUNKMEMSET(out, out - dist, len);
} }
} else if ((op & 64) == 0) { /* 2nd level distance code */ } else if ((op & 64) == 0) { /* 2nd level distance code */
here = dcode + here->val + BITS(op); here = dcode + here->val + BITS(op);

View File

@ -1090,7 +1090,7 @@ int32_t Z_EXPORT PREFIX(inflate)(PREFIX3(stream) *strm, int32_t flush) {
} else { } else {
copy = MIN(state->length, left); copy = MIN(state->length, left);
put = FUNCTABLE_CALL(chunkmemset_safe)(put, state->offset, copy, left); put = FUNCTABLE_CALL(chunkmemset_safe)(put, put - state->offset, copy, left);
} }
left -= copy; left -= copy;
state->length -= copy; state->length -= copy;

View File

@ -150,7 +150,7 @@ static inline uint64_t load_64_bits(const unsigned char *in, unsigned bits) {
/* Behave like chunkcopy, but avoid writing beyond of legal output. */ /* Behave like chunkcopy, but avoid writing beyond of legal output. */
static inline uint8_t* chunkcopy_safe(uint8_t *out, uint8_t *from, uint64_t len, uint8_t *safe) { static inline uint8_t* chunkcopy_safe(uint8_t *out, uint8_t *from, uint64_t len, uint8_t *safe) {
uint64_t safelen = (safe - out) + 1; uint64_t safelen = safe - out;
len = MIN(len, safelen); len = MIN(len, safelen);
int32_t olap_src = from >= out && from < out + len; int32_t olap_src = from >= out && from < out + len;
int32_t olap_dst = out >= from && out < from + len; int32_t olap_dst = out >= from && out < from + len;