zlib-ng/fallback_builtins.h
Commit 43dbfd6709 by Adam Stylinski: Improved adler32 NEON performance by 30-47%
We unlocked some ILP by allowing for independent sums in the loop and
reducing these sums outside of the loop. Additionally, the multiplication
by 32 (now 64) is moved outside of this loop. Similar to the Chromium
implementation, this code does straight 8-bit -> 16-bit additions and defers
the fused multiply-accumulate until after the loop. However, by unrolling by
another factor of 2, the code is measurably faster. The code does fused
multiply-accumulates back to as many scratch registers as we have room for in
order to maximize ILP for the 16 integer FMAs that need to occur. The compiler
seems to order them such that the destination register is the same as the
previous instruction's, so perhaps it cannot actually overlap them, or maybe
the Cortex-A73's pipeline is reordering these instructions anyway.

On the Odroid-N2, the Cortex-A73 cores are ~30-44% faster on the adler32 benchmark,
and the Cortex-A53 cores are anywhere from 34-47% faster.
2022-02-24 16:00:51 +01:00
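
To make the commit's strategy concrete, here is a minimal scalar sketch of the
same idea, assuming 64-byte blocks. It is not the NEON code itself, and the
names (adler32_chunks, column_sum) are illustrative only:

#include <stdint.h>
#include <stddef.h>

#define ADLER_BASE 65521u

/* Process `blocks` consecutive 64-byte blocks. The caller must cap `blocks`
 * (as adler32 does with NMAX) so the unreduced sums cannot overflow. */
static void adler32_chunks(uint32_t *s1, uint32_t *s2,
                           const uint8_t *buf, size_t blocks) {
    uint32_t column_sum[64] = {0};  /* 64 independent accumulators -> ILP */
    while (blocks--) {
        /* Each block adds 64 * (s1 as it stood at the start of the block)
         * to s2; doing this once per block hoists the multiply out of the
         * byte loop. */
        *s2 += *s1 * 64;
        for (int j = 0; j < 64; j++) {
            *s1 += buf[j];
            column_sum[j] += buf[j];  /* plain additions in the hot loop */
        }
        buf += 64;
    }
    /* Deferred multiply-accumulate: each column's weight (64 - j) is applied
     * once here instead of per byte inside the loop. */
    for (int j = 0; j < 64; j++)
        *s2 += (64u - (uint32_t)j) * column_sum[j];
    *s1 %= ADLER_BASE;
    *s2 %= ADLER_BASE;
}

In the vector code the column accumulators live in NEON registers as widened
8-bit -> 16-bit sums, the loop is unrolled by another factor of 2, and the
final weighting uses widening multiply-accumulates against a table of taps
64..1 spread across as many scratch registers as are available.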


#ifndef FALLBACK_BUILTINS_H
#define FALLBACK_BUILTINS_H

#include <stdint.h>  /* uint32_t / uint64_t for the shims below */
#if defined(_MSC_VER) && !defined(__clang__)
#if defined(_M_IX86) || defined(_M_AMD64) || defined(_M_IA64) || defined(_M_ARM) || defined(_M_ARM64)
#include <intrin.h>
#ifdef X86_FEATURES
# include "arch/x86/x86_features.h"
#endif
/* This is not a general-purpose replacement for __builtin_ctz. The function
 * assumes value != 0; because of that assumption, trailing_zero is left
 * uninitialized and the return value of _BitScanForward is not checked.
 */
static __forceinline unsigned long __builtin_ctz(uint32_t value) {
#ifdef X86_FEATURES
    if (x86_cpu_has_tzcnt)
        return _tzcnt_u32(value);
#endif
    unsigned long trailing_zero;
    _BitScanForward(&trailing_zero, value);
    return trailing_zero;
}
#define HAVE_BUILTIN_CTZ
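/* Hedged usage sketch (caller code is hypothetical): like the GCC builtin,
 * the shim is only safe behind a nonzero guard, e.g.
 *
 *     if (mask != 0)
 *         first_set_bit = __builtin_ctz(mask);   // mask == 0x0C -> 2
 */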
#ifdef _M_AMD64
/* This is not a general-purpose replacement for __builtin_ctzll. The function
 * assumes value != 0; because of that assumption, trailing_zero is left
 * uninitialized and the return value of _BitScanForward64 is not checked.
 */
static __forceinline unsigned long long __builtin_ctzll(uint64_t value) {
#ifdef X86_FEATURES
    if (x86_cpu_has_tzcnt)
        return _tzcnt_u64(value);
#endif
    unsigned long trailing_zero;
    _BitScanForward64(&trailing_zero, value);
    return trailing_zero;
}
#define HAVE_BUILTIN_CTZLL
#endif // Microsoft AMD64
#endif // Microsoft AMD64/IA64/x86/ARM/ARM64 test
#endif // _MSC_VER & !clang
/* Unfortunately GCC did not provide the _mm256_zextsi128_si256 family of
 * zero-extension intrinsics until version 10 */
#ifdef __AVX2__
#include <immintrin.h>
#if (!defined(__clang__) && defined(__GNUC__) && __GNUC__ < 10)
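/* Why the inline asm works: any VEX-encoded 128-bit move such as vmovdqa
 * zeroes bits 128..255 of the destination ymm register, so routing the value
 * through a fresh register yields a guaranteed zero-extension, whereas
 * _mm256_castsi128_si256 alone leaves the upper lanes undefined. */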
static inline __m256i _mm256_zextsi128_si256(__m128i a) {
    __m128i r;
    __asm__ volatile ("vmovdqa %1,%0" : "=x" (r) : "x" (a));
    return _mm256_castsi128_si256(r);
}
#ifdef __AVX512F__
static inline __m512i _mm512_zextsi128_si512(__m128i a) {
    __m128i r;
    __asm__ volatile ("vmovdqa %1,%0" : "=x" (r) : "x" (a));
    return _mm512_castsi128_si512(r);
}
#endif // __AVX512F__
#endif // gcc version 10 test
#endif // __AVX2__
#if defined(ARM_NEON_ADLER32) && !defined(__aarch64__)
/* Compatibility shim for the _high family of functions */
#define vmull_high_u8(a, b) vmull_u8(vget_high_u8(a), vget_high_u8(b))
#define vmlal_high_u8(a, b, c) vmlal_u8(a, vget_high_u8(b), vget_high_u8(c))
#define vmlal_high_u16(a, b, c) vmlal_u16(a, vget_high_u16(b), vget_high_u16(c))
#define vaddw_high_u8(a, b) vaddw_u8(a, vget_high_u8(b))
#endif
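/* Hedged illustration: on AArch64, vmull_high_u8(a, b) widens and multiplies
 * the top eight u8 lanes in one instruction; the shims above reproduce that
 * on AArch32 by first splitting out the high half with vget_high_u8. */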
#ifdef ARM_NEON_SLIDEHASH
#define vqsubq_u16_x4_x1(out, a, b) do { \
    out.val[0] = vqsubq_u16(a.val[0], b); \
    out.val[1] = vqsubq_u16(a.val[1], b); \
    out.val[2] = vqsubq_u16(a.val[2], b); \
    out.val[3] = vqsubq_u16(a.val[3], b); \
} while (0)
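/* Hedged usage sketch (variable names are illustrative): slide_hash-style
 * code can age four vectors of hash heads per iteration, e.g.
 *
 *     uint16x8x4_t v = vld1q_u16_x4(table);
 *     vqsubq_u16_x4_x1(v, v, vdupq_n_u16(wsize));
 *     vst1q_u16_x4(table, v);
 *
 * vqsubq_u16 saturates at zero, so entries older than the window clamp to 0
 * instead of wrapping. */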
/* We have to check for the hard-float ABI on GCC/Clang, but not on MSVC
 * (we don't compile for the soft-float ABI on Windows)
 */
#if !defined(ARM_NEON_HASLD4) && (defined(__ARM_FP) || defined(_MSC_VER))
#ifdef _M_ARM64
# include <arm64_neon.h>
#else
# include <arm_neon.h>
#endif
static inline uint16x8x4_t vld1q_u16_x4(const uint16_t *a) {
    uint16x8x4_t ret = (uint16x8x4_t) {{
        vld1q_u16(a),
        vld1q_u16(a+8),
        vld1q_u16(a+16),
        vld1q_u16(a+24)}};
    return ret;
}
static inline uint8x16x4_t vld1q_u8_x4(const uint8_t *a) {
    uint8x16x4_t ret = (uint8x16x4_t) {{
        vld1q_u8(a),
        vld1q_u8(a+16),
        vld1q_u8(a+32),
        vld1q_u8(a+48)}};
    return ret;
}
static inline void vst1q_u16_x4(uint16_t *p, uint16x8x4_t a) {
    vst1q_u16(p, a.val[0]);
    vst1q_u16(p + 8, a.val[1]);
    vst1q_u16(p + 16, a.val[2]);
    vst1q_u16(p + 24, a.val[3]);
}
#endif // HASLD4 check and hard float
#endif // ARM_NEON_SLIDEHASH
#endif // include guard FALLBACK_BUILTINS_H