Mirror of https://github.com/GerbilSoft/zlib-ng.git, synced 2025-06-18 19:45:37 -04:00

This is ~25-30% faster than the SSE2 variant on a Core 2 Quad. The main reason for this is that, while the SSSE3 version incurs far fewer shifts, it has to manage an entirely separate stack buffer that is the size of the L1 cache on most CPUs. That buffer was one of the main reasons the specialized 32k function was slower for the scalar counterpart despite auto-vectorizing: the auto-vectorized loop was setting up the stack buffer at unaligned offsets, which is detrimental to performance pre-Nehalem. Additionally, we were losing a fair bit of time to the zero initialization, which we now do more selectively. There are a ton of loads and stores happening, and we are almost certainly bound on the fill buffer and store forwarding.

An SSE2 version of this code is probably possible by simply replacing the shifts with unpacks against zero and the palignr instructions with shufpd, but I'm not sure it would be worth it. We gate on SSE4.1 not because we use any SSE4.1-specific instruction, but because SSE4.1 marks Wolfdale, where palignr became a lot faster.
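For illustration only, here is a rough sketch of the SSE2 substitution described above. It is not part of the patch, the helper names are made up, and it assumes the alignment offset is a multiple of 8 bytes, which is the case where palignr maps cleanly onto shufpd:

/* Hypothetical sketch: SSE2 stand-ins for the SSSE3 idioms mentioned in the
 * commit message. Helper names are illustrative only. */
#include <emmintrin.h>   /* SSE2 */
#include <tmmintrin.h>   /* SSSE3, reference version only (needs -mssse3) */

/* SSSE3 reference: concatenate hi:lo and shift right by 8 bytes. */
static inline __m128i align8_ssse3(__m128i hi, __m128i lo) {
    return _mm_alignr_epi8(hi, lo, 8);
}

/* SSE2 equivalent: shufpd picks the upper 64 bits of lo and the lower
 * 64 bits of hi, which is exactly palignr with an 8-byte offset. */
static inline __m128i align8_sse2(__m128i hi, __m128i lo) {
    return _mm_castpd_si128(
        _mm_shuffle_pd(_mm_castsi128_pd(lo), _mm_castsi128_pd(hi), 1));
}

/* Widening the low 8 bytes to 16-bit lanes with an unpack against zero
 * instead of a shift-based sequence. */
static inline __m128i widen_lo_u8_to_u16(__m128i x) {
    return _mm_unpacklo_epi8(x, _mm_setzero_si128());
}

For offsets that are not multiples of 8 bytes, SSE2 would instead have to emulate palignr with a pair of byte shifts plus an OR (_mm_srli_si128 on the low half, _mm_slli_si128 on the high half), which is part of why the payoff is questionable.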
/* x86_features.c - x86 feature check
 *
 * Copyright (C) 2013 Intel Corporation. All rights reserved.
 * Author:
 *  Jim Kukunas
 *
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"
#include "x86_features.h"

#if defined(HAVE_CPUID_MS)
#  include <intrin.h>
#elif defined(HAVE_CPUID_GNU)
// Newer versions of GCC and clang come with cpuid.h
#  include <cpuid.h>
#  ifdef X86_HAVE_XSAVE_INTRIN
#    if __GNUC__ == 8
#      include <xsaveintrin.h>
#    else
#      include <immintrin.h>
#    endif
#  endif
#endif

#include <string.h>
static inline void cpuid(int info, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) {
#if defined(HAVE_CPUID_MS)
    unsigned int registers[4];
    __cpuid((int *)registers, info);

    *eax = registers[0];
    *ebx = registers[1];
    *ecx = registers[2];
    *edx = registers[3];
#elif defined(HAVE_CPUID_GNU)
    *eax = *ebx = *ecx = *edx = 0;
    __cpuid(info, *eax, *ebx, *ecx, *edx);
#else
    /* When using this fallback, the faster SSE/AVX code is disabled */
    *eax = *ebx = *ecx = *edx = 0;
#endif
}
static inline void cpuidex(int info, int subinfo, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) {
#if defined(HAVE_CPUID_MS)
    unsigned int registers[4];
    __cpuidex((int *)registers, info, subinfo);

    *eax = registers[0];
    *ebx = registers[1];
    *ecx = registers[2];
    *edx = registers[3];
#elif defined(HAVE_CPUID_GNU)
    *eax = *ebx = *ecx = *edx = 0;
    __cpuid_count(info, subinfo, *eax, *ebx, *ecx, *edx);
#else
    /* When using this fallback, the faster SSE/AVX code is disabled */
    *eax = *ebx = *ecx = *edx = 0;
#endif
}
static inline uint64_t xgetbv(unsigned int xcr) {
#if defined(_MSC_VER) || defined(X86_HAVE_XSAVE_INTRIN)
    return _xgetbv(xcr);
#elif defined(__GNUC__)
    uint32_t eax, edx;
    __asm__ ( ".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c"(xcr));
    return (uint64_t)(edx) << 32 | eax;
#else
    /* When using this fallback, some of the faster code is disabled */
    return 0;
#endif
}
void Z_INTERNAL x86_check_features(struct x86_cpu_features *features) {
    unsigned eax, ebx, ecx, edx;
    unsigned maxbasic;

    cpuid(0, &maxbasic, &ebx, &ecx, &edx);
    cpuid(1 /*CPU_PROCINFO_AND_FEATUREBITS*/, &eax, &ebx, &ecx, &edx);

    features->has_sse2 = edx & 0x4000000;        /* EDX bit 26: SSE2 */
    features->has_ssse3 = ecx & 0x200;           /* ECX bit 9: SSSE3 */
    features->has_sse41 = ecx & 0x80000;         /* ECX bit 19: SSE4.1 */
    features->has_sse42 = ecx & 0x100000;        /* ECX bit 20: SSE4.2 */
    features->has_pclmulqdq = ecx & 0x2;         /* ECX bit 1: PCLMULQDQ */

    if (ecx & 0x08000000) {                      /* ECX bit 27: OSXSAVE */
        uint64_t xfeature = xgetbv(0);

        features->has_os_save_ymm = ((xfeature & 0x06) == 0x06);   /* XCR0: SSE + AVX state */
        features->has_os_save_zmm = ((xfeature & 0xe6) == 0xe6);   /* XCR0: SSE + AVX + AVX-512 state */
    }

    if (maxbasic >= 7) {
        cpuidex(7, 0, &eax, &ebx, &ecx, &edx);

        // check BMI1 bit
        // Reference: https://software.intel.com/sites/default/files/article/405250/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family.pdf
        features->has_vpclmulqdq = ecx & 0x400;  /* ECX bit 10: VPCLMULQDQ */

        // check AVX2 bit if the OS supports saving YMM registers
        if (features->has_os_save_ymm) {
            features->has_avx2 = ebx & 0x20;     /* EBX bit 5: AVX2 */
        }

        features->has_bmi2 = ebx & 0x8;

        // check AVX512 bits if the OS supports saving ZMM registers
        if (features->has_os_save_zmm) {
            features->has_avx512f = ebx & 0x00010000;        /* EBX bit 16: AVX-512F */
            if (features->has_avx512f) {
                // According to the Intel Software Developer's Manual, AVX512F must be enabled too in order to enable
                // AVX512(DQ,BW,VL).
                features->has_avx512dq = ebx & 0x00020000;   /* EBX bit 17: AVX-512DQ */
                features->has_avx512bw = ebx & 0x40000000;   /* EBX bit 30: AVX-512BW */
                features->has_avx512vl = ebx & 0x80000000;   /* EBX bit 31: AVX-512VL */
            }
            features->has_avx512_common = features->has_avx512f && features->has_avx512dq && features->has_avx512bw \
                                          && features->has_avx512vl && features->has_bmi2;
            features->has_avx512vnni = ecx & 0x800;          /* ECX bit 11: AVX-512 VNNI */
        }
    }
}
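For reference, a minimal, hypothetical caller might look like the following. It is not part of the file: x86_check_features() only sets flags on the paths it actually takes (for example, has_avx2 is left untouched when the OS does not save YMM state), so the caller zeroes the struct first, and the Z_INTERNAL visibility of the symbol is ignored here purely for illustration.

/* Hypothetical usage sketch, not part of x86_features.c. */
#include <stdio.h>
#include <string.h>
#include "x86_features.h"

int main(void) {
    struct x86_cpu_features cf;
    memset(&cf, 0, sizeof(cf));   /* clear flags the detection may not touch */

    x86_check_features(&cf);

    printf("sse2=%d ssse3=%d sse4.1=%d sse4.2=%d pclmulqdq=%d avx2=%d avx512(common)=%d\n",
           !!cf.has_sse2, !!cf.has_ssse3, !!cf.has_sse41, !!cf.has_sse42,
           !!cf.has_pclmulqdq, !!cf.has_avx2, !!cf.has_avx512_common);
    return 0;
}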