Use ternary logic to xor 3 operands for "fold16"

This strategy is borrowed from ISA-L in this commit:
c2bec3ea65

We can also use it in the "fold final" routine, but we'd have to take
some extra care to only use it on AVX-512-capable systems.
This commit is contained in:
Adam Stylinski 2023-04-29 11:33:05 -04:00 committed by Hans Kristian Rosbach
parent f346148df0
commit 9087c75f8d

View File

@ -40,8 +40,7 @@ static size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc3, 3);
z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_t0);
zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_t0, 0x96);
#ifdef COPY
_mm512_storeu_si512((__m512i *)dst, zmm_t0);
@ -70,15 +69,10 @@ static size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
zmm_crc2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x10);
zmm_crc3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x10);
zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
zmm_crc1 = _mm512_xor_si512(z1, zmm_crc1);
zmm_crc2 = _mm512_xor_si512(z2, zmm_crc2);
zmm_crc3 = _mm512_xor_si512(z3, zmm_crc3);
zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_t0);
zmm_crc1 = _mm512_xor_si512(zmm_crc1, zmm_t1);
zmm_crc2 = _mm512_xor_si512(zmm_crc2, zmm_t2);
zmm_crc3 = _mm512_xor_si512(zmm_crc3, zmm_t3);
zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_t0, 0x96);
zmm_crc1 = _mm512_ternarylogic_epi32(zmm_crc1, z1, zmm_t1, 0x96);
zmm_crc2 = _mm512_ternarylogic_epi32(zmm_crc2, z2, zmm_t2, 0x96);
zmm_crc3 = _mm512_ternarylogic_epi32(zmm_crc3, z3, zmm_t3, 0x96);
#ifdef COPY
_mm512_storeu_si512((__m512i *)dst, zmm_t0);
@ -93,18 +87,15 @@ static size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
// zmm_crc[0,1,2,3] -> zmm_crc0
z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc1);
zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_crc1, 0x96);
z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc2);
zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_crc2, 0x96);
z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc3);
zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_crc3, 0x96);
// zmm_crc0 -> xmm_crc[0, 1, 2, 3]
*xmm_crc0 = _mm512_extracti32x4_epi32(zmm_crc0, 0);