zlib-ng/arch/x86/avx2_tables.h
Adam Stylinski 0ed5ac8289 Make an AVX512 inflate fast with low cost masked writes
This takes advantage of the fact that on AVX512 architectures, masked
moves are incredibly cheap. There are many places where we have to
fall back to the safe C implementation of chunkcopy_safe because of the
assumed overwriting that occurs. We're able to sidestep most of the branching
needed here by simply controlling the bounds of our writes with a mask.
2024-11-20 22:14:44 +01:00

45 lines
1.5 KiB
C

#ifndef _AVX2_TABLES_H
#define _AVX2_TABLES_H
#include "../generic/chunk_permute_table.h"
/* Direct-lookup table addressed by (dist - 3). Because dist can never be 0, 1 or 2, the input is offset by 3
 * before indexing, and the "don't care" slots (dist 4, 8 and 16) are populated only so that no further
 * remapping is needed. The first member of each pair is written as a multiple of 32, plus a multiple of 16
 * for dist >= 17, and provides the indirection into the permute table.
 * NOTE(review): the second member is presumably a remainder value (per the type name lut_rem_pair) - confirm
 * against chunk_permute_table.h. */
static const lut_rem_pair perm_idx_lut[29] = {
    /* dist 3 .. 16: one 32-wide step per populated entry */
    { 0 * 32,  2},              /* dist  3 */
    { 0 * 32,  0},              /* dist  4: don't care */
    { 1 * 32,  2},              /* dist  5 */
    { 2 * 32,  2},              /* dist  6 */
    { 3 * 32,  4},              /* dist  7 */
    { 0 * 32,  0},              /* dist  8: don't care */
    { 4 * 32,  5},              /* dist  9 */
    { 5 * 32, 22},              /* dist 10 */
    { 6 * 32, 21},              /* dist 11 */
    { 7 * 32, 20},              /* dist 12 */
    { 8 * 32,  6},              /* dist 13 */
    { 9 * 32,  4},              /* dist 14 */
    {10 * 32,  2},              /* dist 15 */
    { 0 * 32,  0},              /* dist 16: don't care */

    /* dist 17 .. 31: fixed base of 11 * 32, advancing in 16-wide steps */
    {11 * 32 + 16 *  0, 15},    /* dist 17 */
    {11 * 32 + 16 *  1, 14},    /* dist 18 */
    {11 * 32 + 16 *  2, 13},    /* dist 19 */
    {11 * 32 + 16 *  3, 12},    /* dist 20 */
    {11 * 32 + 16 *  4, 11},    /* dist 21 */
    {11 * 32 + 16 *  5, 10},    /* dist 22 */
    {11 * 32 + 16 *  6,  9},    /* dist 23 */
    {11 * 32 + 16 *  7,  8},    /* dist 24 */
    {11 * 32 + 16 *  8,  7},    /* dist 25 */
    {11 * 32 + 16 *  9,  6},    /* dist 26 */
    {11 * 32 + 16 * 10,  5},    /* dist 27 */
    {11 * 32 + 16 * 11,  4},    /* dist 28 */
    {11 * 32 + 16 * 12,  3},    /* dist 29 */
    {11 * 32 + 16 * 13,  2},    /* dist 30 */
    {11 * 32 + 16 * 14,  1}     /* dist 31 */
};
/* Thirteen 16-bit entries; entry i holds 16 % (i + 3), i.e. the remainder of 16 for the values 3 through 15.
 * NOTE(review): presumably indexed by (dist - 3) in the same way as perm_idx_lut - confirm against callers. */
static const uint16_t half_rem_vals[13] = {
    1, 0, 1, 4, 2,      /* values  3 ..  7 */
    0, 7, 6, 5, 4,      /* values  8 .. 12 */
    3, 2, 1             /* values 13 .. 15 */
};
#endif