GimliDS/arm9/source/lzav.h
2025-04-25 14:34:33 -04:00

2325 lines
57 KiB
C++

/**
* @file lzav.h
*
* @version 4.17
*
* @brief Self-contained inclusion file for the "LZAV" in-memory data
* compression and decompression algorithms.
*
* The source code is written in ISO C99, with full C++ compliance enabled
* conditionally and automatically, if compiled with a C++ compiler.
*
* Description is available at https://github.com/avaneev/lzav
*
* E-mail: aleksey.vaneev@gmail.com or info@voxengo.com
*
* LICENSE:
*
* Copyright (c) 2023-2025 Aleksey Vaneev
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef LZAV_INCLUDED
#define LZAV_INCLUDED
#define LZAV_API_VER 0x107 ///< API version, unrelated to source code version.
#define LZAV_VER_STR "4.17" ///< LZAV source code version string.
/**
* @def LZAV_FMT_MIN
* @brief Minimal stream format id supported by the decompressor. A value of 2
* can be defined externally, to reduce decompressor's code size.
*/
#if !defined( LZAV_FMT_MIN )
#define LZAV_FMT_MIN 1
#endif // !defined( LZAV_FMT_MIN )
/**
* @def LZAV_NS_CUSTOM
* @brief If this macro is defined externally, all symbols will be placed
* into the namespace specified by the macro, and won't be exported to the
* global namespace. WARNING: if the defined value of the macro is empty, the
* symbols will be placed into the global namespace anyway.
*/
/**
* @def LZAV_NOEX
* @brief Macro that defines the "noexcept" function specifier for C++
* environment.
*/
/**
* @def LZAV_NULL
* @brief Macro that defines "nullptr" value, for C++ guidelines conformance.
*/
/**
* @def LZAV_NS
* @brief Macro that defines an actual implementation namespace in C++
* environment, with export of relevant symbols to the global namespace
* (if @ref LZAV_NS_CUSTOM is undefined).
*/
#if defined( __cplusplus )
#include <cstring>
#include <cstdlib>
#if __cplusplus >= 201103L
#include <cstdint>
#define LZAV_NOEX noexcept
#define LZAV_NULL nullptr
#else // __cplusplus >= 201103L
#include <stdint.h>
#define LZAV_NOEX throw()
#define LZAV_NULL NULL
#endif // __cplusplus >= 201103L
#if defined( LZAV_NS_CUSTOM )
#define LZAV_NS LZAV_NS_CUSTOM
#else // defined( LZAV_NS_CUSTOM )
#define LZAV_NS lzav
#endif // defined( LZAV_NS_CUSTOM )
#else // defined( __cplusplus )
#include <string.h>
#include <stdlib.h>
#include <stdint.h>
#define LZAV_NOEX
#define LZAV_NULL 0
#endif // defined( __cplusplus )
#if SIZE_MAX < 0xFFFFFFFFU
#error LZAV: the platform or the compiler has incompatible size_t type.
#endif // size_t check
/**
* @def LZAV_X86
* @brief Macro is defined if `x86` or `x86_64` platform was detected.
*/
#if defined( i386 ) || defined( __i386 ) || defined( __i386__ ) || \
defined( _X86_ ) || defined( __x86_64 ) || defined( __x86_64__ ) || \
defined( __amd64 ) || defined( __amd64__ ) || defined( _M_IX86 ) || \
( defined( _M_AMD64 ) && !defined( _M_ARM64EC ))
#define LZAV_X86
#endif // x86 platform check
/**
* @def LZAV_LITTLE_ENDIAN
* @brief Endianness definition macro, can be used as a logical constant.
*/
#if defined( __LITTLE_ENDIAN__ ) || defined( __LITTLE_ENDIAN ) || \
defined( _LITTLE_ENDIAN ) || ( defined( __BYTE_ORDER__ ) && \
__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ) || \
defined( LZAV_X86 ) || defined( _WIN32 ) || defined( _M_ARM ) || \
defined( _M_ARM64EC )
#define LZAV_LITTLE_ENDIAN 1
#elif defined( __BIG_ENDIAN__ ) || defined( __BIG_ENDIAN ) || \
defined( _BIG_ENDIAN ) || ( defined( __BYTE_ORDER__ ) && \
__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ ) || \
defined( __SYSC_ZARCH__ ) || defined( __zarch__ ) || \
defined( __s390x__ ) || defined( __sparc ) || defined( __sparc__ )
#define LZAV_LITTLE_ENDIAN 0
#else // defined( __BIG_ENDIAN__ )
#warning LZAV: cannot determine endianness, assuming little-endian.
#define LZAV_LITTLE_ENDIAN 1
#endif // defined( __BIG_ENDIAN__ )
/**
* @def LZAV_PTR32
* @brief Macro denotes that pointers are likely 32-bit (pointer overflow
* checks are required).
*/
#if SIZE_MAX <= 0xFFFFFFFFU && \
( !defined( UINTPTR_MAX ) || UINTPTR_MAX <= 0xFFFFFFFFU )
#define LZAV_PTR32
#endif // 32-bit pointers check
/**
* @def LZAV_ARCH64
* @brief Macro that denotes availability of 64-bit instructions.
*/
#if defined( __LP64__ ) || defined( _LP64 ) || !defined( LZAV_PTR32 ) || \
defined( __x86_64__ ) || defined( __aarch64__ ) || \
defined( _M_X64 ) || defined( _M_ARM64 )
#define LZAV_ARCH64
#endif // 64-bit availability check
/**
* @def LZAV_GCC_BUILTINS
* @brief Macro that denotes availability of GCC-style built-in functions.
*/
#if defined( __GNUC__ ) || defined( __clang__ ) || \
defined( __IBMC__ ) || defined( __IBMCPP__ ) || \
defined( __COMPCERT__ ) || ( defined( __INTEL_COMPILER ) && \
__INTEL_COMPILER >= 1300 && !defined( _MSC_VER ))
#define LZAV_GCC_BUILTINS
#endif // GCC built-ins check
/**
* @def LZAV_IEC32( x )
* @brief In-place endianness-correction macro, for singular 32-bit variables.
* @param x Value to correct in-place.
*/
#if LZAV_LITTLE_ENDIAN
#define LZAV_IEC32( x ) (void) 0
#else // LZAV_LITTLE_ENDIAN
#if defined( LZAV_GCC_BUILTINS )
#define LZAV_IEC32( x ) x = __builtin_bswap32( x )
#elif defined( _MSC_VER )
#define LZAV_IEC32( x ) x = _byteswap_ulong( x )
#else // defined( _MSC_VER )
#define LZAV_IEC32( x ) x = (uint32_t) ( \
x >> 24 | \
( x & 0x00FF0000 ) >> 8 | \
( x & 0x0000FF00 ) << 8 | \
x << 24 )
#endif // defined( _MSC_VER )
#endif // LZAV_LITTLE_ENDIAN
/**
* @def LZAV_LIKELY( x )
* @brief Likelihood macro that is used for manually-guided
* micro-optimization.
* @param x Expression that is likely to be evaluated to 1.
*/
/**
* @def LZAV_UNLIKELY( x )
* @brief Unlikelihood macro that is used for manually-guided
* micro-optimization.
* @param x Expression that is unlikely to be evaluated to 1.
*/
#if defined( LZAV_GCC_BUILTINS ) && \
!( defined( __aarch64__ ) && defined( __APPLE__ ))
#define LZAV_LIKELY( x ) __builtin_expect( x, 1 )
#define LZAV_UNLIKELY( x ) __builtin_expect( x, 0 )
#else // Likelihood macros
#define LZAV_LIKELY( x ) ( x )
#define LZAV_UNLIKELY( x ) ( x )
#endif // Likelihood macros
#if defined( _MSC_VER ) && !defined( LZAV_GCC_BUILTINS )
#include <intrin.h> // For _BitScanForward.
#endif // defined( _MSC_VER ) && !defined( LZAV_GCC_BUILTINS )
#if defined( LZAV_NS )
namespace LZAV_NS {
using std :: memcpy;
using std :: memset;
using std :: malloc;
using std :: free;
using std :: size_t;
#if __cplusplus >= 201103L
using std :: intptr_t;
using std :: uint16_t;
using std :: uint32_t;
using uint8_t = unsigned char; ///< For C++ type aliasing compliance.
#if defined( LZAV_ARCH64 )
using std :: uint64_t;
#endif // defined( LZAV_ARCH64 )
#endif // __cplusplus >= 201103L
namespace enum_wrapper {
#endif // defined( LZAV_NS )
/**
* @brief Decompression error codes.
*/
enum LZAV_ERROR
{
    LZAV_E_PARAMS = -1, ///< Incorrect function parameters.
    LZAV_E_SRCOOB = -2, ///< Source buffer OOB (out-of-bounds read).
    LZAV_E_DSTOOB = -3, ///< Destination buffer OOB (out-of-bounds write).
    LZAV_E_REFOOB = -4, ///< Back-reference OOB (offset beyond written data).
    LZAV_E_DSTLEN = -5, ///< Decompressed length mismatch.
    LZAV_E_UNKFMT = -6, ///< Unknown stream format.
    LZAV_E_PTROVR = -7 ///< Pointer overflow.
};
#if defined( LZAV_NS )
} // namespace enum_wrapper
using namespace enum_wrapper;
#endif // defined( LZAV_NS )
#define LZAV_HASH_C1 0x243F6A88 ///< Hash function constant 1.
#define LZAV_HASH_C2 0x85A308D3 ///< Hash function constant 2.
/**
* Enumeration used to define compression algorithm's parameters.
*/
enum LZAV_PARAM
{
    LZAV_WIN_LEN = ( 1 << 23 ), ///< LZ77 window length (8 MiB), in bytes.
    LZAV_REF_LEN = ( 15 + 255 + 254 ), ///< Max ref length, minus `mref`.
    LZAV_LIT_FIN = 6, ///< The number of literals required at finish.
    LZAV_OFS_MIN = 8, ///< The minimal reference offset to use.
    LZAV_OFS_TH1 = (( 1 << 10 ) - 1 ), ///< Offset threshold 1 (10-bit max).
    LZAV_OFS_TH2 = (( 1 << 18 ) - 1 ), ///< Offset threshold 2 (18-bit max).
    LZAV_FMT_CUR = 2 ///< Stream format identifier used by the compressor.
};
/**
* @brief Data match length finding function.
*
* Function finds the number of continuously-matching leading bytes between
* two buffers. This function is well-optimized for a wide variety of
* compilers and platforms.
*
* @param p1 Pointer to buffer 1.
* @param p2 Pointer to buffer 2.
* @param ml Maximal number of bytes to match.
* @return The number of matching leading bytes.
*/
static inline size_t lzav_match_len( const uint8_t* p1, const uint8_t* p2,
    const size_t ml ) LZAV_NOEX
{
    const uint8_t* const p1s = p1; // Start position, to compute the result.
    const uint8_t* const p1e = p1 + ml; // Exclusive comparison limit.
#if defined( LZAV_ARCH64 )
    // Compare 8 bytes at a time; `memcpy` avoids unaligned-access and
    // strict-aliasing problems.
    while( LZAV_LIKELY( p1 + 7 < p1e ))
    {
        uint64_t v1, v2, vd;
        memcpy( &v1, p1, 8 );
        memcpy( &v2, p2, 8 );
        vd = v1 ^ v2; // Non-zero bits mark the first differing byte.
        if( vd != 0 )
        {
#if defined( LZAV_GCC_BUILTINS )
#if LZAV_LITTLE_ENDIAN
            // Trailing zero bits / 8 == count of leading equal bytes.
            return( (size_t) ( p1 - p1s + ( __builtin_ctzll( vd ) >> 3 )));
#else // LZAV_LITTLE_ENDIAN
            return( (size_t) ( p1 - p1s + ( __builtin_clzll( vd ) >> 3 )));
#endif // LZAV_LITTLE_ENDIAN
#else // defined( LZAV_GCC_BUILTINS )
#if defined( _MSC_VER )
            unsigned long i;
            _BitScanForward64( &i, (unsigned __int64) vd );
            return( (size_t) ( p1 - p1s + ( i >> 3 )));
#else // defined( _MSC_VER )
#if !LZAV_LITTLE_ENDIAN
            // Byte-swap `vd` so the first memory byte is the lowest byte.
            const uint64_t sw = vd >> 32 | vd << 32;
            const uint64_t sw2 =
                ( sw & (uint64_t) 0xFFFF0000FFFF0000 ) >> 16 |
                ( sw & (uint64_t) 0x0000FFFF0000FFFF ) << 16;
            vd = ( sw2 & (uint64_t) 0xFF00FF00FF00FF00 ) >> 8 |
                ( sw2 & (uint64_t) 0x00FF00FF00FF00FF ) << 8;
#endif // !LZAV_LITTLE_ENDIAN
            // Portable trailing-zero-byte count via multiply-and-shift.
            const uint64_t m = (uint64_t) 0x0101010101010101;
            return( (size_t) ( p1 - p1s +
                (((( vd ^ ( vd - 1 )) & ( m - 1 )) * m ) >> 56 )));
#endif // defined( _MSC_VER )
#endif // defined( LZAV_GCC_BUILTINS )
        }
        p1 += 8;
        p2 += 8;
    }
    // At most 7 bytes left.
    if( LZAV_LIKELY( p1 + 3 < p1e ))
    {
#else // defined( LZAV_ARCH64 )
    // 32-bit variant: compare 4 bytes at a time.
    while( LZAV_LIKELY( p1 + 3 < p1e ))
    {
#endif // defined( LZAV_ARCH64 )
        uint32_t v1, v2, vd;
        memcpy( &v1, p1, 4 );
        memcpy( &v2, p2, 4 );
        vd = v1 ^ v2;
        if( vd != 0 )
        {
#if defined( LZAV_GCC_BUILTINS )
#if LZAV_LITTLE_ENDIAN
            return( (size_t) ( p1 - p1s + ( __builtin_ctz( vd ) >> 3 )));
#else // LZAV_LITTLE_ENDIAN
            return( (size_t) ( p1 - p1s + ( __builtin_clz( vd ) >> 3 )));
#endif // LZAV_LITTLE_ENDIAN
#else // defined( LZAV_GCC_BUILTINS )
#if defined( _MSC_VER )
            unsigned long i;
            _BitScanForward( &i, (unsigned long) vd );
            return( (size_t) ( p1 - p1s + ( i >> 3 )));
#else // defined( _MSC_VER )
            // Normalize to little-endian, then count trailing zero bytes.
            LZAV_IEC32( vd );
            const uint32_t m = 0x01010101;
            return( (size_t) ( p1 - p1s +
                (((( vd ^ ( vd - 1 )) & ( m - 1 )) * m ) >> 24 )));
#endif // defined( _MSC_VER )
#endif // defined( LZAV_GCC_BUILTINS )
        }
        p1 += 4;
        p2 += 4;
    }
    // At most 3 bytes left: compare byte-by-byte.
    if( p1 < p1e )
    {
        if( *p1 != p2[ 0 ])
        {
            return( (size_t) ( p1 - p1s ));
        }
        if( ++p1 < p1e )
        {
            if( *p1 != p2[ 1 ])
            {
                return( (size_t) ( p1 - p1s ));
            }
            if( ++p1 < p1e )
            {
                if( *p1 != p2[ 2 ])
                {
                    return( (size_t) ( p1 - p1s ));
                }
            }
        }
    }
    return( ml ); // All `ml` bytes match.
}
/**
* @brief Data match length finding function, reverse direction.
*
* @param p1 Origin pointer to buffer 1.
* @param p2 Origin pointer to buffer 2.
* @param ml Maximal number of bytes to back-match.
* @return The number of matching prior bytes, not including origin position.
*/
static inline size_t lzav_match_len_r( const uint8_t* p1, const uint8_t* p2,
    const size_t ml ) LZAV_NOEX
{
    if( LZAV_UNLIKELY( ml == 0 ))
    {
        return( 0 );
    }
    // The byte just before the origin must match for any non-zero result.
    if( p1[ -1 ] != p2[ -1 ])
    {
        return( 0 );
    }
    if( LZAV_UNLIKELY( ml != 1 ))
    {
        const uint8_t* const p1s = p1; // Origin, to compute the result.
        const uint8_t* p1e = p1 - ml + 1; // Lowest comparison position.
        p1--;
        p2--;
        // Compare 2 bytes at a time, moving backwards.
        while( LZAV_UNLIKELY( p1 > p1e ))
        {
            uint16_t v1, v2;
            memcpy( &v1, p1 - 2, 2 );
            memcpy( &v2, p2 - 2, 2 );
            const uint32_t vd = (uint32_t) ( v1 ^ v2 );
            if( vd != 0 )
            {
                // Credit one extra byte if only the byte nearer to the
                // origin of the pair matched.
#if LZAV_LITTLE_ENDIAN
                return( (size_t) ( p1s - p1 + (( vd & 0xFF00 ) == 0 )));
#else // LZAV_LITTLE_ENDIAN
                return( (size_t) ( p1s - p1 + (( vd & 0x00FF ) == 0 )));
#endif // LZAV_LITTLE_ENDIAN
            }
            p1 -= 2;
            p2 -= 2;
        }
        // Handle a possible final single-byte comparison.
        p1e--;
        if( p1 > p1e && p1[ -1 ] != p2[ -1 ])
        {
            return( (size_t) ( p1s - p1 ));
        }
    }
    return( ml );
}
/**
* @brief Internal LZAV block header writing function (stream format 2).
*
* Internal function writes a block to the output buffer. This function can be
* used in custom compression algorithms.
*
* Stream format 2.
*
* "Raw" compressed stream consists of any quantity of unnumerated "blocks".
* A block starts with a header byte, followed by several optional bytes.
* Bits 4-5 of the header specify block's type.
*
* CC00LLLL: literal block (1-6 bytes). `LLLL` is literal length.
*
* OO01RRRR: 10-bit offset block (2-4 bytes). `RRRR` is reference length.
*
* OO10RRRR: 18-bit offset block (3-5 bytes).
*
* OO11RRRR: 23-bit offset block (4-6 bytes).
*
* If `LLLL` or `RRRR` equals 0, a value of 16 is assumed, and an additional
* length byte follows. If in a literal block this additional byte's highest
* bit is 1, one more length byte follows that defines higher bits of length
* (up to 4 bytes). In a reference block, additional 1-2 length bytes follow
* the offset bytes. `CC` is a reference offset carry value (additional 2
* lowest bits of offset of the next reference block). Block type 3 includes 3
* carry bits (highest bits of 4th byte).
*
* The overall compressed data is prefixed with a byte whose lower 4 bits
* contain minimal reference length (mref), and the highest 4 bits contain
* stream format identifier. Compressed data always finishes with
* @ref LZAV_LIT_FIN literals. The lzav_write_fin_2() function should be used
* to finalize compression.
*
* Except the last block, a literal block is always followed by a reference
* block.
*
* @param op Output buffer pointer.
* @param lc Literal length, in bytes.
* @param rc Reference length, in bytes, not lesser than mref.
* @param d Reference offset, in bytes. Should be lesser than
* @ref LZAV_WIN_LEN, and not lesser than `rc` since fast copy on
* decompression cannot provide consistency of copying of data that is not in
* the output yet.
* @param ipa Literals anchor pointer.
* @param cbpp Pointer to the pointer to the latest offset carry block header.
* Cannot be 0, but the contained pointer can be 0 (initial value).
* @param cshp Pointer to offset carry shift.
* @param mref Minimal reference length, in bytes, used by the compression
* algorithm.
* @return Incremented output buffer pointer.
*/
static inline uint8_t* lzav_write_blk_2( uint8_t* op, size_t lc, size_t rc,
    size_t d, const uint8_t* ipa, uint8_t** const cbpp, int* const cshp,
    const size_t mref ) LZAV_NOEX
{
    // Perform offset carry to a previous block (`csh` may be zero).
    const int csh = *cshp;
    rc = rc + 1 - mref; // Store reference length relative to `mref - 1`.
    **cbpp |= (uint8_t) (( d << 8 ) >> csh ); // Lowest offset bits go into
        // the previously-written carry block header.
    d >>= csh;
    if( LZAV_UNLIKELY( lc != 0 ))
    {
        // Write a literal block.
        size_t cv; // Offset carry value in literal block.
        cv = ( d & 3 ) << 6;
        d >>= 2;
        if( LZAV_LIKELY( lc < 9 ))
        {
            *op = (uint8_t) ( cv | lc );
            op++;
            // Fixed-size over-copy; the destination buffer has slack space
            // (see lzav_compress_bound), only `lc` bytes are kept.
            memcpy( op, ipa, 8 );
            op += lc;
        }
        else
        if( LZAV_LIKELY( lc < 16 ))
        {
            *op = (uint8_t) ( cv | lc );
            op++;
            memcpy( op, ipa, 16 );
            op += lc;
        }
        else
        if( LZAV_LIKELY( lc < 16 + 128 ))
        {
            // `LLLL` == 0 with a single extension length byte.
#if LZAV_LITTLE_ENDIAN
            uint16_t ov = (uint16_t) (( lc - 16 ) << 8 | cv );
#else // LZAV_LITTLE_ENDIAN
            uint16_t ov = (uint16_t) ( cv << 8 | ( lc - 16 ));
#endif // LZAV_LITTLE_ENDIAN
            memcpy( op, &ov, 2 );
            op += 2;
            memcpy( op, ipa, 16 );
            memcpy( op + 16, ipa + 16, 16 );
            if( lc < 33 )
            {
                op += lc;
            }
            else
            {
                // Copy the remaining tail byte-wise.
                ipa += 32;
                op += 32;
                lc -= 32;
                do
                {
                    *op = *ipa;
                    ipa++;
                    op++;
                } while( --lc != 0 );
            }
        }
        else
        {
            // Long literal run: multi-byte length, 7 bits per byte, the
            // high bit marks a continuation byte.
            *op = (uint8_t) cv;
            op++;
            size_t lcw = lc - 16;
            while( lcw > 127 )
            {
                *op = (uint8_t) ( 0x80 | lcw );
                lcw >>= 7;
                op++;
            }
            *op = (uint8_t) lcw;
            op++;
            memcpy( op, ipa, lc );
            op += lc;
        }
    }
    // Write a reference block.
    static const int ocsh[ 4 ] = { 0, 0, 0, 3 }; // Carry shift, per block
        // type (type 3 carries 3 offset bits, see format description).
    const size_t bt = (size_t) 1 + ( d > LZAV_OFS_TH1 ) + ( d > LZAV_OFS_TH2 );
    if( LZAV_LIKELY( rc < 16 ))
    {
        uint32_t ov = (uint32_t) ( d << 6 | bt << 4 | rc );
        LZAV_IEC32( ov );
        memcpy( op, &ov, 4 ); // Over-write up to 4 bytes; only `bt + 1`
            // bytes are kept.
        op += bt;
        *cshp = ocsh[ bt ];
        *cbpp = op; // The last offset byte is the next carry target.
        return( op + 1 );
    }
    uint32_t ov = (uint32_t) ( d << 6 | bt << 4 ); // `RRRR` == 0: 16+ length.
    LZAV_IEC32( ov );
    memcpy( op, &ov, 4 );
    op += bt;
    *cshp = ocsh[ bt ];
    *cbpp = op;
    if( LZAV_LIKELY( rc < 16 + 255 ))
    {
        op[ 1 ] = (uint8_t) ( rc - 16 );
        return( op + 2 );
    }
    op[ 1 ] = (uint8_t) 255;
    op[ 2 ] = (uint8_t) ( rc - 16 - 255 );
    return( op + 3 );
}
/**
* @brief Internal LZAV finishing function (stream format 2).
*
* Internal function writes finishing literal block(s) to the output buffer.
* This function can be used in custom compression algorithms.
*
* Stream format 2.
*
* @param op Output buffer pointer.
* @param lc Literal length, in bytes. Not less than @ref LZAV_LIT_FIN.
* @param ipa Literals anchor pointer.
* @return Incremented output buffer pointer.
*/
static inline uint8_t* lzav_write_fin_2( uint8_t* op, size_t lc,
    const uint8_t* ipa ) LZAV_NOEX
{
    // Emit the finishing literal block's header. Lengths of 16 or more use
    // `LLLL` == 0 followed by extension bytes carrying 7 bits each, with
    // the high bit set on all but the last extension byte.
    if( lc > 15 )
    {
        *op++ = 0; // Zero `LLLL` denotes an extended length.
        size_t rem = lc - 16;
        while( rem > 127 )
        {
            *op++ = (uint8_t) ( 0x80 | rem );
            rem >>= 7;
        }
        *op++ = (uint8_t) rem;
    }
    else
    {
        *op++ = (uint8_t) lc; // Short run: length fits the header byte.
    }
    // Copy the literals themselves and return the advanced pointer.
    memcpy( op, ipa, lc );
    return( op + lc );
}
/**
* @brief Function returns buffer size required for LZAV compression.
*
* @param srcl The length of the source data to be compressed.
* @return The required allocation size for destination compression buffer.
* Always a positive value.
*/
static inline int lzav_compress_bound( const int srcl ) LZAV_NOEX
{
    // Degenerate input: a fixed small buffer is always sufficient.
    if( srcl <= 0 )
    {
        return( 16 );
    }
    const int maxlit = 16 + 127 + 1; // Literals coverable by a 2-byte header.
    const int nref = srcl / ( maxlit + 6 ); // Worst-case reference count.
    // Header bytes for literal runs (integer division must precede the
    // doubling, as in the stream format's worst-case accounting).
    const int lithdr = ( srcl - nref * 6 + maxlit - 1 ) / maxlit;
    return( lithdr * 2 + srcl - nref + 16 );
}
/**
* @brief Function returns buffer size required for the higher-ratio LZAV
* compression.
*
* @param srcl The length of the source data to be compressed.
* @return The required allocation size for destination compression buffer.
* Always a positive value.
*/
static inline int lzav_compress_bound_hi( const int srcl ) LZAV_NOEX
{
    // Degenerate input: a fixed small buffer is always sufficient.
    if( srcl <= 0 )
    {
        return( 16 );
    }
    const int nref = srcl / ( 16 + 5 ); // Worst-case reference count.
    // Literal-run header overhead; division must precede the doubling.
    const int lithdr = ( srcl - nref * 5 + 15 ) / 16;
    return( lithdr * 2 + srcl - nref + 16 );
}
/**
* @brief Hash-table initialization function.
*
* Function initializes the hash-table by replicating the contents of the
* specified tuple value.
*
* @param[out] ht Hash-table pointer.
* @param htsize Hash-table size. The size should be a power of 2 value, not
* lesser than 64 bytes.
* @param[in] initv Pointer to initialized 8-byte tuple.
*/
static inline void lzav_ht_init( uint8_t* const ht, const size_t htsize,
    const uint32_t* const initv ) LZAV_NOEX
{
    // Seed the first 16 bytes with two copies of the 8-byte tuple, then
    // replicate that pattern across the rest of the table. `htsize` is a
    // power of 2 not lesser than 64 (per the contract above), so it is
    // always a multiple of the 16-byte stride.
    memcpy( ht, initv, 8 );
    memcpy( ht + 8, initv, 8 );
    uint8_t* dst = ht + 16;
    uint8_t* const end = ht + htsize;
    while( LZAV_LIKELY( dst != end ))
    {
        memcpy( dst, ht, 16 );
        dst += 16;
    }
}
/**
* @brief LZAV compression function, with external buffer option.
*
* Function performs in-memory data compression using the LZAV compression
* algorithm and stream format. The function produces a "raw" compressed data,
* without a header containing data length nor identifier nor checksum.
*
* Note that compression algorithm and its output on the same source data may
* differ between LZAV versions, and may differ between big- and little-endian
* systems. However, the decompression of a compressed data produced by any
* prior compressor version will remain possible.
*
* @param[in] src Source (uncompressed) data pointer, can be 0 if `srcl`
* equals 0. Address alignment is unimportant.
* @param[out] dst Destination (compressed data) buffer pointer. The allocated
* size should be at least lzav_compress_bound() bytes large. Address
* alignment is unimportant. Should be different to `src`.
* @param srcl Source data length, in bytes, can be 0: in this case the
* compressed length is assumed to be 0 as well.
* @param dstl Destination buffer's capacity, in bytes.
* @param ext_buf External buffer to use for hash-table, set to null for the
* function to manage memory itself (via standard `malloc`). Supplying a
* pre-allocated buffer is useful if compression is performed during
* application's operation often: this reduces memory allocation overhead and
* fragmentation. Note that the access to the supplied buffer is not
* implicitly thread-safe. Buffer's address must be aligned to 32 bits.
* @param ext_bufl The capacity of the `ext_buf`, in bytes, should be a
* power-of-2 value. Set to 0 if `ext_buf` is null. The capacity should not be
* lesser than `4*srcl`, and for default compression ratio should not be
* greater than 1 MiB. Same `ext_bufl` value can be used for any smaller
* source data. Using smaller `ext_bufl` values reduces the compression ratio
* and, at the same time, increases compression speed. This aspect can be
* utilized on memory-constrained and low-performance processors.
* @return The length of compressed data, in bytes. Returns 0 if `srcl` is
* lesser or equal to 0, or if `dstl` is too small, or if buffer pointers are
* invalid, or if not enough memory.
*/
static inline int lzav_compress( const void* const src, void* const dst,
    const int srcl, const int dstl, void* const ext_buf,
    const int ext_bufl ) LZAV_NOEX
{
    // Reject invalid parameters and insufficient destination capacity.
    if(( srcl <= 0 ) | ( src == LZAV_NULL ) | ( dst == LZAV_NULL ) |
        ( src == dst ) | ( dstl < lzav_compress_bound( srcl )))
    {
        return( 0 );
    }
    const size_t mref = 6; // Minimal reference length, in bytes.
    const size_t mlen = LZAV_REF_LEN + mref;
    uint8_t* op = (uint8_t*) dst; // Destination (compressed data) pointer.
    *op = (uint8_t) ( LZAV_FMT_CUR << 4 | mref ); // Write prefix byte.
    op++;
    if( srcl < 16 )
    {
        // Handle a very short source data.
        *op = (uint8_t) srcl;
        op++;
        memcpy( op, src, (size_t) srcl );
        if( srcl > LZAV_LIT_FIN - 1 )
        {
            return( 2 + srcl );
        }
        // Pad with zero bytes so that `LZAV_LIT_FIN` literals are present.
        memset( op + srcl, 0, (size_t) ( LZAV_LIT_FIN - srcl ));
        return( 2 + LZAV_LIT_FIN );
    }
    uint32_t stack_buf[ 2048 ]; // On-stack hash-table (8 KiB).
    void* alloc_buf = LZAV_NULL; // Hash-table allocated on heap.
    uint8_t* ht = (uint8_t*) stack_buf; // The actual hash-table pointer.
    size_t htsize; // Hash-table's size in bytes (power-of-2).
    htsize = ( 1 << 7 ) * sizeof( uint32_t ) * 4;
    if( ext_buf == LZAV_NULL )
    {
        // Grow the table with the source length, capped at 1 MiB.
        while( htsize != ( 1 << 20 ) && ( htsize >> 2 ) < (size_t) srcl )
        {
            htsize <<= 1;
        }
        if( htsize > sizeof( stack_buf ))
        {
            alloc_buf = malloc( htsize );
            if( alloc_buf == LZAV_NULL )
            {
                return( 0 );
            }
            ht = (uint8_t*) alloc_buf;
        }
    }
    else
    {
        // Use the external buffer only if larger than the on-stack one.
        size_t htsizem;
        if( ext_bufl > (int) sizeof( stack_buf ))
        {
            htsizem = (size_t) ext_bufl;
            ht = (uint8_t*) ext_buf;
        }
        else
        {
            htsizem = sizeof( stack_buf );
        }
        while(( htsize >> 2 ) < (size_t) srcl )
        {
            const size_t htsize2 = htsize << 1;
            if( htsize2 > htsizem )
            {
                break;
            }
            htsize = htsize2;
        }
    }
    // Initialize the hash-table. Each hash-table item consists of 2 tuples
    // (4 initial match bytes; 32-bit source data offset). Set source data
    // offset to avoid OOB in back-match.
    uint32_t initv[ 2 ] = { 0, 16 };
    if( LZAV_LIKELY( srcl > 19 ))
    {
        memcpy( initv, (const uint8_t*) src + 16, 4 );
    }
    lzav_ht_init( ht, htsize, initv );
    const uint32_t hmask = (uint32_t) (( htsize - 1 ) ^ 15 ); // Hash mask.
    const uint8_t* ip = (const uint8_t*) src; // Source data pointer.
    const uint8_t* const ipe = ip + srcl - LZAV_LIT_FIN; // End pointer.
    const uint8_t* const ipet = ipe - 15 + LZAV_LIT_FIN; // Hashing threshold,
        // avoids I/O OOB.
    const uint8_t* ipa = ip; // Literals anchor pointer.
    uint8_t* cbp = op; // Pointer to the latest offset carry block header.
    int csh = 0; // Offset carry shift.
    intptr_t mavg = 100 << 21; // Running average of hash match rate (*2^15).
        // Two-factor average: success (0-64) by average reference length.
    uint32_t rndb = 0; // PRNG bit derived from the non-matching offset.
    ip += 16; // Skip source bytes, to avoid OOB in back-match.
    while( LZAV_LIKELY( ip < ipet ))
    {
        // Hash source data (endianness is minimally important for compression
        // efficiency). Hashes 6 bytes: a 4-byte word plus a 2-byte word.
        uint32_t iw1;
        uint16_t iw2, ww2;
        memcpy( &iw1, ip, 4 );
        uint32_t Seed1 = LZAV_HASH_C1;
        uint32_t hval = LZAV_HASH_C2;
        memcpy( &iw2, ip + 4, 2 );
        Seed1 ^= iw1;
        hval ^= iw2;
        hval *= Seed1;
        hval >>= 12;
        // Hash-table access.
        uint32_t* const hp = (uint32_t*) ( ht + ( hval & hmask ));
        const uint32_t ipo = (uint32_t) ( ip - (const uint8_t*) src );
        const uint32_t hw1 = hp[ 0 ]; // Tuple 1's match word.
        const uint8_t* wp; // At window pointer.
        size_t d, ml, rc, lc;
        // Find source data in hash-table tuples. A candidate must match on
        // the first 4 bytes AND on the following 2 bytes.
        if( LZAV_LIKELY( iw1 != hw1 ))
        {
            if( LZAV_LIKELY( iw1 != hp[ 2 ]))
            {
                goto _no_match;
            }
            wp = (const uint8_t*) src + hp[ 3 ];
            memcpy( &ww2, wp + 4, 2 );
            if( LZAV_UNLIKELY( iw2 != ww2 ))
            {
                goto _no_match;
            }
        }
        else
        {
            wp = (const uint8_t*) src + hp[ 1 ];
            memcpy( &ww2, wp + 4, 2 );
            if( LZAV_UNLIKELY( iw2 != ww2 ))
            {
                // Tuple 1 failed on bytes 4-5; fall back to tuple 2.
                if( LZAV_LIKELY( iw1 != hp[ 2 ]))
                {
                    goto _no_match;
                }
                wp = (const uint8_t*) src + hp[ 3 ];
                memcpy( &ww2, wp + 4, 2 );
                if( LZAV_UNLIKELY( iw2 != ww2 ))
                {
                    goto _no_match;
                }
            }
        }
        // Source data and hash-table entry matched.
        d = (size_t) ( ip - wp ); // Reference offset (distance).
        ml = (size_t) ( ipe - ip ); // Max reference match length. Make sure
            // `LZAV_LIT_FIN` literals remain on finish.
        if( LZAV_UNLIKELY( d - LZAV_OFS_MIN >
            LZAV_WIN_LEN - LZAV_OFS_MIN - 1 ))
        {
            // Small offsets may be inefficient (wrap over 0 for efficiency).
            goto _d_oob;
        }
        // Disallow reference copy overlap by using `d` as max match length.
        ml = ( ml > d ? d : ml );
        ml = ( ml > mlen ? mlen : ml );
        if( LZAV_LIKELY( d > 273 ))
        {
            // Update a matching entry only if it is not an adjacent
            // replication. Otherwise, source data consisting of same-byte
            // runs won't compress well.
            if( LZAV_LIKELY( iw1 == hw1 )) // Replace tuple, or insert.
            {
                hp[ 1 ] = ipo;
            }
            else
            {
                hp[ 2 ] = hw1;
                hp[ 3 ] = hp[ 1 ];
                hp[ 0 ] = iw1;
                hp[ 1 ] = ipo;
            }
        }
        // First `mref` bytes already matched; extend forward.
        rc = mref + lzav_match_len( ip + mref, wp + mref, ml - mref );
        lc = (size_t) ( ip - ipa );
        if( LZAV_UNLIKELY( lc != 0 ))
        {
            // Try to consume literals by finding a match at a back-position.
            ml -= rc;
            size_t bmc = ( lc > 16 ? 16 : lc );
            if( LZAV_LIKELY( ml > bmc ))
            {
                ml = bmc;
            }
            bmc = lzav_match_len_r( ip, wp, ml );
            if( LZAV_UNLIKELY( bmc != 0 ))
            {
                rc += bmc;
                ip -= bmc;
                lc -= bmc;
            }
        }
        op = lzav_write_blk_2( op, lc, rc, d, ipa, &cbp, &csh, mref );
        ip += rc;
        ipa = ip;
        mavg += ( (intptr_t) ( rc << 21 ) - mavg ) >> 10;
        continue;
    _d_oob:
        ip++;
        if( LZAV_LIKELY( d < LZAV_WIN_LEN ))
        {
            continue;
        }
        // Offset is beyond the window: refresh the stale tuple's offset.
        hp[ 1 + ( iw1 != hw1 ) * 2 ] = ipo;
        continue;
    _no_match:
        // Store the current word and offset into tuple 2.
        hp[ 2 ] = iw1;
        mavg -= mavg >> 11;
        hp[ 3 ] = ipo;
        if( mavg < ( 200 << 14 ) && ip != ipa ) // Speed-up threshold.
        {
            // Compression speed-up technique that keeps the number of hash
            // evaluations around 45% of compressed data length. In some cases
            // reduces the number of blocks by several percent.
            // NOTE(review): thresholds below are upstream-tuned constants.
            ip += 1 + rndb; // Use PRNG bit to dither match positions.
            rndb = ipo & 1; // Delay to decorrelate from current match.
            if( LZAV_UNLIKELY( mavg < ( 130 << 14 )))
            {
                ip++;
                if( LZAV_UNLIKELY( mavg < ( 100 << 14 )))
                {
                    ip += (intptr_t) 100 - ( mavg >> 14 ); // Gradually faster.
                }
            }
        }
        ip++;
    }
    if( alloc_buf != LZAV_NULL )
    {
        free( alloc_buf );
    }
    // Flush remaining literals (including the `LZAV_LIT_FIN` tail) and
    // return the total compressed length.
    return( (int) ( lzav_write_fin_2( op, (size_t) ( ipe - ipa +
        LZAV_LIT_FIN ), ipa ) - (uint8_t*) dst ));
}
/**
* @brief Default LZAV compression function.
*
* Function performs in-memory data compression using the LZAV compression
* algorithm, with the default settings.
*
* See the lzav_compress() function for a more detailed description.
*
* @param[in] src Source (uncompressed) data pointer.
* @param[out] dst Destination (compressed data) buffer pointer. The allocated
* size should be at least lzav_compress_bound() bytes large.
* @param srcl Source data length, in bytes.
* @param dstl Destination buffer's capacity, in bytes.
* @return The length of compressed data, in bytes. Returns 0 if `srcl` is
* lesser or equal to 0, or if `dstl` is too small, or if not enough memory.
*/
static inline int lzav_compress_default( const void* const src,
    void* const dst, const int srcl, const int dstl ) LZAV_NOEX
{
    // Delegate to the general compression function, letting it manage the
    // hash-table memory internally (no external buffer).
    const int res = lzav_compress( src, dst, srcl, dstl, LZAV_NULL, 0 );
    return( res );
}
/**
* @brief Higher-ratio LZAV compression function (much slower).
*
* Function performs in-memory data compression using the higher-ratio LZAV
* compression algorithm.
*
* @param[in] src Source (uncompressed) data pointer.
* @param[out] dst Destination (compressed data) buffer pointer. The allocated
* size should be at least lzav_compress_bound_hi() bytes large.
* @param srcl Source data length, in bytes.
* @param dstl Destination buffer's capacity, in bytes.
* @return The length of compressed data, in bytes. Returns 0 if `srcl` is
* lesser or equal to 0, or if `dstl` is too small, or if buffer pointers are
* invalid, or if not enough memory.
*/
static inline int lzav_compress_hi( const void* const src, void* const dst,
const int srcl, const int dstl ) LZAV_NOEX
{
if(( srcl <= 0 ) | ( src == LZAV_NULL ) | ( dst == LZAV_NULL ) |
( src == dst ) | ( dstl < lzav_compress_bound_hi( srcl )))
{
return( 0 );
}
const size_t mref = 5; // Minimal reference length, in bytes.
const size_t mlen = LZAV_REF_LEN + mref;
uint8_t* op = (uint8_t*) dst; // Destination (compressed data) pointer.
*op = (uint8_t) ( LZAV_FMT_CUR << 4 | mref ); // Write prefix byte.
op++;
if( srcl < 16 )
{
// Handle a very short source data.
*op = (uint8_t) srcl;
op++;
memcpy( op, src, (size_t) srcl );
if( srcl > LZAV_LIT_FIN - 1 )
{
return( 2 + srcl );
}
memset( op + srcl, 0, (size_t) ( LZAV_LIT_FIN - srcl ));
return( 2 + LZAV_LIT_FIN );
}
size_t htsize; // Hash-table's size in bytes (power-of-2).
htsize = ( 1 << 7 ) * sizeof( uint32_t ) * 2 * 8;
while( htsize != ( 1 << 23 ) && ( htsize >> 2 ) < (size_t) srcl )
{
htsize <<= 1;
}
uint8_t* const ht = (uint8_t*) malloc( htsize ); // The hash-table pointer.
if( ht == LZAV_NULL )
{
return( 0 );
}
// Initialize the hash-table. Each hash-table item consists of 8 tuples
// (4 initial match bytes; 32-bit source data offset). The last value of
// the last tuple is used as head tuple offset (an even value).
uint32_t initv[ 2 ] = { 0, 0 };
memcpy( initv, src, 4 );
lzav_ht_init( ht, htsize, initv );
const uint32_t hmask = (uint32_t) (( htsize - 1 ) ^ 63 ); // Hash mask.
const uint8_t* ip = (const uint8_t*) src; // Source data pointer.
const uint8_t* const ipe = ip + srcl - LZAV_LIT_FIN; // End pointer.
const uint8_t* const ipet = ipe - 15 + LZAV_LIT_FIN; // Hashing threshold,
// avoids I/O OOB.
const uint8_t* ipa = ip; // Literals anchor pointer.
uint8_t* cbp = op; // Pointer to the latest offset carry block header.
int csh = 0; // Offset carry shift.
size_t prc = 0; // Length of a previously found match.
size_t pd = 0; // Distance of a previously found match.
const uint8_t* pip = ip; // Source pointer of a previously found match.
while( LZAV_LIKELY( ip < ipet ))
{
// Hash source data (endianness is minimally important for compression
// efficiency).
uint32_t iw1;
memcpy( &iw1, ip, 4 );
uint32_t Seed1 = LZAV_HASH_C1;
uint32_t hval = LZAV_HASH_C2;
Seed1 ^= iw1;
hval ^= ip[ 4 ];
hval *= Seed1;
hval >>= 8;
// Hash-table access.
uint32_t* const hp = (uint32_t*) ( ht + ( hval & hmask ));
const uint32_t ipo = (uint32_t) ( ip - (const uint8_t*) src );
size_t ti0 = hp[ 15 ]; // Head tuple offset.
// Find source data in hash-table tuples, in up to 7 previous
// positions.
const uint8_t* wp = ip; // Best found window pointer.
const size_t mle = (size_t) ( ipe - ip ); // Match length bound.
size_t rc = 0; // Best found match length-4, 0 - not found.
size_t d; // Reference offset (distance).
size_t ti = ti0;
int i;
if( LZAV_LIKELY( mlen < mle ))
{
// Optimized match-finding.
for( i = 0; i < 7; i++ )
{
const uint32_t ww1 = hp[ ti ];
const uint8_t* const wp0 = (const uint8_t*) src + hp[ ti + 1 ];
d = (size_t) ( ip - wp0 );
ti = ( ti == 12 ? 0 : ti + 2 );
if( iw1 == ww1 )
{
d = ( d < 4 ? 4 : d );
const size_t rc0 = lzav_match_len( ip + 4, wp0 + 4,
( d > mlen ? mlen : d ) - 4 );
if( rc0 > rc )
{
wp = wp0;
rc = rc0;
}
}
}
}
else
{
for( i = 0; i < 7; i++ )
{
const uint32_t ww1 = hp[ ti ];
const uint8_t* const wp0 = (const uint8_t*) src + hp[ ti + 1 ];
d = (size_t) ( ip - wp0 );
ti = ( ti == 12 ? 0 : ti + 2 );
if( iw1 == ww1 )
{
// Disallow reference copy overlap by using `d` as max
// match length. Fix `d` if it is lesser than 4 (this is
// safe as max `ip` is lesser than `ipe` by `mref` bytes).
d = ( d < 4 ? 4 : d );
// Make sure `LZAV_LIT_FIN` literals remain on finish.
size_t ml = ( mle > d ? d : mle );
ml = ( ml > mlen ? mlen : ml );
const size_t rc0 = lzav_match_len( ip + 4, wp0 + 4,
ml - 4 );
if( rc0 > rc )
{
wp = wp0;
rc = rc0;
}
}
}
}
rc += 4;
d = (size_t) ( ip - wp );
if( LZAV_LIKELY( d != rc ))
{
// Update hash-table entry, if there was no match, or if the match
// is not an adjacent replication.
ti0 = ( ti0 == 0 ? 12 : ti0 - 2 );
hp[ ti0 ] = iw1;
hp[ ti0 + 1 ] = ipo;
hp[ 15 ] = (uint32_t) ti0;
}
if(( rc < mref + ( d > LZAV_OFS_TH2 )) |
( d - LZAV_OFS_MIN > LZAV_WIN_LEN - LZAV_OFS_MIN - 1 ))
{
ip++;
continue;
}
// Source data and hash-table entry match of suitable length.
const uint8_t* const ip0 = ip;
size_t lc = (size_t) ( ip - ipa );
if( LZAV_UNLIKELY( lc != 0 ))
{
// Try to consume literals by finding a match at back-position.
size_t ml = ( mle > d ? d : mle );
ml = ( ml > mlen ? mlen : ml );
ml -= rc;
const size_t wpo = (size_t) ( wp - (const uint8_t*) src );
if( LZAV_LIKELY( ml > lc ))
{
ml = lc;
}
if( LZAV_UNLIKELY( ml > wpo ))
{
ml = wpo;
}
const size_t bmc = lzav_match_len_r( ip, wp, ml );
if( LZAV_UNLIKELY( bmc != 0 ))
{
rc += bmc;
ip -= bmc;
lc -= bmc;
}
}
if( prc == 0 )
{
// Save match for a later comparison.
prc = rc;
pd = d;
pip = ip;
ip = ip0 + 1;
continue;
}
// Block size overhead estimation, and comparison with a previously
// found match.
const size_t plc = (size_t) ( pip - ipa );
const int lb = ( lc != 0 );
const int sh0 = 10 + csh;
const int sh = sh0 + lb * 2;
const size_t ov = lc + (size_t) lb + ( lc > 15 ) + 2 +
( d >= ( (size_t) 1 << sh )) +
( d >= ( (size_t) 1 << ( sh + 8 )));
const int plb = ( plc != 0 );
const int psh = sh0 + plb * 2;
const size_t pov = plc + (size_t) plb + ( plc > 15 ) + 2 +
( pd >= ( (size_t) 1 << psh )) +
( pd >= ( (size_t) 1 << ( psh + 8 )));
if( LZAV_LIKELY( prc * ov > rc * pov ))
{
const uint8_t* const nipa = pip + prc;
if( LZAV_UNLIKELY( nipa <= ip ))
{
// A winning previous match does not overlap a current match.
op = lzav_write_blk_2( op, plc, prc, pd, ipa, &cbp, &csh,
mref );
ipa = nipa;
prc = rc;
pd = d;
pip = ip;
ip = ip0 + 1;
continue;
}
rc = prc;
d = pd;
ip = pip;
lc = plc;
}
op = lzav_write_blk_2( op, lc, rc, d, ipa, &cbp, &csh, mref );
ip += rc;
ipa = ip;
prc = 0;
}
if( prc != 0 )
{
op = lzav_write_blk_2( op, (size_t) ( pip - ipa ), prc, pd, ipa, &cbp,
&csh, mref );
ipa = pip + prc;
}
free( ht );
return( (int) ( lzav_write_fin_2( op, (size_t) ( ipe - ipa +
LZAV_LIT_FIN ), ipa ) - (uint8_t*) dst ));
}
/**
* @def LZAV_LOAD32( a )
* @brief Defines `bv` and loads 32-bit unsigned value from memory, with
* endianness-correction.
*
* @param a Memory address.
*/
/**
* @def LZAV_SET_IPD_CV( x, v, sh )
* @brief Defines `ipd` as pointer to back-reference, checks bounds,
* updates carry bit variables.
*
* @param x Reference offset.
* @param v Next `cv` value.
* @param sh Next `csh` value.
*/
/**
* @def LZAV_SET_IPD( x )
* @brief Defines `ipd` as pointer to back-reference, checks bounds,
* resets carry bit variables.
*
* @param x Reference offset.
*/
/**
* @brief Internal LZAV decompression function (stream format 2).
*
* Function decompresses "raw" data previously compressed into the LZAV stream
* format 2.
*
* This function should not be called directly since it does not check the
* format identifier.
*
* @param[in] src Source (compressed) data pointer.
* @param[out] dst Destination (decompressed data) buffer pointer.
* @param srcl Source data length, in bytes.
* @param dstl Expected destination data length, in bytes.
* @param[out] pwl Pointer to variable that receives the number of bytes
* written to the destination buffer (until error or end of buffer).
* @return The length of decompressed data, in bytes, or any negative value if
* some error happened.
*/
static inline int lzav_decompress_2( const void* const src, void* const dst,
const int srcl, const int dstl, int* const pwl ) LZAV_NOEX
{
const uint8_t* ip = (const uint8_t*) src; // Compressed data pointer.
const uint8_t* const ipe = ip + srcl; // Compressed data boundary pointer.
const uint8_t* const ipet = ipe - 6; // Block header read threshold.
uint8_t* op = (uint8_t*) dst; // Destination (decompressed data) pointer.
uint8_t* const ope = op + dstl; // Destination boundary pointer.
uint8_t* const opet = ope - 63; // Threshold for fast copy to destination.
// Optimistically report the full expected length; error paths below
// overwrite `*pwl` with the actual number of bytes written.
*pwl = dstl;
const size_t mref1 = (size_t) ( *ip & 15 ) - 1; // Minimal ref length - 1.
size_t bh; // Current block header, updated in each branch.
size_t cv = 0; // Reference offset carry value.
int csh = 0; // Reference offset carry shift.
#define LZAV_LOAD32( a ) \
uint32_t bv; \
memcpy( &bv, a, 4 ); \
LZAV_IEC32( bv )
#define LZAV_SET_IPD_CV( x, v, sh ) \
const size_t d = ( x ) << csh | cv; \
csh = ( sh ); \
const size_t md = (size_t) ( op - (uint8_t*) dst ); \
cv = ( v ); \
ipd = op - d; \
if( LZAV_UNLIKELY( d > md )) \
goto _err_refoob
#define LZAV_SET_IPD( x ) \
LZAV_SET_IPD_CV( x, 0, 0 )
ip++; // Advance beyond prefix byte.
if( LZAV_UNLIKELY( ip >= ipet ))
{
goto _err_srcoob;
}
bh = *ip;
// Main decoding loop: each iteration consumes one block; `bh` always
// holds the header byte of the block about to be decoded.
while( LZAV_LIKELY( ip < ipet ))
{
const uint8_t* ipd; // Source data pointer.
size_t cc; // Byte copy count.
size_t bt; // Block type.
if( LZAV_UNLIKELY(( bh & 0x30 ) == 0 )) // Block type 0.
{
size_t ncv = bh >> 6; // Additional offset carry bits.
ip++;
cc = bh & 15;
if( LZAV_LIKELY( cc != 0 )) // True, if no additional length byte.
{
ipd = ip;
ncv <<= csh;
ip += cc;
if( LZAV_LIKELY(( op < opet ) & ( ipd < ipe - 22 ))) // 15+6+1
{
// Commit the 2 carry bits contributed by this literal
// block; the fixed 16-byte copy may overshoot, while `op`
// advances by the true count `cc`.
cv |= ncv;
bh = *ip;
csh += 2;
memcpy( op, ipd, 16 );
op += cc;
goto _refblk; // Reference block follows, if not EOS.
}
}
else
{
bh = *ip;
ncv <<= csh;
cc = bh & 0x7F;
ip++;
// Extended literal length: up to 4 additional 7-bit
// continuation bytes.
if( LZAV_UNLIKELY(( bh & 0x80 ) != 0 ))
{
int sh = 7;
do
{
bh = *ip;
ip++;
cc |= ( bh & 0x7F ) << sh;
if( sh == 28 ) // No more than 4 additional bytes.
{
break;
}
sh += 7;
} while(( bh & 0x80 ) != 0 );
cc &= 0x7FFFFFFF; // For malformed data.
}
cc += 16;
ipd = ip;
ip += cc;
uint8_t* const opcc = op + cc;
#if defined( LZAV_PTR32 )
// With 32-bit pointers a huge `cc` can wrap the address space.
if( LZAV_UNLIKELY(( ip < ipd ) | ( opcc < op )))
{
goto _err_ptrovr;
}
#endif // defined( LZAV_PTR32 )
if( LZAV_LIKELY(( opcc < opet ) & ( ip < ipe - 70 ))) // 63+6+1
{
// Fast path: copy in 64-byte strides; may overshoot, then
// `op` is clamped to the exact end `opcc`.
do
{
memcpy( op, ipd, 16 );
memcpy( op + 16, ipd + 16, 16 );
memcpy( op + 32, ipd + 32, 16 );
memcpy( op + 48, ipd + 48, 16 );
op += 64;
ipd += 64;
} while( op < opcc );
cv |= ncv;
bh = *ip;
csh += 2;
op = opcc;
goto _refblk; // Reference block follows, if not EOS.
}
}
// Slow path: literals close to either buffer's end, with full
// bounds checking.
uint8_t* const opcc = op + cc;
if( LZAV_UNLIKELY( opcc > ope ))
{
if( LZAV_UNLIKELY( ip > ipe ))
{
goto _err_srcoob_lit;
}
goto _err_dstoob_lit;
}
if( LZAV_LIKELY( ip < ipe ))
{
cv |= ncv;
bh = *ip;
csh += 2;
memcpy( op, ipd, cc );
op = opcc;
continue;
}
if( LZAV_UNLIKELY( ip != ipe ))
{
goto _err_srcoob_lit;
}
// Exact end of stream was reached after these literals.
memcpy( op, ipd, cc );
op = opcc;
break;
_err_srcoob_lit:
// Salvage as many literal bytes as safely possible before
// reporting the source overrun; `*pwl` reflects the salvage.
cc = (size_t) ( ipe - ipd );
if( cc < (size_t) ( ope - op ))
{
memcpy( op, ipd, cc );
*pwl = (int) ( op + cc - (uint8_t*) dst );
}
else
{
memcpy( op, ipd, (size_t) ( ope - op ));
}
return( LZAV_E_SRCOOB );
_err_dstoob_lit:
memcpy( op, ipd, (size_t) ( ope - op ));
return( LZAV_E_DSTOOB );
}
_refblk:
// Reference block: the type value (1-3) equals the number of extra
// offset bytes consumed below (`ip += bt`).
bt = ( bh >> 4 ) & 3;
ip++;
const int bt8 = (int) ( bt << 3 );
#if defined( LZAV_X86 )
static const uint32_t om[ 4 ] = { 0, 0xFF, 0xFFFF, 0xFFFFFF };
static const int ocsh[ 4 ] = { 0, 0, 0, 3 };
LZAV_LOAD32( ip );
ip += bt;
const uint32_t o = bv & om[ bt ];
bv >>= bt8;
const int wcsh = ocsh[ bt ];
LZAV_SET_IPD_CV( bh >> 6 | ( o & 0x1FFFFF ) << 2, o >> 21, wcsh );
#else // defined( LZAV_X86 )
// Memory accesses on RISC are less efficient here.
LZAV_LOAD32( ip );
const uint32_t om = ( (uint32_t) 1 << bt8 ) - 1;
ip += bt;
const size_t o = bv & om;
bv >>= bt8;
LZAV_SET_IPD_CV( bh >> 6 | ( o & 0x1FFFFF ) << 2, o >> 21,
( bt == 3 ? 3 : 0 ));
#endif // defined( LZAV_X86 )
cc = bh & 15;
if( LZAV_LIKELY( cc != 0 )) // True, if no additional length byte.
{
cc += mref1;
bh = bv & 0xFF;
if( LZAV_LIKELY( op < opet ))
{
// Fixed-size copies; the d>15/d>7/d>3 ladder keeps each copy's
// stride within the reference distance for valid streams.
if( LZAV_LIKELY( d > 15 ))
{
memcpy( op, ipd, 16 );
memcpy( op + 16, ipd + 16, 4 );
op += cc;
continue;
}
if( LZAV_LIKELY( d > 7 ))
{
memcpy( op, ipd, 8 );
memcpy( op + 8, ipd + 8, 8 );
op += cc;
continue;
}
if( d > 3 )
{
memcpy( op, ipd, 4 );
memcpy( op + 4, ipd + 4, 4 );
op += cc;
continue;
}
goto _err_refoob;
}
if( LZAV_UNLIKELY( cc > d ))
{
goto _err_refoob;
}
uint8_t* const opcc = op + cc;
if( LZAV_UNLIKELY( opcc > ope ))
{
goto _err_dstoob_ref;
}
memcpy( op, ipd, cc );
op = opcc;
continue;
}
else
{
// Extended reference length: one additional byte, plus another
// when the first one saturates at 255.
bh = bv & 0xFF;
ip++;
cc = 16 + mref1 + bh;
if( LZAV_UNLIKELY( bh == 255 ))
{
cc += *ip;
ip++;
}
uint8_t* const opcc = op + cc;
bh = *ip;
if( LZAV_LIKELY(( opcc < opet ) & ( d > 15 )))
{
// Fast 64-byte strides (requires d > 15 so the 16-byte
// chunks do not overlap the bytes still being written).
do
{
memcpy( op, ipd, 16 );
memcpy( op + 16, ipd + 16, 16 );
memcpy( op + 32, ipd + 32, 16 );
memcpy( op + 48, ipd + 48, 16 );
op += 64;
ipd += 64;
} while( op < opcc );
op = opcc;
continue;
}
if( LZAV_UNLIKELY( cc > d ))
{
goto _err_refoob;
}
if( LZAV_UNLIKELY( opcc > ope ))
{
goto _err_dstoob_ref;
}
memcpy( op, ipd, cc );
op = opcc;
continue;
}
_err_dstoob_ref:
memcpy( op, ipd, (size_t) ( ope - op ));
return( LZAV_E_DSTOOB );
}
// Successful decode requires the output to land exactly on `ope`.
if( LZAV_UNLIKELY( op != ope ))
{
goto _err_dstlen;
}
return( (int) ( op - (uint8_t*) dst ));
_err_srcoob:
*pwl = (int) ( op - (uint8_t*) dst );
return( LZAV_E_SRCOOB );
_err_refoob:
*pwl = (int) ( op - (uint8_t*) dst );
return( LZAV_E_REFOOB );
_err_dstlen:
*pwl = (int) ( op - (uint8_t*) dst );
return( LZAV_E_DSTLEN );
#if defined( LZAV_PTR32 )
_err_ptrovr:
*pwl = (int) ( op - (uint8_t*) dst );
return( LZAV_E_PTROVR );
#endif // defined( LZAV_PTR32 )
}
#if LZAV_FMT_MIN < 2
/**
* @def LZAV_LOAD16( a )
* @brief Defines `bv` and loads 16-bit unsigned value from memory, with
* endianness-correction.
*
* @param a Memory address.
*/
/**
* @def LZAV_MEMMOVE( d, s, c )
* @brief Stack-based `memmove` function which gets optimized into SIMD
* instructions.
*
* @param d Destination address.
* @param s Source address.
* @param c Byte copy count (must be a constant).
*/
/**
* @brief Internal LZAV decompression function (stream format 1).
*
* Function decompresses "raw" data previously compressed into the LZAV stream
* format 1.
*
* This function should not be called directly since it does not check the
* format identifier.
*
* @param[in] src Source (compressed) data pointer.
* @param[out] dst Destination (decompressed data) buffer pointer.
* @param srcl Source data length, in bytes.
* @param dstl Expected destination data length, in bytes.
* @return The length of decompressed data, in bytes, or any negative value if
* some error happened.
*/
static inline int lzav_decompress_1( const void* const src, void* const dst,
const int srcl, const int dstl ) LZAV_NOEX
{
const uint8_t* ip = (const uint8_t*) src; // Compressed data pointer.
const uint8_t* const ipe = ip + srcl; // Compressed data boundary pointer.
const uint8_t* const ipet = ipe - 5; // Block header read threshold.
uint8_t* op = (uint8_t*) dst; // Destination (decompressed data) pointer.
uint8_t* const ope = op + dstl; // Destination boundary pointer.
uint8_t* const opet = ope - 63; // Threshold for fast copy to destination.
const size_t mref1 = (size_t) ( *ip & 15 ) - 1; // Minimal ref length - 1.
size_t bh = 0; // Current block header, updated in each branch.
size_t cv = 0; // Reference offset carry value.
int csh = 0; // Reference offset carry shift.
#if LZAV_LITTLE_ENDIAN
#define LZAV_LOAD16( a ) \
uint16_t bv; \
memcpy( &bv, a, 2 )
#else // LZAV_LITTLE_ENDIAN
#define LZAV_LOAD16( a ) \
uint16_t bv = (uint16_t) ( *( a ) | *( a + 1 ) << 8 )
#endif // LZAV_LITTLE_ENDIAN
#define LZAV_MEMMOVE( d, s, c ) \
{ uint8_t tmp[ c ]; memcpy( tmp, s, c ); memcpy( d, tmp, c ); } (void) 0
ip++; // Advance beyond prefix byte.
if( LZAV_UNLIKELY( ip >= ipet ))
{
goto _err_srcoob;
}
bh = *ip;
// Main decoding loop; `bh` always holds the header byte of the block
// about to be decoded.
while( LZAV_LIKELY( ip < ipet ))
{
const uint8_t* ipd; // Source data pointer.
size_t cc; // Byte copy count.
if( LZAV_UNLIKELY(( bh & 0x30 ) == 0 )) // Block type 0.
{
// Format 1 resets (does not accumulate) the 2 carry bits on each
// literal block.
cv = bh >> 6;
csh = 2;
ip++;
cc = bh & 15;
if( LZAV_LIKELY( cc != 0 )) // True, if no additional length byte.
{
ipd = ip;
ip += cc;
if( LZAV_LIKELY(( op < opet ) & ( ipd < ipe - 15 - 6 )))
{
// Fixed 16-byte copy may overshoot; `op` advances by the
// true count `cc`.
bh = *ip;
memcpy( op, ipd, 16 );
op += cc;
goto _refblk; // Reference block follows, if not EOS.
}
}
else
{
// Extended literal length: one additional byte, plus another
// when the first one saturates at 255.
LZAV_LOAD16( ip );
const size_t l2 = (size_t) ( bv & 0xFF );
cc = 16;
ip++;
const int lb = ( l2 == 255 );
cc += l2 + (( bv >> 8 ) & ( 0x100 - lb ));
ip += lb;
ipd = ip;
ip += cc;
if( LZAV_LIKELY(( op < opet ) & ( ipd < ipe - 63 - 1 )))
{
// Fast path: copy 64 bytes up-front; longer runs fall
// through to the byte-copy tail below with 64 consumed.
memcpy( op, ipd, 16 );
memcpy( op + 16, ipd + 16, 16 );
memcpy( op + 32, ipd + 32, 16 );
memcpy( op + 48, ipd + 48, 16 );
if( LZAV_LIKELY( cc < 65 ))
{
bh = *ip;
op += cc;
continue;
}
ipd += 64;
op += 64;
cc -= 64;
}
}
if( LZAV_LIKELY( ip < ipe ))
{
bh = *ip;
}
else
if( LZAV_UNLIKELY( ip != ipe ))
{
goto _err_srcoob;
}
if( LZAV_UNLIKELY( op + cc > ope ))
{
goto _err_dstoob;
}
// This and other alike copy-blocks are transformed into fast SIMD
// instructions, by a modern compiler. Direct use of `memcpy` is
// slower due to shortness of data remaining to copy, on average.
while( cc != 0 )
{
*op = *ipd;
ipd++;
op++;
cc--;
}
continue;
}
_refblk:
// Reference block: decode the back-reference offset; block types
// 1/2/3 carry 1/2/3 extra offset bytes respectively.
cc = bh & 15;
if( LZAV_UNLIKELY(( bh & 32 ) == 0 )) // True, if block type 1.
{
LZAV_SET_IPD( bh >> 6 | (size_t) ip[ 1 ] << 2 );
ip += 2;
bh = *ip;
}
else // Block type 2 or 3.
{
if( LZAV_LIKELY(( bh & 16 ) == 0 )) // True, if block type 2.
{
LZAV_LOAD16( ip + 1 );
LZAV_SET_IPD( bh >> 6 | (size_t) bv << 2 );
ip += 3;
bh = *ip;
}
else // Block type 3.
{
// Type 3 also supplies the next 2 carry bits (`bh >> 6`).
LZAV_LOAD32( ip + 1 );
LZAV_SET_IPD_CV( bv & 0xFFFFFF, bh >> 6, 2 );
ip += 4;
bh = bv >> 24;
}
}
if( LZAV_LIKELY( cc != 0 )) // True, if no additional length byte.
{
cc += mref1;
if( LZAV_LIKELY( op < opet ))
{
// LZAV_MEMMOVE copies via a stack buffer, so overlapping
// back-references replicate correctly.
LZAV_MEMMOVE( op, ipd, 16 );
LZAV_MEMMOVE( op + 16, ipd + 16, 4 );
op += cc;
continue;
}
}
else
{
// Extended reference length byte follows the offset bytes.
cc = 16 + mref1 + bh;
ip++;
bh = *ip;
if( LZAV_LIKELY( op < opet ))
{
LZAV_MEMMOVE( op, ipd, 16 );
LZAV_MEMMOVE( op + 16, ipd + 16, 16 );
LZAV_MEMMOVE( op + 32, ipd + 32, 16 );
LZAV_MEMMOVE( op + 48, ipd + 48, 16 );
if( LZAV_LIKELY( cc < 65 ))
{
op += cc;
continue;
}
ipd += 64;
op += 64;
cc -= 64;
}
}
if( LZAV_UNLIKELY( op + cc > ope ))
{
goto _err_dstoob;
}
// Byte-wise tail copy; also handles short-distance overlap.
while( cc != 0 )
{
*op = *ipd;
ipd++;
op++;
cc--;
}
}
// Successful decode requires the output to land exactly on `ope`.
if( LZAV_UNLIKELY( op != ope ))
{
goto _err_dstlen;
}
return( (int) ( op - (uint8_t*) dst ));
_err_srcoob:
return( LZAV_E_SRCOOB );
_err_dstoob:
return( LZAV_E_DSTOOB );
_err_refoob:
return( LZAV_E_REFOOB );
_err_dstlen:
return( LZAV_E_DSTLEN );
}
#undef LZAV_LOAD16
#undef LZAV_MEMMOVE
#endif // LZAV_FMT_MIN < 2
#undef LZAV_LOAD32
#undef LZAV_SET_IPD_CV
#undef LZAV_SET_IPD
/**
* @brief LZAV decompression function (partial).
*
* Function decompresses "raw" data previously compressed into the LZAV stream
* format, for partial or recovery decompression. For example, this function
* can be used to decompress only an initial segment of a larger data block.
*
* @param[in] src Source (compressed) data pointer, can be 0 if `srcl` is 0.
* Address alignment is unimportant.
* @param[out] dst Destination (decompressed data) buffer pointer. Address
* alignment is unimportant. Should be different from `src`.
* @param srcl Source data length, in bytes, can be 0.
* @param dstl Destination buffer length, in bytes, can be 0.
* @return The length of decompressed data, in bytes. Always a non-negative
* value (error codes are not returned).
*/
static inline int lzav_decompress_partial( const void* const src,
    void* const dst, const int srcl, const int dstl ) LZAV_NOEX
{
    int written = 0; // Bytes decompressed so far; 0 on any invalid input.

    // All argument checks folded into a single positive guard; only the
    // format-2 decompressor supports partial output reporting.
    if( src != LZAV_NULL && dst != LZAV_NULL && src != dst &&
        srcl > 0 && dstl > 0 )
    {
        if(( *(const uint8_t*) src >> 4 ) == 2 )
        {
            lzav_decompress_2( src, dst, srcl, dstl, &written );
        }
    }

    return( written );
}
/**
* @brief LZAV decompression function.
*
* Function decompresses "raw" data previously compressed into the LZAV stream
* format.
*
* Note that while the function does perform checks to avoid OOB memory
* accesses, and checks for decompressed data length equality, this is not a
* strict guarantee of a valid decompression. In cases when the compressed
* data is stored in a long-term storage without embedded data integrity
* mechanisms (e.g., a database without RAID 1 guarantee, a binary container
* without a digital signature nor CRC), then a checksum (hash) of the
* original uncompressed data should be stored, and then evaluated against
* that of the decompressed data. Also, a separate checksum (hash) of
* application-defined header, which contains uncompressed and compressed data
* lengths, should be checked before decompression. A high-performance
* "komihash" hash function can be used to obtain a hash value of the data.
*
* @param[in] src Source (compressed) data pointer, can be 0 if `srcl` is 0.
* Address alignment is unimportant.
* @param[out] dst Destination (decompressed data) buffer pointer. Address
* alignment is unimportant. Should be different from `src`.
* @param srcl Source data length, in bytes, can be 0.
* @param dstl Expected destination data length, in bytes, can be 0. Should
* not be confused with the actual size of the destination buffer (which may
* be larger).
* @return The length of decompressed data, in bytes, or any negative value if
* some error happened. Always returns a negative value if the resulting
* decompressed data length differs from `dstl`. This means that error result
* handling requires just a check for a negative return value (see the
* LZAV_ERROR enum for possible values).
*/
static inline int lzav_decompress( const void* const src, void* const dst,
    const int srcl, const int dstl ) LZAV_NOEX
{
    // Parameter validation: an empty source is only valid together with an
    // empty destination; everything else with srcl <= 0 is an error.
    if( srcl <= 0 )
    {
        return( srcl == 0 && dstl == 0 ? 0 : LZAV_E_PARAMS );
    }

    if( src == LZAV_NULL || dst == LZAV_NULL || src == dst || dstl <= 0 )
    {
        return( LZAV_E_PARAMS );
    }

    // Dispatch on the stream format id stored in the prefix byte's high
    // nibble.
    switch( *(const uint8_t*) src >> 4 )
    {
        case 2 :
        {
            int wl; // Unused partial-write count required by the callee.
            return( lzav_decompress_2( src, dst, srcl, dstl, &wl ));
        }
    #if LZAV_FMT_MIN < 2
        case 1 :
        {
            return( lzav_decompress_1( src, dst, srcl, dstl ));
        }
    #endif // LZAV_FMT_MIN < 2
        default :
        {
            return( LZAV_E_UNKFMT );
        }
    }
}
#if defined( LZAV_NS )
} // namespace LZAV_NS
#if !defined( LZAV_NS_CUSTOM )
namespace {
using namespace LZAV_NS :: enum_wrapper;
using LZAV_NS :: lzav_compress_bound;
using LZAV_NS :: lzav_compress_bound_hi;
using LZAV_NS :: lzav_compress;
using LZAV_NS :: lzav_compress_default;
using LZAV_NS :: lzav_compress_hi;
using LZAV_NS :: lzav_decompress_partial;
using LZAV_NS :: lzav_decompress;
} // namespace
#endif // !defined( LZAV_NS_CUSTOM )
#endif // defined( LZAV_NS )
// Defines for Doxygen.
#if !defined( LZAV_NS_CUSTOM )
#define LZAV_NS_CUSTOM
#endif // !defined( LZAV_NS_CUSTOM )
#undef LZAV_NS_CUSTOM
#undef LZAV_NOEX
#undef LZAV_NULL
#undef LZAV_X86
#undef LZAV_GCC_BUILTINS
#undef LZAV_IEC32
#undef LZAV_LIKELY
#undef LZAV_UNLIKELY
#undef LZAV_HASH_C1
#undef LZAV_HASH_C2
#endif // LZAV_INCLUDED