[librptexture] rp_image_ops_neon.cpp: Added a NEON-optimized version of rp_image::swizzle().

Note that NEON's vtbl instructions doesn't support the bit 7 "zero it" feature that x86's pshufb supports, so we have to use an AND mask in addition to an OR mask. Tested using luminance_alpha_reference_u.ktx2, which has a swizzle value 'rrrg'. (Also used this for testing the previous commit, which switched some mask stuff from 8-bit operations to 32-bit.)
2025-06-18 11:35:38 -04:00 · 2025-04-17 01:27:12 -04:00 · 2025-04-17 01:27:12 -04:00 · 06a86667b3
commit 06a86667b3
parent 1ed88d90e1
4 changed files with 317 additions and 3 deletions
--- a/NEWS.md
+++ b/NEWS.md
@ -89,6 +89,7 @@
  * Added NEON-optimized functions:
    * Array byteswap (16-bit and 32-bit)
    * Linear image decoding (32-bit color)
+    * Image swizzling (for e.g. KTX2)
    * Tested on Android. (arm64)
    * Compile-tested on MSVC. (arm32, arm64)
  * Switched XML parsers from TinyXML2 to PugiXML.
--- a/src/librptexture/CMakeLists.txt
+++ b/src/librptexture/CMakeLists.txt
@ -200,7 +200,10 @@ IF(CPU_i386 OR CPU_amd64)
 		)
 ELSEIF(CPU_arm OR CPU_arm64)
 	IF(HAVE_ARM_NEON_H)
-		SET(${PROJECT_NAME}_CPU_SRCS decoder/ImageDecoder_Linear_neon.cpp)
+		SET(${PROJECT_NAME}_CPU_SRCS
+			img/rp_image_ops_neon.cpp
+			decoder/ImageDecoder_Linear_neon.cpp
+			)
 	ENDIF(HAVE_ARM_NEON_H)
 ENDIF()
 UNSET(arch)
--- a/src/librptexture/img/rp_image.hpp
+++ b/src/librptexture/img/rp_image.hpp
@ -8,6 +8,8 @@

 #pragma once

+#include "librptexture/config.librptexture.h"
+
 #include "common.h"
 #include "dll-macros.h"

@ -25,10 +27,20 @@
 #  define RP_IMAGE_HAS_SSE2 1
 #  define RP_IMAGE_HAS_SSSE3 1
 #  define RP_IMAGE_HAS_SSE41 1
+#elif defined(HAVE_ARM_NEON_H)
+#  if defined(RP_CPU_ARM) || defined(RP_CPU_ARM64)
+#    include "librpcpuid/cpuflags_arm.h"
+#    define RP_IMAGE_HAS_NEON 1
+#  endif
 #endif
 #ifdef RP_CPU_AMD64
 #  define RP_IMAGE_ALWAYS_HAS_SSE2 1
 #endif
+#ifdef HAVE_ARM_NEON_H
+#  ifdef RP_CPU_ARM64
+#    define RP_IMAGE_ALWAYS_HAS_NEON 1
+#  endif
+#endif

 #include "../argb32_t.hpp"

@ -512,6 +524,17 @@ class rp_image
 		int swizzle_ssse3(const char *swz_spec);
 #endif /* RP_IMAGE_HAS_SSSE3 */

+#ifdef RP_IMAGE_HAS_NEON
+		/**
+		 * Swizzle the image channels.
+		 * NEON-optimized version.
+		 *
+		 * @param swz_spec Swizzle specification: [rgba01]{4} [matches KTX2]
+		 * @return 0 on success; negative POSIX error code on error.
+		 */
+		int swizzle_neon(const char *swz_spec);
+#endif /* RP_IMAGE_HAS_NEON */
+
 		/**
 		 * Swizzle the image channels.
 		 *
@ -597,14 +620,23 @@ inline int rp_image::apply_chroma_key(uint32_t key)
 */
 inline int rp_image::swizzle(const char *swz_spec)
 {
-#if defined(RP_IMAGE_HAS_SSSE3)
+#ifdef RP_IMAGE_ALWAYS_HAS_NEON
+	return swizzle_neon(swz_spec);
+#else
+#  if defined(RP_IMAGE_HAS_SSSE3)
 	if (RP_CPU_x86_HasSSSE3()) {
 		return swizzle_ssse3(swz_spec);
 	} else
-#endif /* RP_IMAGE_HAS_SSSE3 */
+#  endif /* RP_IMAGE_HAS_SSSE3 */
+#  ifdef RP_IMAGE_HAS_NEON
+	if (RP_CPU_arm_HasNEON()) {
+		return swizzle_neon(swz_spec);
+	} else
+#  endif /* IMAGEDECODER_HAS_NEON */
 	{
 		return swizzle_cpp(swz_spec);
 	}
+#endif
 }

 }
--- a/src/librptexture/img/rp_image_ops_neon.cpp
+++ b/src/librptexture/img/rp_image_ops_neon.cpp
@ -0,0 +1,278 @@
+/***************************************************************************
+ * ROM Properties Page shell extension. (librptexture)                     *
+ * rp_image_ops.cpp: Image class. (operations)                             *
+ * NEON-optimized version.                                                 *
+ *                                                                         *
+ * Copyright (c) 2016-2025 by David Korth.                                 *
+ * SPDX-License-Identifier: GPL-2.0-or-later                               *
+ ***************************************************************************/
+
+#include "stdafx.h"
+#include "rp_image.hpp"
+#include "rp_image_p.hpp"
+#include "rp_image_backend.hpp"
+
+// C++ STL classes
+using std::array;
+
+// ARM NEON intrinsics
+#include <arm_neon.h>
+
+// NOTE: vtbl only supports 64-bit vectors on ARMv7 (32-bit).
+// TODO: Consolidate this with ImageDecoder_Linear_neon.cpp?
+#if defined(RP_CPU_ARM64)
+//static constexpr size_t VEC_LEN_U8 = 16;
+static constexpr size_t VEC_LEN_U32 = 4;
+typedef uint8x16_t uint8xVTBL_t;
+typedef uint32x4_t uint32xVTBL_t;
+#define vld1VTBL_u8  vld1q_u8
+#define vld1VTBL_u32 vld1q_u32
+#define vst1VTBL_u8  vst1q_u8
+#define vst1VTBL_u32 vst1q_u32
+#elif defined(RP_CPU_ARM)
+//static constexpr size_t VEC_LEN_U8 = 8;
+static constexpr size_t VEC_LEN_U32 = 2;
+typedef uint8x8_t uint8xVTBL_t;
+typedef uint32x2_t uint32xVTBL_t;
+#define vld1VTBL_u8  vld1_u8
+#define vld1VTBL_u32 vld1_u32
+#define vst1VTBL_u8  vst1_u8
+#define vst1VTBL_u32 vst1_u32
+#else
+#  error Unsupported CPU?
+#endif
+
+// Workaround for RP_D() expecting the no-underscore, UpperCamelCase naming convention.
+#define rp_imagePrivate rp_image_private
+
+namespace LibRpTexture {
+
+/** Image operations **/
+
+/**
+ * Swizzle the image channels.
+ * SSSE3-optimized version.
+ *
+ * @param swz_spec Swizzle specification: [rgba01]{4} [matches KTX2]
+ * @return 0 on success; negative POSIX error code on error.
+ */
+int rp_image::swizzle_neon(const char *swz_spec)
+{
+	RP_D(rp_image);
+	rp_image_backend *const backend = d->backend.get();
+	assert(backend->format == rp_image::Format::ARGB32);
+	if (backend->format != rp_image::Format::ARGB32) {
+		// ARGB32 is required.
+		// TODO: Automatically convert the image?
+		return -EINVAL;
+	}
+
+	// TODO: Verify swz_spec.
+	typedef union _u8_32 {
+		uint8_t u8[4];
+		uint32_t u32;
+	} u8_32;
+	u8_32 swz_ch;
+	memcpy(&swz_ch, swz_spec, sizeof(swz_ch));
+	if (swz_ch.u32 == 'rgba') {
+		// 'rgba' == NULL swizzle. Don't bother doing anything.
+		return 0;
+	}
+
+	// NOTE: Texture uses ARGB format, but swizzle uses rgba.
+	// Rotate swz_ch to convert it to argb.
+	// The entire thing needs to be byteswapped to match the internal order, too.
+	// TODO: Verify on big-endian.
+	swz_ch.u32 = (swz_ch.u32 >> 24) | (swz_ch.u32 << 8);
+	swz_ch.u32 = be32_to_cpu(swz_ch.u32);
+
+	// Determine the pshufb mask.
+	// This can be used for [rgba0].
+	// For 1, we'll need a separate "por" mask.
+	// NOTE: NEON doesn't support bit 7 for "zero the byte", so we'll need to
+	// use a separate "pand" mask as well.
+	u8_32 pshufb_mask_vals;
+	u8_32 pand_mask_vals;
+	u8_32 por_mask_vals;
+#define SET_MASK_VALS(n, shuf, pand, por) do { \
+		pshufb_mask_vals.u8[n] = (shuf); \
+		pand_mask_vals.u8[n] = (pand); \
+		por_mask_vals.u8[n] = (por); \
+	} while (0)
+#define SWIZZLE_MASK_VAL(n) do { \
+		switch (swz_ch.u8[n]) { \
+					/*             n   shuf  pand   por */ \
+			case 'b':	SET_MASK_VALS((n),    0, 0xFF, 0x00);	break; \
+			case 'g':	SET_MASK_VALS((n),    1, 0xFF, 0x00);	break; \
+			case 'r':	SET_MASK_VALS((n),    2, 0xFF, 0x00);	break; \
+			case 'a':	SET_MASK_VALS((n),    3, 0xFF, 0x00);	break; \
+			case '0':	SET_MASK_VALS((n), 0xFF, 0x00, 0x00);	break; \
+			case '1':	SET_MASK_VALS((n), 0xFF, 0x00, 0xFF);	break; \
+			default: \
+				assert(!"Invalid swizzle value."); \
+				SET_MASK_VALS((n), 0xFF, 0x00, 0x00); \
+				break; \
+		} \
+	} while (0)
+
+	SWIZZLE_MASK_VAL(0);
+	SWIZZLE_MASK_VAL(1);
+	SWIZZLE_MASK_VAL(2);
+	SWIZZLE_MASK_VAL(3);
+
+	const array<uint32_t, VEC_LEN_U32> pshufb_mask_u32 = {{
+		pshufb_mask_vals.u32,
+		pshufb_mask_vals.u32 + 0x04040404,
+#ifdef RP_CPU_ARM64
+		pshufb_mask_vals.u32 + 0x08080808,
+		pshufb_mask_vals.u32 + 0x0C0C0C0C
+#endif /* RP_CPU_ARM64 */
+	}};
+	const array<uint32_t, VEC_LEN_U32> pand_mask_u32 = {{
+		pand_mask_vals.u32,
+		pand_mask_vals.u32,
+#ifdef RP_CPU_ARM64
+		pand_mask_vals.u32,
+		pand_mask_vals.u32
+#endif /* RP_CPU_ARM64 */
+	}};
+	const array<uint32_t, VEC_LEN_U32> por_mask_u32 = {{
+		por_mask_vals.u32,
+		por_mask_vals.u32,
+#ifdef RP_CPU_ARM64
+		por_mask_vals.u32,
+		por_mask_vals.u32
+#endif /* RP_CPU_ARM64 */
+	}};
+
+	uint32xVTBL_t shuf_mask = vld1VTBL_u32(pshufb_mask_u32.data());
+	uint32xVTBL_t and_mask = vld1VTBL_u32(pand_mask_u32.data());
+	uint32xVTBL_t or_mask = vld1VTBL_u32(por_mask_u32.data());
+
+	// Channel indexes
+	static constexpr unsigned int SWZ_CH_B = 0U;
+	static constexpr unsigned int SWZ_CH_G = 1U;
+	static constexpr unsigned int SWZ_CH_R = 2U;
+	static constexpr unsigned int SWZ_CH_A = 3U;
+
+	uint32_t *bits = static_cast<uint32_t*>(backend->data());
+	const unsigned int stride_diff = (backend->stride - this->row_bytes()) / sizeof(uint32_t);
+	const int width = backend->width;
+	for (int y = backend->height; y > 0; y--) {
+		// Process 16 pixels at a time using NEON.
+		int x;
+		for (x = width; x > 15; x -= 16, bits += 16) {
+#if defined(RP_CPU_ARM64)
+			uint32x4x4_t sa = vld4q_u32(bits);
+
+			sa.val[0] = vqtbl1q_u8(sa.val[0], shuf_mask);
+			sa.val[1] = vqtbl1q_u8(sa.val[1], shuf_mask);
+			sa.val[2] = vqtbl1q_u8(sa.val[2], shuf_mask);
+			sa.val[3] = vqtbl1q_u8(sa.val[3], shuf_mask);
+
+			sa.val[0] = vandq_u32(sa.val[0], and_mask);
+			sa.val[1] = vandq_u32(sa.val[1], and_mask);
+			sa.val[2] = vandq_u32(sa.val[2], and_mask);
+			sa.val[3] = vandq_u32(sa.val[3], and_mask);
+
+			sa.val[0] = vorrq_u32(sa.val[0], or_mask);
+			sa.val[1] = vorrq_u32(sa.val[1], or_mask);
+			sa.val[2] = vorrq_u32(sa.val[2], or_mask);
+			sa.val[3] = vorrq_u32(sa.val[3], or_mask);
+
+			vst4q_u32(bits, sa);
+#elif defined(RP_CPU_ARM)
+			uint32x2x4_t sa = vld4_u32(&bits[0]);
+			uint32x2x4_t sb = vld4_u32(&bits[8]);
+
+			sa.val[0] = vtbl1_u8(sa.val[0], shuf_mask);
+			sa.val[1] = vtbl1_u8(sa.val[1], shuf_mask);
+			sa.val[2] = vtbl1_u8(sa.val[2], shuf_mask);
+			sa.val[3] = vtbl1_u8(sa.val[3], shuf_mask);
+			sb.val[0] = vtbl1_u8(sb.val[0], shuf_mask);
+			sb.val[1] = vtbl1_u8(sb.val[1], shuf_mask);
+			sb.val[2] = vtbl1_u8(sb.val[2], shuf_mask);
+			sb.val[3] = vtbl1_u8(sb.val[3], shuf_mask);
+
+			sa.val[0] = vand_u8(sa.val[0], and_mask);
+			sa.val[1] = vand_u8(sa.val[1], and_mask);
+			sa.val[2] = vand_u8(sa.val[2], and_mask);
+			sa.val[3] = vand_u8(sa.val[3], and_mask);
+			sb.val[0] = vand_u8(sb.val[0], and_mask);
+			sb.val[1] = vand_u8(sb.val[1], and_mask);
+			sb.val[2] = vand_u8(sb.val[2], and_mask);
+			sb.val[3] = vand_u8(sb.val[3], and_mask);
+
+			sa.val[0] = vorr_u8(sa.val[0], or_mask);
+			sa.val[1] = vorr_u8(sa.val[1], or_mask);
+			sa.val[2] = vorr_u8(sa.val[2], or_mask);
+			sa.val[3] = vorr_u8(sa.val[3], or_mask);
+			sb.val[0] = vorr_u8(sb.val[0], or_mask);
+			sb.val[1] = vorr_u8(sb.val[1], or_mask);
+			sb.val[2] = vorr_u8(sb.val[2], or_mask);
+			sb.val[3] = vorr_u8(sb.val[3], or_mask);
+
+			vst4_u32(&bits[0], sa);
+			vst4_u32(&bits[8], sb);
+#endif
+		}
+
+		// Process remaining pixels using regular swizzling
+		for (; x > 0; x--, bits++) {
+			u8_32 cur, swz;
+			cur.u32 = *bits;
+
+		// TODO: Verify on big-endian.
+#define SWIZZLE_CHANNEL(n) do { \
+				switch (swz_ch.u8[n]) { \
+					case 'b':	swz.u8[n] = cur.u8[SWZ_CH_B];	break; \
+					case 'g':	swz.u8[n] = cur.u8[SWZ_CH_G];	break; \
+					case 'r':	swz.u8[n] = cur.u8[SWZ_CH_R];	break; \
+					case 'a':	swz.u8[n] = cur.u8[SWZ_CH_A];	break; \
+					case '0':	swz.u8[n] = 0;			break; \
+					case '1':	swz.u8[n] = 255;		break; \
+					default: \
+						assert(!"Invalid swizzle value."); \
+						swz.u8[n] = 0; \
+						break; \
+				} \
+			} while (0)
+
+			SWIZZLE_CHANNEL(0);
+			SWIZZLE_CHANNEL(1);
+			SWIZZLE_CHANNEL(2);
+			SWIZZLE_CHANNEL(3);
+
+			*bits = swz.u32;
+		}
+
+		// Next row.
+		bits += stride_diff;
+	}
+
+	// Swizzle the sBIT value, if set.
+	if (d->has_sBIT) {
+		// TODO: If gray is set, move its values to rgb?
+		const rp_image::sBIT_t sBIT_old = d->sBIT;
+
+#define SWIZZLE_sBIT(n, ch) do { \
+				switch (swz_ch.u8[n]) { \
+					case 'b':	d->sBIT.ch = sBIT_old.blue;	break; \
+					case 'g':	d->sBIT.ch = sBIT_old.green;	break; \
+					case 'r':	d->sBIT.ch = sBIT_old.red;	break; \
+					case 'a':	d->sBIT.ch = sBIT_old.alpha;	break; \
+					case '0': case '1': \
+							d->sBIT.ch = 1;			break; \
+				} \
+			} while (0)
+
+			SWIZZLE_sBIT(SWZ_CH_B, blue);
+			SWIZZLE_sBIT(SWZ_CH_G, green);
+			SWIZZLE_sBIT(SWZ_CH_R, red);
+			SWIZZLE_sBIT(SWZ_CH_A, alpha);
+	}
+
+	return 0;
+}
+
+}