From 3ee12a6c26edb6df61a304e7d6f2ccba75e9b199 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20Heres?=
Date: Sun, 15 Mar 2026 13:36:40 +0100
Subject: [PATCH] Speed up NEON match_empty, match_empty_or_deleted, and match_full using SWAR

Convert the uint8x8_t directly to a u64 and use scalar bit operations
instead of NEON comparisons. This allows LLVM to eliminate the vector
load entirely and correctly handles big-endian AArch64.

Co-authored-by: Claude Opus 4.6 (1M context)
---
 src/control/group/neon.rs | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/control/group/neon.rs b/src/control/group/neon.rs
index c64b89169..e2383edf6 100644
--- a/src/control/group/neon.rs
+++ b/src/control/group/neon.rs
@@ -76,7 +76,10 @@ impl Group {
     /// `EMPTY`.
     #[inline]
     pub(crate) fn match_empty(self) -> BitMask {
-        self.match_tag(Tag::EMPTY)
+        unsafe {
+            let ctrl = neon::vget_lane_u64(neon::vreinterpret_u64_u8(self.0), 0);
+            BitMask(ctrl & (ctrl << 1) & BITMASK_ITER_MASK)
+        }
     }
 
     /// Returns a `BitMask` indicating all tags in the group which are
@@ -84,8 +87,8 @@ impl Group {
     #[inline]
     pub(crate) fn match_empty_or_deleted(self) -> BitMask {
         unsafe {
-            let cmp = neon::vcltz_s8(neon::vreinterpret_s8_u8(self.0));
-            BitMask(neon::vget_lane_u64(neon::vreinterpret_u64_u8(cmp), 0))
+            let ctrl = neon::vget_lane_u64(neon::vreinterpret_u64_u8(self.0), 0);
+            BitMask(ctrl & BITMASK_ITER_MASK)
         }
     }
 
@@ -93,8 +96,8 @@ impl Group {
     #[inline]
     pub(crate) fn match_full(self) -> BitMask {
         unsafe {
-            let cmp = neon::vcgez_s8(neon::vreinterpret_s8_u8(self.0));
-            BitMask(neon::vget_lane_u64(neon::vreinterpret_u64_u8(cmp), 0))
+            let ctrl = neon::vget_lane_u64(neon::vreinterpret_u64_u8(self.0), 0);
+            BitMask(!ctrl & BITMASK_ITER_MASK)
         }
     }