From 57a4f099934dd5d841ad811f1f69a9e67863690b Mon Sep 17 00:00:00 2001
From: Kim Yang
Date: Mon, 27 Apr 2026 00:55:12 +0800
Subject: [PATCH 1/2] ByteLevel: skip per-byte transform for
 printable-ASCII-only tokens
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The byte → char map produced by `bytes_char()` is the identity on
`0x21..=0x7E` (printable ASCII excluding space): each such byte maps to
the char with the same code point, which encodes back to that exact byte
in UTF-8. So when an entire post-regex token already lives in that range,
the per-byte transform produces an output byte-identical to the input and
rebuilds `alignments` to the same tuples it already had. We can return
early and skip the `Vec<(char, isize)>` build, the `HashMap` lookups, and
the `transform()` rebuild.

The gate is conservative: any byte outside `0x21..=0x7E` (space, tab,
newline, DEL, any Latin-1, any UTF-8 lead/continuation byte) falls
through to the original code path with zero changes. Tokens like
`" word"` (regex `' ?\p{L}+'` output), `"\n"`, or any non-ASCII text
therefore retain their existing GPT-2 byte mapping (' ' → 'Ġ' etc.).

The `iter().all(...)` predicate typically auto-vectorizes on stable Rust,
so the detection is O(n) over the bytes with SIMD-class throughput on
x86_64 / aarch64.
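
As a standalone illustration (not part of this patch, and not calling the
crate-private `bytes_char()`), the identity property and the gate can be
sanity-checked with a few lines of plain Rust; `is_printable_ascii` is a
hypothetical helper that mirrors the `iter().all` check added below:

    // Sketch only: mirrors the fast-path reasoning, independent of the crate.
    fn is_printable_ascii(s: &str) -> bool {
        // Same shape as the gate added in this patch.
        !s.is_empty() && s.as_bytes().iter().all(|&b| (0x21..=0x7E).contains(&b))
    }

    fn main() {
        // Identity on the printable range: byte b maps to the char with the
        // same code point, which re-encodes to the single byte b in UTF-8.
        for b in 0x21u8..=0x7E {
            let c = b as char;
            assert_eq!(c as u32, u32::from(b));
            assert_eq!(c.len_utf8(), 1);
        }
        assert!(is_printable_ascii("Hello!"));
        assert!(!is_printable_ascii(" hi")); // leading space -> slow path
        assert!(!is_printable_ascii("caf\u{e9}")); // non-ASCII -> slow path
        assert!(!is_printable_ascii("")); // empty -> slow path (a no-op there)
    }
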
Two new tests in `byte_level::tests`:

- `printable_ascii_fast_path_matches_slow_path` — tokens of various sizes
  (including > 32 bytes to cross auto-vectorized chunks) round-trip
  byte-identical with the same offsets the slow path produces.
- `fast_path_does_not_swallow_non_printable_bytes` — `" hi"` is still
  mapped to `"Ġhi"` (slow path).

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 tokenizers/src/pre_tokenizers/byte_level.rs | 56 ++++++++++++++++++++++
 1 file changed, 56 insertions(+)

diff --git a/tokenizers/src/pre_tokenizers/byte_level.rs b/tokenizers/src/pre_tokenizers/byte_level.rs
index 8bc0f30af..2fd9a4123 100644
--- a/tokenizers/src/pre_tokenizers/byte_level.rs
+++ b/tokenizers/src/pre_tokenizers/byte_level.rs
@@ -131,6 +131,16 @@ impl PreTokenizer for ByteLevel {
         })?;
         pretokenized.normalize(|normalized| {
             let s = normalized.get();
+            // Fast path: bytes in `0x21..=0x7E` (printable ASCII excluding space)
+            // map to the char with the same code point in `BYTES_CHAR`, i.e.
+            // `BYTES_CHAR[b] == b as char`. So for any token whose bytes all sit
+            // in that range, the per-byte transform produces an output that is
+            // byte-identical to the input and leaves `alignments` unchanged. We
+            // can therefore return without rebuilding anything. The `iter().all`
+            // predicate is trivially auto-vectorized by the compiler on stable.
+            if !s.is_empty() && s.as_bytes().iter().all(|&b| (0x21..=0x7E).contains(&b)) {
+                return Ok(());
+            }
             let mut transformations: Vec<(char, isize)> = Vec::with_capacity(s.len());
             for (i, cur_char) in s.char_indices() {
                 let size = cur_char.len_utf8();
@@ -568,6 +578,52 @@ mod tests {
         );
     }
 
+    #[test]
+    fn printable_ascii_fast_path_matches_slow_path() {
+        // Tokens whose bytes are all in `0x21..=0x7E` exercise the fast path.
+        // Their normalized form must be byte-identical to the input and their
+        // offsets must still tile the input contiguously.
+        let inputs = [
+            "Hello",
+            "!",
+            "?world!",
+            "abc123XYZ_+-=*/<>",
+            "a", // 1-byte boundary
+            "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ", // > 32 bytes to cross any auto-vectorized chunk boundary
+        ];
+        let bytelevel = ByteLevel::default().add_prefix_space(false).use_regex(false);
+        for s in inputs {
+            let mut pretok = PreTokenizedString::from(s);
+            bytelevel.pre_tokenize(&mut pretok).unwrap();
+            let splits: Vec<_> = pretok
+                .get_splits(OffsetReferential::Original, OffsetType::Byte)
+                .into_iter()
+                .map(|(t, o, _)| (t.to_string(), o))
+                .collect();
+            assert_eq!(
+                splits,
+                vec![(s.to_string(), (0, s.len()))],
+                "fast path mangled token: {s:?}"
+            );
+        }
+    }
+
+    #[test]
+    fn fast_path_does_not_swallow_non_printable_bytes() {
+        // Tokens containing a byte outside `0x21..=0x7E` (here a leading space)
+        // must still hit the slow path and get the GPT-2 byte→char mapping
+        // (' ' -> 'Ġ', i.e. U+0120).
+        let bytelevel = ByteLevel::default().add_prefix_space(false).use_regex(false);
+        let mut pretok = PreTokenizedString::from(" hi");
+        bytelevel.pre_tokenize(&mut pretok).unwrap();
+        let normalized: Vec<_> = pretok
+            .get_splits(OffsetReferential::Normalized, OffsetType::Byte)
+            .into_iter()
+            .map(|(t, _, _)| t.to_string())
+            .collect();
+        assert_eq!(normalized, vec!["Ġhi".to_string()]);
+    }
+
     #[test]
     fn deserialization() {
         // Before use_regex

From 7f71546d0a38da9f54f8ca8938364980c10c66cf Mon Sep 17 00:00:00 2001
From: Kim Yang
Date: Mon, 27 Apr 2026 14:16:28 +0800
Subject: [PATCH 2/2] fmt: cargo fmt on byte_level tests

---
 tokenizers/src/pre_tokenizers/byte_level.rs | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/tokenizers/src/pre_tokenizers/byte_level.rs b/tokenizers/src/pre_tokenizers/byte_level.rs
index 2fd9a4123..4ed0be3c3 100644
--- a/tokenizers/src/pre_tokenizers/byte_level.rs
+++ b/tokenizers/src/pre_tokenizers/byte_level.rs
@@ -588,10 +588,12 @@ mod tests {
             "!",
             "?world!",
             "abc123XYZ_+-=*/<>",
-            "a", // 1-byte boundary
+            "a",                                                     // 1-byte boundary
             "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ", // > 32 bytes to cross any auto-vectorized chunk boundary
         ];
-        let bytelevel = ByteLevel::default().add_prefix_space(false).use_regex(false);
+        let bytelevel = ByteLevel::default()
+            .add_prefix_space(false)
+            .use_regex(false);
         for s in inputs {
             let mut pretok = PreTokenizedString::from(s);
             bytelevel.pre_tokenize(&mut pretok).unwrap();
@@ -613,7 +615,9 @@ mod tests {
         // Tokens containing a byte outside `0x21..=0x7E` (here a leading space)
         // must still hit the slow path and get the GPT-2 byte→char mapping
         // (' ' -> 'Ġ', i.e. U+0120).
-        let bytelevel = ByteLevel::default().add_prefix_space(false).use_regex(false);
+        let bytelevel = ByteLevel::default()
+            .add_prefix_space(false)
+            .use_regex(false);
         let mut pretok = PreTokenizedString::from(" hi");
         bytelevel.pre_tokenize(&mut pretok).unwrap();
         let normalized: Vec<_> = pretok