From 57a4f099934dd5d841ad811f1f69a9e67863690b Mon Sep 17 00:00:00 2001
From: Kim Yang
Date: Mon, 27 Apr 2026 00:55:12 +0800
Subject: [PATCH 1/2] ByteLevel: skip per-byte transform for
 printable-ASCII-only tokens
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The byte → char map produced by `bytes_char()` is the identity on
`0x21..=0x7E` (printable ASCII excluding space): each such byte maps to
the char with the same code point, which encodes back to that exact byte
in UTF-8. So when an entire post-regex token already lives in that range,
the per-byte transform produces an output byte-identical to the input and
rebuilds `alignments` to the same tuples it already had. We can return
early and skip the `Vec<(char, isize)>` build, the `HashMap` lookups, and
the `transform()` rebuild.

The gate is conservative: any byte outside `0x21..=0x7E` (space, tab,
newline, DEL, any Latin-1, any UTF-8 lead/continuation byte) falls
through to the original code path with zero changes. Tokens like
`" word"` (regex `' ?\p{L}+'` output), `"\n"`, or any non-ASCII text
therefore retain their existing GPT-2 byte mapping (' ' → 'Ġ' etc.).

The `iter().all(...)` predicate typically auto-vectorizes on stable Rust,
so the detection is O(n) over the bytes with SIMD-class throughput on
x86_64 / aarch64.
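
As a standalone illustration (not part of this patch, and not calling the
crate-private `bytes_char()`), the identity property and the gate can be
sanity-checked with a few lines of plain Rust; `is_printable_ascii` is a
hypothetical helper that mirrors the `iter().all` check added below:

    // Sketch only: mirrors the fast-path reasoning, independent of the crate.
    fn is_printable_ascii(s: &str) -> bool {
        // Same shape as the gate added in this patch.
        !s.is_empty() && s.as_bytes().iter().all(|&b| (0x21..=0x7E).contains(&b))
    }

    fn main() {
        // Identity on the printable range: byte b maps to the char with the
        // same code point, which re-encodes to the single byte b in UTF-8.
        for b in 0x21u8..=0x7E {
            let c = b as char;
            assert_eq!(c as u32, u32::from(b));
            assert_eq!(c.len_utf8(), 1);
        }
        assert!(is_printable_ascii("Hello!"));
        assert!(!is_printable_ascii(" hi")); // leading space -> slow path
        assert!(!is_printable_ascii("caf\u{e9}")); // non-ASCII -> slow path
        assert!(!is_printable_ascii("")); // empty -> slow path (a no-op there)
    }
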
Two new tests in `byte_level::tests`:

- `printable_ascii_fast_path_matches_slow_path` — tokens of various sizes
  (including > 32 bytes to cross auto-vectorized chunks) round-trip
  byte-identical with the same offsets the slow path produces.
- `fast_path_does_not_swallow_non_printable_bytes` — `" hi"` is still
  mapped to `"Ġhi"` (slow path).

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 tokenizers/src/pre_tokenizers/byte_level.rs | 56 ++++++++++++++++++++++
 1 file changed, 56 insertions(+)

diff --git a/tokenizers/src/pre_tokenizers/byte_level.rs b/tokenizers/src/pre_tokenizers/byte_level.rs
index 8bc0f30af..2fd9a4123 100644
--- a/tokenizers/src/pre_tokenizers/byte_level.rs
+++ b/tokenizers/src/pre_tokenizers/byte_level.rs
@@ -131,6 +131,16 @@ impl PreTokenizer for ByteLevel {
         })?;
         pretokenized.normalize(|normalized| {
             let s = normalized.get();
+            // Fast path: bytes in `0x21..=0x7E` (printable ASCII excluding space)
+            // map to the char with the same code point in `BYTES_CHAR`, i.e.
+            // `BYTES_CHAR[b] == b as char`. So for any token whose bytes all sit
+            // in that range, the per-byte transform produces an output that is
+            // byte-identical to the input and leaves `alignments` unchanged. We
+            // can therefore return without rebuilding anything. The `iter().all`
+            // predicate is trivially auto-vectorized by the compiler on stable.
+            if !s.is_empty() && s.as_bytes().iter().all(|&b| (0x21..=0x7E).contains(&b)) {
+                return Ok(());
+            }
             let mut transformations: Vec<(char, isize)> = Vec::with_capacity(s.len());
             for (i, cur_char) in s.char_indices() {
                 let size = cur_char.len_utf8();
@@ -568,6 +578,52 @@ mod tests {
         );
     }
 
+    #[test]
+    fn printable_ascii_fast_path_matches_slow_path() {
+        // Tokens whose bytes are all in `0x21..=0x7E` exercise the fast path.
+        // Their normalized form must be byte-identical to the input and their
+        // offsets must still tile the input contiguously.
+        let inputs = [
+            "Hello",
+            "!",
+            "?world!",
+            "abc123XYZ_+-=*/<>",
+            "a", // 1-byte boundary
+            "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ", // > 32 bytes to cross any auto-vectorized chunk boundary
+        ];
+        let bytelevel = ByteLevel::default().add_prefix_space(false).use_regex(false);
+        for s in inputs {
+            let mut pretok = PreTokenizedString::from(s);
+            bytelevel.pre_tokenize(&mut pretok).unwrap();
+            let splits: Vec<_> = pretok
+                .get_splits(OffsetReferential::Original, OffsetType::Byte)
+                .into_iter()
+                .map(|(t, o, _)| (t.to_string(), o))
+                .collect();
+            assert_eq!(
+                splits,
+                vec![(s.to_string(), (0, s.len()))],
+                "fast path mangled token: {s:?}"
+            );
+        }
+    }
+
+    #[test]
+    fn fast_path_does_not_swallow_non_printable_bytes() {
+        // Tokens containing a byte outside `0x21..=0x7E` (here a leading space)
+        // must still hit the slow path and get the GPT-2 byte→char mapping
+        // (' ' -> 'Ġ', i.e. U+0120).
+        let bytelevel = ByteLevel::default().add_prefix_space(false).use_regex(false);
+        let mut pretok = PreTokenizedString::from(" hi");
+        bytelevel.pre_tokenize(&mut pretok).unwrap();
+        let normalized: Vec<_> = pretok
+            .get_splits(OffsetReferential::Normalized, OffsetType::Byte)
+            .into_iter()
+            .map(|(t, _, _)| t.to_string())
+            .collect();
+        assert_eq!(normalized, vec!["Ġhi".to_string()]);
+    }
+
     #[test]
     fn deserialization() {
         // Before use_regex

From 7f71546d0a38da9f54f8ca8938364980c10c66cf Mon Sep 17 00:00:00 2001
From: Kim Yang
Date: Mon, 27 Apr 2026 14:16:28 +0800
Subject: [PATCH 2/2] fmt: cargo fmt on byte_level tests

---
 tokenizers/src/pre_tokenizers/byte_level.rs | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/tokenizers/src/pre_tokenizers/byte_level.rs b/tokenizers/src/pre_tokenizers/byte_level.rs
index 2fd9a4123..4ed0be3c3 100644
--- a/tokenizers/src/pre_tokenizers/byte_level.rs
+++ b/tokenizers/src/pre_tokenizers/byte_level.rs
@@ -588,10 +588,12 @@ mod tests {
             "!",
             "?world!",
             "abc123XYZ_+-=*/<>",
-            "a", // 1-byte boundary
+            "a",                                                     // 1-byte boundary
             "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ", // > 32 bytes to cross any auto-vectorized chunk boundary
         ];
-        let bytelevel = ByteLevel::default().add_prefix_space(false).use_regex(false);
+        let bytelevel = ByteLevel::default()
+            .add_prefix_space(false)
+            .use_regex(false);
         for s in inputs {
             let mut pretok = PreTokenizedString::from(s);
             bytelevel.pre_tokenize(&mut pretok).unwrap();
@@ -613,7 +615,9 @@ mod tests {
         // Tokens containing a byte outside `0x21..=0x7E` (here a leading space)
         // must still hit the slow path and get the GPT-2 byte→char mapping
         // (' ' -> 'Ġ', i.e. U+0120).
-        let bytelevel = ByteLevel::default().add_prefix_space(false).use_regex(false);
+        let bytelevel = ByteLevel::default()
+            .add_prefix_space(false)
+            .use_regex(false);
         let mut pretok = PreTokenizedString::from(" hi");
         bytelevel.pre_tokenize(&mut pretok).unwrap();
         let normalized: Vec<_> = pretok