From cf9ef7285d986624063ebfc891f0f191efda0869 Mon Sep 17 00:00:00 2001 From: Luc Georges Date: Thu, 26 Mar 2026 16:26:46 +0100 Subject: [PATCH 1/5] feat: add new faster whitespace split pretok --- tokenizers/Cargo.toml | 4 + .../benches/whitespace_pretok_benchmark.rs | 96 +++++++++ tokenizers/src/pre_tokenizers/whitespace.rs | 198 ++++++++++++++++++ 3 files changed, 298 insertions(+) create mode 100644 tokenizers/benches/whitespace_pretok_benchmark.rs diff --git a/tokenizers/Cargo.toml b/tokenizers/Cargo.toml index c45fd6748d..d62a15369f 100644 --- a/tokenizers/Cargo.toml +++ b/tokenizers/Cargo.toml @@ -62,6 +62,10 @@ harness = false name = "truncation_benchmark" harness = false +[[bench]] +name = "whitespace_pretok_benchmark" +harness = false + [dependencies] rand = "0.9" onig = { version = "6.5.1", default-features = false, optional = true } diff --git a/tokenizers/benches/whitespace_pretok_benchmark.rs b/tokenizers/benches/whitespace_pretok_benchmark.rs new file mode 100644 index 0000000000..1695c97218 --- /dev/null +++ b/tokenizers/benches/whitespace_pretok_benchmark.rs @@ -0,0 +1,96 @@ +#[macro_use] +extern crate criterion; + +use criterion::{BenchmarkId, Criterion, Throughput}; +use std::hint::black_box; +use tokenizers::pre_tokenizers::whitespace::{ManualWhitespaceSplit, Whitespace}; +use tokenizers::{PreTokenizedString, PreTokenizer}; + +fn bench_pretokenizer(c: &mut Criterion) { + let data = std::fs::read_to_string("data/big.txt").unwrap(); + let lines: Vec<&str> = data.lines().collect(); + + let mut group = c.benchmark_group("whitespace-pretok"); + group.throughput(Throughput::Bytes(data.len() as u64)); + + // Full corpus as a single string + group.bench_function("regex/full-corpus", |b| { + let pretok = Whitespace {}; + b.iter(|| { + let mut pre = PreTokenizedString::from(black_box(data.as_str())); + pretok.pre_tokenize(&mut pre).unwrap(); + pre + }) + }); + + group.bench_function("manual/full-corpus", |b| { + let pretok = ManualWhitespaceSplit {}; + 
b.iter(|| { + let mut pre = PreTokenizedString::from(black_box(data.as_str())); + pretok.pre_tokenize(&mut pre).unwrap(); + pre + }) + }); + + // Line-by-line (many short strings — tests per-call overhead) + group.bench_function("regex/line-by-line", |b| { + let pretok = Whitespace {}; + b.iter(|| { + for line in &lines { + let mut pre = PreTokenizedString::from(black_box(*line)); + pretok.pre_tokenize(&mut pre).unwrap(); + black_box(&pre); + } + }) + }); + + group.bench_function("manual/line-by-line", |b| { + let pretok = ManualWhitespaceSplit {}; + b.iter(|| { + for line in &lines { + let mut pre = PreTokenizedString::from(black_box(*line)); + pretok.pre_tokenize(&mut pre).unwrap(); + black_box(&pre); + } + }) + }); + + group.finish(); + + // --- Scaling with input size --- + + let mut group = c.benchmark_group("whitespace-pretok-scaling"); + + for size in [100, 1_000, 10_000, 100_000] { + let input: String = data.chars().take(size).collect(); + group.throughput(Throughput::Bytes(input.len() as u64)); + + group.bench_with_input(BenchmarkId::new("regex", size), &input, |b, input| { + let pretok = Whitespace {}; + b.iter(|| { + let mut pre = PreTokenizedString::from(black_box(input.as_str())); + pretok.pre_tokenize(&mut pre).unwrap(); + pre + }) + }); + + group.bench_with_input(BenchmarkId::new("manual", size), &input, |b, input| { + let pretok = ManualWhitespaceSplit {}; + b.iter(|| { + let mut pre = PreTokenizedString::from(black_box(input.as_str())); + pretok.pre_tokenize(&mut pre).unwrap(); + pre + }) + }); + } + + group.finish(); +} + +criterion_group! 
{
+    name = whitespace_pretok;
+    config = Criterion::default().sample_size(50);
+    targets = bench_pretokenizer
+}
+
+criterion_main!(whitespace_pretok);
diff --git a/tokenizers/src/pre_tokenizers/whitespace.rs b/tokenizers/src/pre_tokenizers/whitespace.rs
index 20cfb65193..b28e60ae37 100644
--- a/tokenizers/src/pre_tokenizers/whitespace.rs
+++ b/tokenizers/src/pre_tokenizers/whitespace.rs
@@ -2,6 +2,7 @@ use std::sync::LazyLock;
 
 use regex::Regex;
 
+use crate::pattern::Pattern;
 use crate::tokenizer::{
     pattern::Invert, PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior,
 };
@@ -40,6 +41,87 @@ impl PreTokenizer for WhitespaceSplit {
     }
 }
 
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+#[macro_rules_attribute(impl_serde_type!)]
+pub struct ManualWhitespaceSplit;
+
+impl PreTokenizer for ManualWhitespaceSplit {
+    fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
+        pretokenized.split(|_, normalized| {
+            normalized.split(WhiteSpacePattern, SplitDelimiterBehavior::Removed)
+        })
+    }
+}
+
+#[derive(Clone, Copy, Eq, PartialEq)]
+enum CharType {
+    Whitespace,
+    Word,
+    Symbol,
+}
+
+struct WhiteSpacePattern;
+
+impl Pattern for WhiteSpacePattern {
+    fn find_matches(&self, inside: &str) -> Result<Vec<((usize, usize), bool)>> {
+        if inside.is_empty() {
+            return Ok(vec![((0, 0), false)]);
+        }
+
+        let mut matches = Vec::new();
+        let mut span_start = 0;
+        let mut prev_type: Option<CharType> = None;
+
+        for (i, ch) in inside.char_indices() {
+            let ct = classify(ch);
+
+            if let Some(pt) = prev_type {
+                if pt != ct {
+                    // Emit the previous span:
+                    // - whitespace spans are flagged `true` (matches, i.e. delimiters
+                    //   removed by SplitDelimiterBehavior::Removed)
+                    // - word/symbol spans are flagged `false` (non-matches, kept)
+                    matches.push(((span_start, i), pt == CharType::Whitespace));
+                    span_start = i;
+                }
+            }
+            prev_type = Some(ct);
+        }
+
+        // Emit the final span
+        if let Some(pt) = prev_type {
+            matches.push(((span_start, inside.len()), pt == CharType::Whitespace));
+        }
+
+        Ok(matches)
+    }
+}
+
+fn classify(ch: char) -> CharType {
+    if ch.is_whitespace() {
CharType::Whitespace + } else if is_word_char(ch) { + CharType::Word + } else { + CharType::Symbol + } +} + +/// Matches the same characters as the `\w` regex class (Unicode-aware). +/// This is: Alphabetic + Nd (decimal digit) + Pc (connector punctuation) + +/// M (marks) + Join_Control — NOT Nl/No (which Rust's is_alphanumeric includes). +fn is_word_char(ch: char) -> bool { + use unicode_categories::UnicodeCategories; + + ch.is_alphabetic() // Unicode Alphabetic property (L* + some others) + || ch.is_number_decimal_digit() // Nd only (not Nl/No like superscripts, fractions) + || ch.is_punctuation_connector() // Pc: underscore, undertie, fullwidth low line, etc. + || ch.is_mark_nonspacing() // Mn: combining diacriticals, nukta, etc. + || ch.is_mark_spacing_combining() // Mc: spacing combining marks (vowel signs) + || ch.is_mark_enclosing() // Me: enclosing marks + || ch == '\u{200c}' // Zero-Width Non-Joiner + || ch == '\u{200d}' // Zero-Width Joiner +} + #[cfg(test)] mod tests { use super::*; @@ -102,4 +184,120 @@ mod tests { ); } } + + #[test] + fn assert_equivalent() { + let test_cases = vec![ + "Hello world!", + "How are you doing?", + "This is a test with numbers 123 and symbols @#$%", + "Multiple spaces", + "Tabs\tand\nnewlines", + "Unicode: café résumé naïve", + "Mixed: Hello123!@# world", + "Edge cases: a.b,c;d:e", + "Empty string:", + "Only spaces: ", + "Only symbols: !@#$%", + "Only words: hello world", + "Numbers: 123 456 789", + "Underscores: hello_world test_case", + "Special chars: αβγ δέζ ηθι", + ]; + + for test_case in test_cases { + let mut original = PreTokenizedString::from(test_case); + let mut manual = PreTokenizedString::from(test_case); + + let original_pretok = Whitespace {}; + let manual_pretok = ManualWhitespaceSplit {}; + + original_pretok.pre_tokenize(&mut original).unwrap(); + manual_pretok.pre_tokenize(&mut manual).unwrap(); + + let original_splits = original + .get_splits(OffsetReferential::Original, OffsetType::Byte) + .into_iter() 
+                .map(|(s, o, _)| (s, o))
+                .collect::<Vec<_>>();
+
+            let manual_splits = manual
+                .get_splits(OffsetReferential::Original, OffsetType::Byte)
+                .into_iter()
+                .map(|(s, o, _)| (s, o))
+                .collect::<Vec<_>>();
+
+            assert_eq!(
+                original_splits, manual_splits,
+                "Mismatch for test case: '{}'",
+                test_case
+            );
+        }
+    }
+
+    #[test]
+    fn manual_edge_cases() {
+        let pretok = ManualWhitespaceSplit {};
+
+        // Test various edge cases
+        let edge_cases = vec![
+            ("", vec![]),
+            (" ", vec![]),
+            ("  ", vec![]),
+            ("a", vec![("a", (0, 1))]),
+            ("!", vec![("!", (0, 1))]),
+            ("a!", vec![("a", (0, 1)), ("!", (1, 2))]),
+            ("!a", vec![("!", (0, 1)), ("a", (1, 2))]),
+            ("a b", vec![("a", (0, 1)), ("b", (2, 3))]),
+            ("a  b", vec![("a", (0, 1)), ("b", (3, 4))]),
+            ("a\tb", vec![("a", (0, 1)), ("b", (2, 3))]),
+            ("a\nb", vec![("a", (0, 1)), ("b", (2, 3))]),
+            ("a\r\nb", vec![("a", (0, 1)), ("b", (3, 4))]),
+        ];
+
+        for (input, expected) in edge_cases {
+            let mut pretokenized = PreTokenizedString::from(input);
+            pretok.pre_tokenize(&mut pretokenized).unwrap();
+            let result = pretokenized
+                .get_splits(OffsetReferential::Original, OffsetType::Byte)
+                .into_iter()
+                .map(|(s, o, _)| (s, o))
+                .collect::<Vec<_>>();
+            assert_eq!(result, expected, "Failed for input: '{}'", input);
+        }
+    }
+
+    #[test]
+    fn assert_equivalent_xnli() {
+        let data = std::fs::read_to_string("data/xnli.txt").unwrap();
+        let original_pretok = Whitespace {};
+        let manual_pretok = ManualWhitespaceSplit {};
+
+        for (i, line) in data.lines().enumerate() {
+            let mut original = PreTokenizedString::from(line);
+            let mut manual = PreTokenizedString::from(line);
+
+            original_pretok.pre_tokenize(&mut original).unwrap();
+            manual_pretok.pre_tokenize(&mut manual).unwrap();
+
+            let original_splits = original
+                .get_splits(OffsetReferential::Original, OffsetType::Byte)
+                .into_iter()
+                .map(|(s, o, _)| (s, o))
+                .collect::<Vec<_>>();
+            let manual_splits = manual
+                .get_splits(OffsetReferential::Original, OffsetType::Byte)
+                .into_iter()
+                .map(|(s, o, _)| (s, o))
+                .collect::<Vec<_>>();
+
+            assert_eq!(
+                original_splits,
+                manual_splits,
+                "Mismatch on line {}: '{}'",
+                i,
+                &line.chars().take(80).collect::<String>(),
+            );
+        }
+    }
 }

From 087f1be6f36d6e0066a6fc97ae794c4b8bb6974b Mon Sep 17 00:00:00 2001
From: Luc Georges
Date: Thu, 26 Mar 2026 16:48:19 +0100
Subject: [PATCH 2/5] refactor: fmt

---
 tokenizers/src/pre_tokenizers/whitespace.rs | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tokenizers/src/pre_tokenizers/whitespace.rs b/tokenizers/src/pre_tokenizers/whitespace.rs
index b28e60ae37..2a02fea52f 100644
--- a/tokenizers/src/pre_tokenizers/whitespace.rs
+++ b/tokenizers/src/pre_tokenizers/whitespace.rs
@@ -112,14 +112,14 @@ fn classify(ch: char) -> CharType {
 fn is_word_char(ch: char) -> bool {
     use unicode_categories::UnicodeCategories;
 
-    ch.is_alphabetic() // Unicode Alphabetic property (L* + some others)
-    || ch.is_number_decimal_digit() // Nd only (not Nl/No like superscripts, fractions)
+    ch.is_alphabetic() // Unicode Alphabetic property (L* + some others)
+    || ch.is_number_decimal_digit() // Nd only (not Nl/No like superscripts, fractions)
     || ch.is_punctuation_connector() // Pc: underscore, undertie, fullwidth low line, etc.
-    || ch.is_mark_nonspacing() // Mn: combining diacriticals, nukta, etc.
|| ch.is_mark_spacing_combining() // Mc: spacing combining marks (vowel signs) - || ch.is_mark_enclosing() // Me: enclosing marks - || ch == '\u{200c}' // Zero-Width Non-Joiner - || ch == '\u{200d}' // Zero-Width Joiner + || ch.is_mark_enclosing() // Me: enclosing marks + || ch == '\u{200c}' // Zero-Width Non-Joiner + || ch == '\u{200d}' // Zero-Width Joiner } #[cfg(test)] From 37b20361eb3f9364f5fbbccf3b17876f8bdea946 Mon Sep 17 00:00:00 2001 From: Luc Georges Date: Thu, 2 Apr 2026 16:12:35 +0200 Subject: [PATCH 3/5] refactor: use `is_mark` instead of the three `is_mark_*` fns --- tokenizers/src/pre_tokenizers/whitespace.rs | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tokenizers/src/pre_tokenizers/whitespace.rs b/tokenizers/src/pre_tokenizers/whitespace.rs index 2a02fea52f..88a319acb0 100644 --- a/tokenizers/src/pre_tokenizers/whitespace.rs +++ b/tokenizers/src/pre_tokenizers/whitespace.rs @@ -112,12 +112,10 @@ fn classify(ch: char) -> CharType { fn is_word_char(ch: char) -> bool { use unicode_categories::UnicodeCategories; - ch.is_alphabetic() // Unicode Alphabetic property (L* + some others) - || ch.is_number_decimal_digit() // Nd only (not Nl/No like superscripts, fractions) - || ch.is_punctuation_connector() // Pc: underscore, undertie, fullwidth low line, etc. - || ch.is_mark_nonspacing() // Mn: combining diacriticals, nukta, etc. 
- || ch.is_mark_spacing_combining() // Mc: spacing combining marks (vowel signs) - || ch.is_mark_enclosing() // Me: enclosing marks + ch.is_alphabetic() + || ch.is_number_decimal_digit() + || ch.is_punctuation_connector() + || ch.is_mark() || ch == '\u{200c}' // Zero-Width Non-Joiner || ch == '\u{200d}' // Zero-Width Joiner } From 60165a121639603693a35f14998872a7fc3d704a Mon Sep 17 00:00:00 2001 From: Luc Georges Date: Sun, 19 Apr 2026 15:20:48 +0200 Subject: [PATCH 4/5] feat: add `xnli.txt` download in the Makefile --- tokenizers/Makefile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tokenizers/Makefile b/tokenizers/Makefile index e7bd98aada..beab1ed317 100644 --- a/tokenizers/Makefile +++ b/tokenizers/Makefile @@ -6,7 +6,7 @@ dir_guard=@mkdir -p $(@D) SHARED_RESOURCES = $(DATA_DIR)/gpt2-vocab.json $(DATA_DIR)/gpt2-merges.txt $(DATA_DIR)/bert-base-uncased-vocab.txt $(DATA_DIR)/big.txt $(DATA_DIR)/small.txt $(DATA_DIR)/albert-base-v1-tokenizer.json $(DATA_DIR)/llama-3-tokenizer.json BENCHMARK_RESOURCES = $(SHARED_RESOURCES) -TESTS_RESOURCES = $(SHARED_RESOURCES) $(DATA_DIR)/unigram.json $(DATA_DIR)/unigram_wagahaiwa_nekodearu.txt $(DATA_DIR)/roberta.json $(DATA_DIR)/tokenizer-wiki.json $(DATA_DIR)/bert-wiki.json +TESTS_RESOURCES = $(SHARED_RESOURCES) $(DATA_DIR)/unigram.json $(DATA_DIR)/unigram_wagahaiwa_nekodearu.txt $(DATA_DIR)/roberta.json $(DATA_DIR)/tokenizer-wiki.json $(DATA_DIR)/bert-wiki.json $(DATA_DIR)/xnli.txt .PHONY : build build : @@ -94,3 +94,7 @@ $(DATA_DIR)/bert-wiki.json : $(DATA_DIR)/llama-3-tokenizer.json : $(dir_guard) wget $(HF_TEST_DATA)/llama-3-tokenizer.json -O $@ + +$(DATA_DIR)/xnli.txt : + $(dir_guard) + wget $(HF_TEST_DATA)/xnli.txt -O $@ From fb78475e603271ebfb5b2e3cc46146a1dd387673 Mon Sep 17 00:00:00 2001 From: Luc Georges Date: Mon, 27 Apr 2026 10:21:15 +0200 Subject: [PATCH 5/5] feat: skip xnli equivalence test if file not present --- .github/workflows/rust.yml | 9 ++++++++- 
tokenizers/src/pre_tokenizers/whitespace.rs | 5 ++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index a3eff25e80..f785aa2655 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -75,7 +75,14 @@ jobs: working-directory: ./tokenizers run: make test - # Skip integration tests for now on Windows + - name: Download xnli test dataset on Windows for whitespace equivalence test + if: matrix.os == 'windows-latest' + shell: bash + working-directory: ./tokenizers + run: | + mkdir -p data + curl -L https://huggingface.co/datasets/hf-internal-testing/tokenizers-test-data/resolve/main/xnli.txt -o data/xnli.txt + - name: Run lib Tests on Windows if: matrix.os == 'windows-latest' uses: actions-rs/cargo@844f36862e911db73fe0815f00a4a2602c279505 # v1 diff --git a/tokenizers/src/pre_tokenizers/whitespace.rs b/tokenizers/src/pre_tokenizers/whitespace.rs index 88a319acb0..47077fc886 100644 --- a/tokenizers/src/pre_tokenizers/whitespace.rs +++ b/tokenizers/src/pre_tokenizers/whitespace.rs @@ -267,7 +267,10 @@ mod tests { #[test] fn assert_equivalent_xnli() { - let data = std::fs::read_to_string("data/xnli.txt").unwrap(); + let Ok(data) = std::fs::read_to_string("data/xnli.txt") else { + eprintln!("Could not read data/xnli.txt, skipping test"); + return; + }; let original_pretok = Whitespace {}; let manual_pretok = ManualWhitespaceSplit {};