From cf9ef7285d986624063ebfc891f0f191efda0869 Mon Sep 17 00:00:00 2001 From: Luc Georges Date: Thu, 26 Mar 2026 16:26:46 +0100 Subject: [PATCH 1/5] feat: add new faster whitespace split pretok --- tokenizers/Cargo.toml | 4 + .../benches/whitespace_pretok_benchmark.rs | 96 +++++++++ tokenizers/src/pre_tokenizers/whitespace.rs | 198 ++++++++++++++++++ 3 files changed, 298 insertions(+) create mode 100644 tokenizers/benches/whitespace_pretok_benchmark.rs diff --git a/tokenizers/Cargo.toml b/tokenizers/Cargo.toml index c45fd6748d..d62a15369f 100644 --- a/tokenizers/Cargo.toml +++ b/tokenizers/Cargo.toml @@ -62,6 +62,10 @@ harness = false name = "truncation_benchmark" harness = false +[[bench]] +name = "whitespace_pretok_benchmark" +harness = false + [dependencies] rand = "0.9" onig = { version = "6.5.1", default-features = false, optional = true } diff --git a/tokenizers/benches/whitespace_pretok_benchmark.rs b/tokenizers/benches/whitespace_pretok_benchmark.rs new file mode 100644 index 0000000000..1695c97218 --- /dev/null +++ b/tokenizers/benches/whitespace_pretok_benchmark.rs @@ -0,0 +1,96 @@ +#[macro_use] +extern crate criterion; + +use criterion::{BenchmarkId, Criterion, Throughput}; +use std::hint::black_box; +use tokenizers::pre_tokenizers::whitespace::{ManualWhitespaceSplit, Whitespace}; +use tokenizers::{PreTokenizedString, PreTokenizer}; + +fn bench_pretokenizer(c: &mut Criterion) { + let data = std::fs::read_to_string("data/big.txt").unwrap(); + let lines: Vec<&str> = data.lines().collect(); + + let mut group = c.benchmark_group("whitespace-pretok"); + group.throughput(Throughput::Bytes(data.len() as u64)); + + // Full corpus as a single string + group.bench_function("regex/full-corpus", |b| { + let pretok = Whitespace {}; + b.iter(|| { + let mut pre = PreTokenizedString::from(black_box(data.as_str())); + pretok.pre_tokenize(&mut pre).unwrap(); + pre + }) + }); + + group.bench_function("manual/full-corpus", |b| { + let pretok = ManualWhitespaceSplit {}; + 
b.iter(|| { + let mut pre = PreTokenizedString::from(black_box(data.as_str())); + pretok.pre_tokenize(&mut pre).unwrap(); + pre + }) + }); + + // Line-by-line (many short strings — tests per-call overhead) + group.bench_function("regex/line-by-line", |b| { + let pretok = Whitespace {}; + b.iter(|| { + for line in &lines { + let mut pre = PreTokenizedString::from(black_box(*line)); + pretok.pre_tokenize(&mut pre).unwrap(); + black_box(&pre); + } + }) + }); + + group.bench_function("manual/line-by-line", |b| { + let pretok = ManualWhitespaceSplit {}; + b.iter(|| { + for line in &lines { + let mut pre = PreTokenizedString::from(black_box(*line)); + pretok.pre_tokenize(&mut pre).unwrap(); + black_box(&pre); + } + }) + }); + + group.finish(); + + // --- Scaling with input size --- + + let mut group = c.benchmark_group("whitespace-pretok-scaling"); + + for size in [100, 1_000, 10_000, 100_000] { + let input: String = data.chars().take(size).collect(); + group.throughput(Throughput::Bytes(input.len() as u64)); + + group.bench_with_input(BenchmarkId::new("regex", size), &input, |b, input| { + let pretok = Whitespace {}; + b.iter(|| { + let mut pre = PreTokenizedString::from(black_box(input.as_str())); + pretok.pre_tokenize(&mut pre).unwrap(); + pre + }) + }); + + group.bench_with_input(BenchmarkId::new("manual", size), &input, |b, input| { + let pretok = ManualWhitespaceSplit {}; + b.iter(|| { + let mut pre = PreTokenizedString::from(black_box(input.as_str())); + pretok.pre_tokenize(&mut pre).unwrap(); + pre + }) + }); + } + + group.finish(); +} + +criterion_group! 
{
+    name = whitespace_pretok;
+    config = Criterion::default().sample_size(50);
+    targets = bench_pretokenizer
+}
+
+criterion_main!(whitespace_pretok);
diff --git a/tokenizers/src/pre_tokenizers/whitespace.rs b/tokenizers/src/pre_tokenizers/whitespace.rs
index 20cfb65193..b28e60ae37 100644
--- a/tokenizers/src/pre_tokenizers/whitespace.rs
+++ b/tokenizers/src/pre_tokenizers/whitespace.rs
@@ -2,6 +2,7 @@ use std::sync::LazyLock;
 
 use regex::Regex;
 
+use crate::pattern::Pattern;
 use crate::tokenizer::{
     pattern::Invert, PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior,
 };
@@ -40,6 +41,87 @@ impl PreTokenizer for WhitespaceSplit {
     }
 }
 
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+#[macro_rules_attribute(impl_serde_type!)]
+pub struct ManualWhitespaceSplit;
+
+impl PreTokenizer for ManualWhitespaceSplit {
+    fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
+        pretokenized.split(|_, normalized| {
+            normalized.split(WhiteSpacePattern, SplitDelimiterBehavior::Removed)
+        })
+    }
+}
+
+#[derive(Clone, Copy, Eq, PartialEq)]
+enum CharType {
+    Whitespace,
+    Word,
+    Symbol,
+}
+
+struct WhiteSpacePattern;
+
+impl Pattern for WhiteSpacePattern {
+    fn find_matches(&self, inside: &str) -> Result<Vec<((usize, usize), bool)>> {
+        if inside.is_empty() {
+            return Ok(vec![((0, 0), false)]);
+        }
+
+        let mut matches = Vec::new();
+        let mut span_start = 0;
+        let mut prev_type: Option<CharType> = None;
+
+        for (i, ch) in inside.char_indices() {
+            let ct = classify(ch);
+
+            if let Some(pt) = prev_type {
+                if pt != ct {
+                    // Emit the previous span:
+                    // - whitespace spans are flagged `true` (matches, i.e. delimiters
+                    //   removed by SplitDelimiterBehavior::Removed)
+                    // - word/symbol spans are flagged `false` (non-matches, kept)
+                    matches.push(((span_start, i), pt == CharType::Whitespace));
+                    span_start = i;
+                }
+            }
+            prev_type = Some(ct);
+        }
+
+        // Emit the final span
+        if let Some(pt) = prev_type {
+            matches.push(((span_start, inside.len()), pt == CharType::Whitespace));
+        }
+
+        Ok(matches)
+    }
+}
+
+fn classify(ch: char) -> CharType {
+    if ch.is_whitespace() {
CharType::Whitespace + } else if is_word_char(ch) { + CharType::Word + } else { + CharType::Symbol + } +} + +/// Matches the same characters as the `\w` regex class (Unicode-aware). +/// This is: Alphabetic + Nd (decimal digit) + Pc (connector punctuation) + +/// M (marks) + Join_Control — NOT Nl/No (which Rust's is_alphanumeric includes). +fn is_word_char(ch: char) -> bool { + use unicode_categories::UnicodeCategories; + + ch.is_alphabetic() // Unicode Alphabetic property (L* + some others) + || ch.is_number_decimal_digit() // Nd only (not Nl/No like superscripts, fractions) + || ch.is_punctuation_connector() // Pc: underscore, undertie, fullwidth low line, etc. + || ch.is_mark_nonspacing() // Mn: combining diacriticals, nukta, etc. + || ch.is_mark_spacing_combining() // Mc: spacing combining marks (vowel signs) + || ch.is_mark_enclosing() // Me: enclosing marks + || ch == '\u{200c}' // Zero-Width Non-Joiner + || ch == '\u{200d}' // Zero-Width Joiner +} + #[cfg(test)] mod tests { use super::*; @@ -102,4 +184,120 @@ mod tests { ); } } + + #[test] + fn assert_equivalent() { + let test_cases = vec![ + "Hello world!", + "How are you doing?", + "This is a test with numbers 123 and symbols @#$%", + "Multiple spaces", + "Tabs\tand\nnewlines", + "Unicode: café résumé naïve", + "Mixed: Hello123!@# world", + "Edge cases: a.b,c;d:e", + "Empty string:", + "Only spaces: ", + "Only symbols: !@#$%", + "Only words: hello world", + "Numbers: 123 456 789", + "Underscores: hello_world test_case", + "Special chars: αβγ δέζ ηθι", + ]; + + for test_case in test_cases { + let mut original = PreTokenizedString::from(test_case); + let mut manual = PreTokenizedString::from(test_case); + + let original_pretok = Whitespace {}; + let manual_pretok = ManualWhitespaceSplit {}; + + original_pretok.pre_tokenize(&mut original).unwrap(); + manual_pretok.pre_tokenize(&mut manual).unwrap(); + + let original_splits = original + .get_splits(OffsetReferential::Original, OffsetType::Byte) + .into_iter() 
+                .map(|(s, o, _)| (s, o))
+                .collect::<Vec<_>>();
+
+            let manual_splits = manual
+                .get_splits(OffsetReferential::Original, OffsetType::Byte)
+                .into_iter()
+                .map(|(s, o, _)| (s, o))
+                .collect::<Vec<_>>();
+
+            assert_eq!(
+                original_splits, manual_splits,
+                "Mismatch for test case: '{}'",
+                test_case
+            );
+        }
+    }
+
+    #[test]
+    fn manual_edge_cases() {
+        let pretok = ManualWhitespaceSplit {};
+
+        // Test various edge cases
+        let edge_cases = vec![
+            ("", vec![]),
+            (" ", vec![]),
+            ("  ", vec![]),
+            ("a", vec![("a", (0, 1))]),
+            ("!", vec![("!", (0, 1))]),
+            ("a!", vec![("a", (0, 1)), ("!", (1, 2))]),
+            ("!a", vec![("!", (0, 1)), ("a", (1, 2))]),
+            ("a b", vec![("a", (0, 1)), ("b", (2, 3))]),
+            ("a  b", vec![("a", (0, 1)), ("b", (3, 4))]),
+            ("a\tb", vec![("a", (0, 1)), ("b", (2, 3))]),
+            ("a\nb", vec![("a", (0, 1)), ("b", (2, 3))]),
+            ("a\r\nb", vec![("a", (0, 1)), ("b", (3, 4))]),
+        ];
+
+        for (input, expected) in edge_cases {
+            let mut pretokenized = PreTokenizedString::from(input);
+            pretok.pre_tokenize(&mut pretokenized).unwrap();
+            let result = pretokenized
+                .get_splits(OffsetReferential::Original, OffsetType::Byte)
+                .into_iter()
+                .map(|(s, o, _)| (s, o))
+                .collect::<Vec<_>>();
+            assert_eq!(result, expected, "Failed for input: '{}'", input);
+        }
+    }
+
+    #[test]
+    fn assert_equivalent_xnli() {
+        let data = std::fs::read_to_string("data/xnli.txt").unwrap();
+        let original_pretok = Whitespace {};
+        let manual_pretok = ManualWhitespaceSplit {};
+
+        for (i, line) in data.lines().enumerate() {
+            let mut original = PreTokenizedString::from(line);
+            let mut manual = PreTokenizedString::from(line);
+
+            original_pretok.pre_tokenize(&mut original).unwrap();
+            manual_pretok.pre_tokenize(&mut manual).unwrap();
+
+            let original_splits = original
+                .get_splits(OffsetReferential::Original, OffsetType::Byte)
+                .into_iter()
+                .map(|(s, o, _)| (s, o))
+                .collect::<Vec<_>>();
+            let manual_splits = manual
+                .get_splits(OffsetReferential::Original, OffsetType::Byte)
+                .into_iter()
+                .map(|(s, o, _)| (s, o))
+                .collect::<Vec<_>>();
+
+            assert_eq!(
+                original_splits,
+                manual_splits,
+                "Mismatch on line {}: '{}'",
+                i,
+                &line.chars().take(80).collect::<String>(),
+            );
+        }
+    }
 }

From 087f1be6f36d6e0066a6fc97ae794c4b8bb6974b Mon Sep 17 00:00:00 2001
From: Luc Georges
Date: Thu, 26 Mar 2026 16:48:19 +0100
Subject: [PATCH 2/5] refactor: fmt

---
 tokenizers/src/pre_tokenizers/whitespace.rs | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tokenizers/src/pre_tokenizers/whitespace.rs b/tokenizers/src/pre_tokenizers/whitespace.rs
index b28e60ae37..2a02fea52f 100644
--- a/tokenizers/src/pre_tokenizers/whitespace.rs
+++ b/tokenizers/src/pre_tokenizers/whitespace.rs
@@ -112,14 +112,14 @@ fn classify(ch: char) -> CharType {
 fn is_word_char(ch: char) -> bool {
     use unicode_categories::UnicodeCategories;
 
-    ch.is_alphabetic() // Unicode Alphabetic property (L* + some others)
-    || ch.is_number_decimal_digit() // Nd only (not Nl/No like superscripts, fractions)
+    ch.is_alphabetic() // Unicode Alphabetic property (L* + some others)
+    || ch.is_number_decimal_digit() // Nd only (not Nl/No like superscripts, fractions)
     || ch.is_punctuation_connector() // Pc: underscore, undertie, fullwidth low line, etc.
-    || ch.is_mark_nonspacing() // Mn: combining diacriticals, nukta, etc.
|| ch.is_mark_spacing_combining() // Mc: spacing combining marks (vowel signs) - || ch.is_mark_enclosing() // Me: enclosing marks - || ch == '\u{200c}' // Zero-Width Non-Joiner - || ch == '\u{200d}' // Zero-Width Joiner + || ch.is_mark_enclosing() // Me: enclosing marks + || ch == '\u{200c}' // Zero-Width Non-Joiner + || ch == '\u{200d}' // Zero-Width Joiner } #[cfg(test)] From 37b20361eb3f9364f5fbbccf3b17876f8bdea946 Mon Sep 17 00:00:00 2001 From: Luc Georges Date: Thu, 2 Apr 2026 16:12:35 +0200 Subject: [PATCH 3/5] refactor: use `is_mark` instead of the three `is_mark_*` fns --- tokenizers/src/pre_tokenizers/whitespace.rs | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tokenizers/src/pre_tokenizers/whitespace.rs b/tokenizers/src/pre_tokenizers/whitespace.rs index 2a02fea52f..88a319acb0 100644 --- a/tokenizers/src/pre_tokenizers/whitespace.rs +++ b/tokenizers/src/pre_tokenizers/whitespace.rs @@ -112,12 +112,10 @@ fn classify(ch: char) -> CharType { fn is_word_char(ch: char) -> bool { use unicode_categories::UnicodeCategories; - ch.is_alphabetic() // Unicode Alphabetic property (L* + some others) - || ch.is_number_decimal_digit() // Nd only (not Nl/No like superscripts, fractions) - || ch.is_punctuation_connector() // Pc: underscore, undertie, fullwidth low line, etc. - || ch.is_mark_nonspacing() // Mn: combining diacriticals, nukta, etc. 
- || ch.is_mark_spacing_combining() // Mc: spacing combining marks (vowel signs) - || ch.is_mark_enclosing() // Me: enclosing marks + ch.is_alphabetic() + || ch.is_number_decimal_digit() + || ch.is_punctuation_connector() + || ch.is_mark() || ch == '\u{200c}' // Zero-Width Non-Joiner || ch == '\u{200d}' // Zero-Width Joiner } From 60165a121639603693a35f14998872a7fc3d704a Mon Sep 17 00:00:00 2001 From: Luc Georges Date: Sun, 19 Apr 2026 15:20:48 +0200 Subject: [PATCH 4/5] feat: add `xnli.txt` download in the Makefile --- tokenizers/Makefile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tokenizers/Makefile b/tokenizers/Makefile index e7bd98aada..beab1ed317 100644 --- a/tokenizers/Makefile +++ b/tokenizers/Makefile @@ -6,7 +6,7 @@ dir_guard=@mkdir -p $(@D) SHARED_RESOURCES = $(DATA_DIR)/gpt2-vocab.json $(DATA_DIR)/gpt2-merges.txt $(DATA_DIR)/bert-base-uncased-vocab.txt $(DATA_DIR)/big.txt $(DATA_DIR)/small.txt $(DATA_DIR)/albert-base-v1-tokenizer.json $(DATA_DIR)/llama-3-tokenizer.json BENCHMARK_RESOURCES = $(SHARED_RESOURCES) -TESTS_RESOURCES = $(SHARED_RESOURCES) $(DATA_DIR)/unigram.json $(DATA_DIR)/unigram_wagahaiwa_nekodearu.txt $(DATA_DIR)/roberta.json $(DATA_DIR)/tokenizer-wiki.json $(DATA_DIR)/bert-wiki.json +TESTS_RESOURCES = $(SHARED_RESOURCES) $(DATA_DIR)/unigram.json $(DATA_DIR)/unigram_wagahaiwa_nekodearu.txt $(DATA_DIR)/roberta.json $(DATA_DIR)/tokenizer-wiki.json $(DATA_DIR)/bert-wiki.json $(DATA_DIR)/xnli.txt .PHONY : build build : @@ -94,3 +94,7 @@ $(DATA_DIR)/bert-wiki.json : $(DATA_DIR)/llama-3-tokenizer.json : $(dir_guard) wget $(HF_TEST_DATA)/llama-3-tokenizer.json -O $@ + +$(DATA_DIR)/xnli.txt : + $(dir_guard) + wget $(HF_TEST_DATA)/xnli.txt -O $@ From fb78475e603271ebfb5b2e3cc46146a1dd387673 Mon Sep 17 00:00:00 2001 From: Luc Georges Date: Mon, 27 Apr 2026 10:21:15 +0200 Subject: [PATCH 5/5] feat: skip xnli equivalence test if file not present --- .github/workflows/rust.yml | 9 ++++++++- 
tokenizers/src/pre_tokenizers/whitespace.rs | 5 ++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index a3eff25e80..f785aa2655 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -75,7 +75,14 @@ jobs: working-directory: ./tokenizers run: make test - # Skip integration tests for now on Windows + - name: Download xnli test dataset on Windows for whitespace equivalence test + if: matrix.os == 'windows-latest' + shell: bash + working-directory: ./tokenizers + run: | + mkdir -p data + curl -L https://huggingface.co/datasets/hf-internal-testing/tokenizers-test-data/resolve/main/xnli.txt -o data/xnli.txt + - name: Run lib Tests on Windows if: matrix.os == 'windows-latest' uses: actions-rs/cargo@844f36862e911db73fe0815f00a4a2602c279505 # v1 diff --git a/tokenizers/src/pre_tokenizers/whitespace.rs b/tokenizers/src/pre_tokenizers/whitespace.rs index 88a319acb0..47077fc886 100644 --- a/tokenizers/src/pre_tokenizers/whitespace.rs +++ b/tokenizers/src/pre_tokenizers/whitespace.rs @@ -267,7 +267,10 @@ mod tests { #[test] fn assert_equivalent_xnli() { - let data = std::fs::read_to_string("data/xnli.txt").unwrap(); + let Ok(data) = std::fs::read_to_string("data/xnli.txt") else { + eprintln!("Could not read data/xnli.txt, skipping test"); + return; + }; let original_pretok = Whitespace {}; let manual_pretok = ManualWhitespaceSplit {};