diff --git a/.cargo/config.toml b/.cargo/config.toml
new file mode 100644
index 0000000000..b8f19c6179
--- /dev/null
+++ b/.cargo/config.toml
@@ -0,0 +1,4 @@
+[env]
+# Only takes effect when the optional `pcre2` feature is enabled and `pcre2-sys`
+# is built. Keeps the runtime binary free of a dynamic libpcre2 dependency.
+PCRE2_SYS_STATIC = { value = "1", force = false }
diff --git a/bindings/python/src/utils/normalization.rs b/bindings/python/src/utils/normalization.rs
index ed4d9b7593..17e05d3a68 100644
--- a/bindings/python/src/utils/normalization.rs
+++ b/bindings/python/src/utils/normalization.rs
@@ -28,7 +28,7 @@ impl Pattern for PyPattern {
                     s.find_matches(inside)
                 }
             }
-            PyPattern::Regex(r) => Python::attach(|py| (&r.borrow(py).inner).find_matches(inside)),
+            PyPattern::Regex(r) => Python::attach(|py| r.borrow(py).inner.find_matches(inside)),
         }
     }
 }
diff --git a/tokenizers/Cargo.toml b/tokenizers/Cargo.toml
index 40b273ac4a..7a599fbe2f 100644
--- a/tokenizers/Cargo.toml
+++ b/tokenizers/Cargo.toml
@@ -75,11 +75,13 @@ getrandom = { version = "0.3" }
 esaxx-rs = { version = "0.1.10", default-features = false, features=[]}
 monostate = "0.1.12"
 ahash = { version = "0.8.11", features = ["serde"] }
+pcre2 = { version = "0.2", optional = true }
 dary_heap = { version = "0.3.6", features = ["serde"] }
 compact_str = { version = "0.9", features = ["serde"] }
 
 [features]
 default = ["progressbar", "onig", "esaxx_fast"]
+pcre2 = ["dep:pcre2", "fancy-regex"]
 esaxx_fast = ["esaxx-rs/cpp"]
 progressbar = ["indicatif"]
 http = ["hf-hub"]
diff --git a/tokenizers/benches/llama3_benchmark.rs b/tokenizers/benches/llama3_benchmark.rs
index 8bd45396a3..fcfce6175e 100644
--- a/tokenizers/benches/llama3_benchmark.rs
+++ b/tokenizers/benches/llama3_benchmark.rs
@@ -6,6 +6,7 @@ mod common;
 use common::{iter_bench_encode, iter_bench_encode_batch, iter_bench_train};
 use criterion::{Criterion, Throughput};
 use std::hint::black_box;
+use std::sync::Arc;
 use tokenizers::{
     models::{bpe::BpeTrainerBuilder, TrainerWrapper},
     EncodeInput, Tokenizer,
@@ -43,6 +44,40 @@ pub fn llama3(c: &mut Criterion) {
     group.bench_function("llama3-batch", |b| {
         b.iter_custom(|iters| iter_bench_encode_batch(iters, &tokenizer, &batches))
     });
+    // Concurrent long-context: N threads each encode a different large input (80k chars)
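+    // Throughput below is reported in total input bytes across all threads;
+    // with the pcre2 backend each spawned thread typically lands on its own
+    // slot of the per-thread regex pool, so this measures concurrent encode
+    // scaling rather than single-threaded batch encoding.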
+    let all_lines: Vec<&str> = data.lines().collect();
+    let lines_per_thread = 1000;
+    let tokenizer_arc = Arc::new(tokenizer.clone());
+    for num_threads in [1, 2, 4, 8] {
+        let inputs: Vec<String> = (0..num_threads)
+            .map(|i| {
+                let start = i * lines_per_thread;
+                all_lines[start..start + lines_per_thread].join("\n")
+            })
+            .collect();
+        let total_bytes: usize = inputs.iter().map(|s| s.len()).sum();
+        let tok = tokenizer_arc.clone();
+        group.throughput(Throughput::Bytes(total_bytes as u64));
+        group.bench_function(format!("llama3-concurrent-long-{num_threads}t"), move |b| {
+            b.iter(|| {
+                std::thread::scope(|s| {
+                    let handles: Vec<_> = inputs
+                        .iter()
+                        .map(|input| {
+                            let tok = &tok;
+                            s.spawn(move || {
+                                black_box(tok.encode(black_box(input.as_str()), false).unwrap())
+                            })
+                        })
+                        .collect();
+                    for h in handles {
+                        h.join().unwrap();
+                    }
+                });
+            })
+        });
+    }
+
     let mut trainer: TrainerWrapper = BpeTrainerBuilder::default()
         .show_progress(false)
         .build()
diff --git a/tokenizers/src/normalizers/replace.rs b/tokenizers/src/normalizers/replace.rs
index 5657574830..8def55068f 100644
--- a/tokenizers/src/normalizers/replace.rs
+++ b/tokenizers/src/normalizers/replace.rs
@@ -1,4 +1,3 @@
-use crate::tokenizer::pattern::Pattern;
 use crate::tokenizer::Decoder;
 use crate::tokenizer::{NormalizedString, Normalizer, Result};
 use crate::utils::SysRegex;
diff --git a/tokenizers/src/tokenizer/pattern.rs b/tokenizers/src/tokenizer/pattern.rs
index a2a2f16841..7e15682646 100644
--- a/tokenizers/src/tokenizer/pattern.rs
+++ b/tokenizers/src/tokenizer/pattern.rs
@@ -62,23 +62,7 @@ impl Pattern for &Regex {
 }
 
 impl Pattern for &SysRegex {
     fn find_matches(&self, inside: &str) -> Result<Vec<(Offsets, bool)>> {
-        if inside.is_empty() {
-            return Ok(vec![((0, 0), false)]);
-        }
-
-        let mut prev = 0;
-        let mut splits = Vec::with_capacity(inside.len());
-        for (start, end) in self.find_iter(inside) {
-            if prev != start {
-                splits.push(((prev, start), false));
-            }
-            splits.push(((start, end), true));
-            prev = end;
-        }
-        if prev != inside.len() {
-            splits.push(((prev, inside.len()), false))
-        }
-        Ok(splits)
+        SysRegex::find_matches(self, inside)
     }
 }
diff --git a/tokenizers/src/utils/fancy.rs b/tokenizers/src/utils/fancy.rs
index bbcf653115..73f9ea2d44 100644
--- a/tokenizers/src/utils/fancy.rs
+++ b/tokenizers/src/utils/fancy.rs
@@ -18,6 +18,30 @@ impl SysRegex {
             regex: Regex::new(regex_str)?,
         })
     }
+
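+    /// Splits `inside` into alternating unmatched/matched spans that cover the
+    /// whole string. Illustrative example: the pattern `\d+` over `"ab12cd"`
+    /// yields `[((0, 2), false), ((2, 4), true), ((4, 6), false)]`.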
+    pub fn find_matches(
+        &self,
+        inside: &str,
+    ) -> Result<Vec<((usize, usize), bool)>, Box<dyn Error + Send + Sync>> {
+        if inside.is_empty() {
+            return Ok(vec![((0, 0), false)]);
+        }
+
+        let mut prev = 0;
+        let mut splits = Vec::with_capacity(inside.len());
+        for matched in self.regex.find_iter(inside) {
+            let matched = matched?;
+            if prev != matched.start() {
+                splits.push(((prev, matched.start()), false));
+            }
+            splits.push(((matched.start(), matched.end()), true));
+            prev = matched.end();
+        }
+        if prev != inside.len() {
+            splits.push(((prev, inside.len()), false));
+        }
+        Ok(splits)
+    }
 }
 
 pub struct Matches<'r, 't>(fancy_regex::Matches<'r, 't>);
diff --git a/tokenizers/src/utils/mod.rs b/tokenizers/src/utils/mod.rs
index 636bee660d..3dc4284177 100644
--- a/tokenizers/src/utils/mod.rs
+++ b/tokenizers/src/utils/mod.rs
@@ -2,17 +2,24 @@ pub(crate) mod cache;
 #[cfg(feature = "http")]
 pub(crate) mod from_pretrained;
 
-#[cfg(all(feature = "fancy-regex", not(feature = "onig")))]
-mod fancy;
-#[cfg(all(feature = "fancy-regex", not(feature = "onig")))]
-pub use fancy::SysRegex;
-#[cfg(feature = "onig")]
+// Regex backend priority: pcre2 (JIT) > onig > fancy-regex
+#[cfg(feature = "pcre2")]
+mod pcre2_backend;
+#[cfg(feature = "pcre2")]
+pub use pcre2_backend::SysRegex;
+
+#[cfg(all(feature = "onig", not(feature = "pcre2")))]
 mod onig;
-#[cfg(feature = "onig")]
+#[cfg(all(feature = "onig", not(feature = "pcre2")))]
 pub use crate::utils::onig::SysRegex;
 
-#[cfg(not(any(feature = "onig", feature = "fancy-regex")))]
-compile_error!("One of the `onig`, or `fancy-regex` features must be enabled");
+#[cfg(all(feature = "fancy-regex", not(feature = "onig"), not(feature = "pcre2")))]
+mod fancy;
+#[cfg(all(feature = "fancy-regex", not(feature = "onig"), not(feature = "pcre2")))]
+pub use fancy::SysRegex;
+
+#[cfg(not(any(feature = "onig", feature = "fancy-regex", feature = "pcre2")))]
+compile_error!("One of the `pcre2`, `onig`, or `fancy-regex` features must be enabled");
 
 pub mod iter;
 pub mod padding;
diff --git a/tokenizers/src/utils/onig.rs b/tokenizers/src/utils/onig.rs
index 27a10f007d..f51591a038 100644
--- a/tokenizers/src/utils/onig.rs
+++ b/tokenizers/src/utils/onig.rs
@@ -20,6 +20,26 @@ impl SysRegex {
             regex: Regex::new(regex_str)?,
         })
     }
+
+    pub fn find_matches(&self, inside: &str) -> Result<Vec<(Offsets, bool)>> {
+        if inside.is_empty() {
+            return Ok(vec![((0, 0), false)]);
+        }
+
+        let mut prev = 0;
+        let mut splits = Vec::with_capacity(inside.len());
+        for (start, end) in self.regex.find_iter(inside) {
+            if prev != start {
+                splits.push(((prev, start), false));
+            }
+            splits.push(((start, end), true));
+            prev = end;
+        }
+        if prev != inside.len() {
+            splits.push(((prev, inside.len()), false));
+        }
+        Ok(splits)
+    }
 }
 
 impl Pattern for &Regex {
diff --git a/tokenizers/src/utils/pcre2_backend.rs b/tokenizers/src/utils/pcre2_backend.rs
new file mode 100644
index 0000000000..c2711cd45b
--- /dev/null
+++ b/tokenizers/src/utils/pcre2_backend.rs
@@ -0,0 +1,507 @@
+use crate::tokenizer::pattern::Pattern;
+use crate::utils::parallelism::{current_num_threads, get_parallelism, MaybeParallelIterator};
+use crate::Offsets;
+use std::cell::Cell;
+use std::error::Error;
+use std::sync::atomic::{AtomicUsize, Ordering};
+use std::sync::OnceLock;
+
+/// Minimum chunk size (bytes) for parallel regex matching.
+/// Parallel matching only triggers for inputs at least twice this size.
+const MIN_CHUNK_SIZE: usize = 4 * 1024;
+
+/// Overlap added to each chunk so matches that start near a boundary are still
+/// seen in full by the owning chunk.
+const CHUNK_OVERLAP: usize = 1024;
+
+/// Cap the pool size: more copies waste memory without benefit, since
+/// concurrent encode calls on the same tokenizer are typically limited
+/// by the number of physical cores doing real work.
+const MAX_POOL_SIZE: usize = 32;
+
+/// Number of pre-compiled regex copies.
+fn pool_size() -> usize {
+    static CACHED: OnceLock<usize> = OnceLock::new();
+    *CACHED.get_or_init(|| {
+        std::thread::available_parallelism()
+            .map(|n| n.get().min(MAX_POOL_SIZE))
+            .unwrap_or(1)
+    })
+}
+
+/// Global counter for assigning thread-local pool indices.
+static THREAD_COUNTER: AtomicUsize = AtomicUsize::new(0);
+
+thread_local! {
+    /// Each thread gets a stable index into the regex pool.
+    static THREAD_INDEX: Cell<usize> = Cell::new(
+        THREAD_COUNTER.fetch_add(1, Ordering::Relaxed)
+    );
+}
+
+#[inline]
+fn thread_index(pool_len: usize) -> usize {
+    THREAD_INDEX.with(|c| c.get()) % pool_len
+}
+
+/// A single PCRE2 regex instance. Each instance maintains its own DFA cache,
+/// so sharing one across threads causes cache thrashing.
+struct Pcre2Regex {
+    inner: pcre2::bytes::Regex,
+}
+
+impl Pcre2Regex {
+    fn compile(pattern: &str) -> Result<Self, Box<dyn Error + Send + Sync>> {
+        let inner = pcre2::bytes::RegexBuilder::new()
+            .utf(true)
+            .ucp(true)
+            .jit_if_available(true)
+            .build(pattern)?;
+        Ok(Self { inner })
+    }
+
+    fn find_at(&self, text: &[u8], offset: usize) -> Option<(usize, usize)> {
+        match self.inner.find_at(text, offset) {
+            Ok(Some(m)) => Some((m.start(), m.end())),
+            _ => None,
+        }
+    }
+}
+
+// Safety: pcre2::bytes::Regex is Send+Sync. Each Pcre2Regex instance has its
+// own match context, so concurrent find_at calls on *different* instances are safe.
+unsafe impl Send for Pcre2Regex {}
+unsafe impl Sync for Pcre2Regex {}
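+
+// Illustrative mapping, not part of the public API: with pool_size() == 8,
+// the first thread to touch THREAD_INDEX takes counter value 0 and thus slot
+// 0, the ninth takes counter 8, which wraps to slot 8 % 8 == 0. A thread's
+// slot never changes, so its regex copy's JIT/match state stays warm.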
+
+/// PCRE2-backed regex with JIT compilation and per-thread copies.
+///
+/// Pre-compiles a pool of independent PCRE2 regex instances at construction time
+/// (capped at 32). Each thread picks its own copy via a stable thread-local index,
+/// avoiding DFA cache contention under concurrent use.
+///
+/// Falls back to `fancy_regex` at runtime if PCRE2 compilation fails for a
+/// particular regex pattern.
+pub struct SysRegex {
+    /// Per-thread PCRE2 instances. None if PCRE2 compilation failed.
+    pcre2_pool: Option<Vec<Pcre2Regex>>,
+    /// Fallback regex, always available.
+    fallback: fancy_regex::Regex,
+}
+
+impl std::fmt::Debug for SysRegex {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        if self.pcre2_pool.is_some() {
+            write!(f, "SysRegex(pcre2-jit)")
+        } else {
+            write!(f, "SysRegex(fancy-regex-fallback)")
+        }
+    }
+}
+
+impl SysRegex {
+    pub fn new(regex_str: &str) -> Result<Self, Box<dyn Error + Send + Sync>> {
+        // Always compile fancy-regex as a fallback
+        let fallback = fancy_regex::Regex::new(regex_str)?;
+
+        // Try PCRE2: compile N independent copies for per-thread use
+        let n = pool_size();
+        let pcre2_pool = (0..n)
+            .map(|_| Pcre2Regex::compile(regex_str))
+            .collect::<Result<Vec<_>, _>>()
+            .ok();
+
+        Ok(Self {
+            pcre2_pool,
+            fallback,
+        })
+    }
+
+    pub fn find_iter<'r, 't>(&'r self, inside: &'t str) -> Matches<'r, 't> {
+        if let Some(pool) = &self.pcre2_pool {
+            let idx = thread_index(pool.len());
+            Matches::Pcre2(Pcre2Matches {
+                regex: &pool[idx],
+                text: inside.as_bytes(),
+                offset: 0,
+            })
+        } else {
+            Matches::Fancy(FancyMatches(self.fallback.find_iter(inside)))
+        }
+    }
+
+    pub fn find_matches(
+        &self,
+        inside: &str,
+    ) -> Result<Vec<(Offsets, bool)>, Box<dyn Error + Send + Sync>> {
+        if inside.is_empty() {
+            return Ok(vec![((0, 0), false)]);
+        }
+
+        let matches = match &self.pcre2_pool {
+            Some(pool) if should_parallelize(inside.len(), pool.len()) => {
+                find_matches_pcre2_parallel(inside, pool)?
+            }
+            // Use this thread's pool slot so concurrent encodes on different
+            // threads do not contend on a single shared instance.
+            Some(pool) => find_matches_pcre2(inside, 0, &pool[thread_index(pool.len())])?,
+            None => find_matches_fancy(inside, &self.fallback),
+        };
+
+        Ok(matches_to_splits(&matches, inside.len()))
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Match iterators
+// ---------------------------------------------------------------------------
+
+pub enum Matches<'r, 't> {
+    Pcre2(Pcre2Matches<'r, 't>),
+    Fancy(FancyMatches<'r, 't>),
+}
+
+impl Iterator for Matches<'_, '_> {
+    type Item = (usize, usize);
+
+    #[inline]
+    fn next(&mut self) -> Option<Self::Item> {
+        match self {
+            Matches::Pcre2(m) => m.next(),
+            Matches::Fancy(m) => m.next(),
+        }
+    }
+}
+
+pub struct Pcre2Matches<'r, 't> {
+    regex: &'r Pcre2Regex,
+    text: &'t [u8],
+    offset: usize,
+}
+
+impl Iterator for Pcre2Matches<'_, '_> {
+    type Item = (usize, usize);
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.offset > self.text.len() {
+            return None;
+        }
+        let (start, end) = self.regex.find_at(self.text, self.offset)?;
+        // Advance past this match (handle zero-length matches)
+        if end == self.offset {
+            self.offset += 1;
+            // Skip to next valid UTF-8 boundary
+            while self.offset < self.text.len() && (self.text[self.offset] & 0xC0) == 0x80 {
+                self.offset += 1;
+            }
+        } else {
+            self.offset = end;
+        }
+        Some((start, end))
+    }
+}
+
+pub struct FancyMatches<'r, 't>(fancy_regex::Matches<'r, 't>);
+
+impl Iterator for FancyMatches<'_, '_> {
+    type Item = (usize, usize);
+
+    fn next(&mut self) -> Option<Self::Item> {
+        match self.0.next() {
+            Some(Ok(m)) => Some((m.start(), m.end())),
+            _ => None,
+        }
+    }
+}
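+
+// Zero-length matches need explicit advancement: a pattern such as a lone
+// lookahead can match at the current offset without consuming input, so the
+// iterator above bumps `offset` by one code point (skipping 0b10xxxxxx
+// continuation bytes) to guarantee forward progress.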
+
+impl Pattern for &pcre2::bytes::Regex {
+    fn find_matches(
+        &self,
+        inside: &str,
+    ) -> Result<Vec<(Offsets, bool)>, Box<dyn Error + Send + Sync>> {
+        if inside.is_empty() {
+            return Ok(vec![((0, 0), false)]);
+        }
+
+        let mut prev = 0;
+        let mut splits = Vec::with_capacity(inside.len());
+        let text = inside.as_bytes();
+        let mut offset = 0;
+        while offset <= text.len() {
+            match self.find_at(text, offset) {
+                Ok(Some(m)) => {
+                    let start = m.start();
+                    let end = m.end();
+                    if prev != start {
+                        splits.push(((prev, start), false));
+                    }
+                    splits.push(((start, end), true));
+                    prev = end;
+                    offset = advance_after_match(text, offset, end);
+                }
+                Ok(None) => break,
+                Err(err) => return Err(Box::new(err)),
+            }
+        }
+        if prev != inside.len() {
+            splits.push(((prev, inside.len()), false));
+        }
+        Ok(splits)
+    }
+}
+
+#[inline]
+fn should_parallelize(input_len: usize, pool_len: usize) -> bool {
+    get_parallelism() && pool_len >= 2 && input_len >= MIN_CHUNK_SIZE * 2
+}
+
+fn find_matches_fancy(inside: &str, regex: &fancy_regex::Regex) -> Vec<(usize, usize)> {
+    let mut matches = Vec::new();
+    for matched in regex.find_iter(inside) {
+        match matched {
+            Ok(m) if m.start() != m.end() => matches.push((m.start(), m.end())),
+            Ok(_) => {}
+            Err(_) => break,
+        }
+    }
+    matches
+}
+
+fn find_matches_pcre2(
+    text: &str,
+    base: usize,
+    regex: &Pcre2Regex,
+) -> Result<Vec<(usize, usize)>, Box<dyn Error + Send + Sync>> {
+    let bytes = text.as_bytes();
+    let mut matches = Vec::new();
+    let mut pos = 0;
+
+    while pos <= bytes.len() {
+        match regex.inner.find_at(bytes, pos) {
+            Ok(Some(m)) => {
+                if m.start() != m.end() {
+                    matches.push((base + m.start(), base + m.end()));
+                }
+                pos = advance_after_match(bytes, pos, m.end());
+            }
+            Ok(None) => break,
+            Err(err) => return Err(Box::new(err)),
+        }
+    }
+
+    Ok(matches)
+}
+
+fn find_matches_pcre2_parallel(
+    text: &str,
+    pool: &[Pcre2Regex],
+) -> Result<Vec<(usize, usize)>, Box<dyn Error + Send + Sync>> {
+    let n_chunks = current_num_threads()
+        .min(text.len() / MIN_CHUNK_SIZE)
+        .min(pool.len())
+        .max(2);
+    let nominal = text.len() / n_chunks;
+
+    let mut auth = vec![0usize];
+    for i in 1..n_chunks {
+        let boundary = snap_char_ceil(text, i * nominal);
+        if boundary > *auth.last().unwrap() && boundary < text.len() {
+            auth.push(boundary);
+        }
+    }
+    auth.push(text.len());
+
+    let actual = auth.len() - 1;
+    if actual < 2 {
+        return find_matches_pcre2(text, 0, &pool[0]);
+    }
+
+    let chunk_results = (0..actual)
+        .into_maybe_par_iter_cond(actual >= 2)
+        .map(|i| {
+            let auth_start = auth[i];
+            let auth_end = auth[i + 1];
+            let chunk_end = snap_char_ceil(text, (auth_end + CHUNK_OVERLAP).min(text.len()));
+            let chunk = &text[auth_start..chunk_end];
+            let matches = find_matches_pcre2(chunk, auth_start, &pool[i])?;
+            Ok::<_, Box<dyn Error + Send + Sync>>(
+                matches
+                    .into_iter()
+                    .filter(|&(start, _)| start < auth_end)
+                    .collect::<Vec<_>>(),
+            )
+        })
+        .collect::<Result<Vec<_>, _>>()?;
+
+    merge_chunk_matches(chunk_results, |pos| {
+        find_next_match_from(text, pos, &pool[0])
+    })
+}
+
+fn find_next_match_from(
+    text: &str,
+    pos: usize,
+    regex: &Pcre2Regex,
+) -> Result<Option<(usize, usize)>, Box<dyn Error + Send + Sync>> {
+    let bytes = text.as_bytes();
+    let mut offset = pos;
+    loop {
+        if offset >= bytes.len() {
+            return Ok(None);
+        }
+        match regex.inner.find_at(bytes, offset) {
+            Ok(Some(m)) if m.start() != m.end() => return Ok(Some((m.start(), m.end()))),
+            Ok(Some(m)) => {
+                offset = advance_after_match(bytes, offset, m.end());
+            }
+            Ok(None) => return Ok(None),
+            Err(err) => return Err(Box::new(err)),
+        }
+    }
+}
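+
+// Worked example (illustrative offsets) of the boundary problem the merge
+// below solves: with authoritative chunks [0, 100) and [100, 200) and a real
+// match spanning (90, 110), chunk 0 sees the whole match through its overlap,
+// while chunk 1, scanning from byte 100, may report a partial "ghost" such as
+// (100, 110). Ghosts that fall inside an accepted match are dropped; a ghost
+// ending *past* the accepted match means the overlap was too short, so the
+// merge re-runs the real regex from the accepted match's start to resync.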
+
+fn merge_chunk_matches(
+    chunks: Vec<Vec<(usize, usize)>>,
+    mut find_from: impl FnMut(usize) -> Result<Option<(usize, usize)>, Box<dyn Error + Send + Sync>>,
+) -> Result<Vec<(usize, usize)>, Box<dyn Error + Send + Sync>> {
+    let total = chunks.iter().map(Vec::len).sum();
+    let mut flat = Vec::with_capacity(total);
+    for chunk in chunks {
+        flat.extend(chunk);
+    }
+
+    if flat.is_empty() {
+        return Ok(flat);
+    }
+
+    let mut result = Vec::with_capacity(flat.len());
+    let mut prev_end = 0;
+    let mut idx = 0;
+
+    while idx < flat.len() {
+        if flat[idx].0 >= prev_end {
+            result.push(flat[idx]);
+            prev_end = flat[idx].1;
+            idx += 1;
+            continue;
+        }
+
+        let mut max_ghost_end = 0usize;
+        while idx < flat.len() && flat[idx].0 < prev_end {
+            max_ghost_end = max_ghost_end.max(flat[idx].1);
+            idx += 1;
+        }
+
+        if max_ghost_end > prev_end {
+            if let Some(&(trunc_start, _)) = result.last() {
+                result.pop();
+                if let Some((start, end)) = find_from(trunc_start)? {
+                    result.push((start, end));
+                    prev_end = end;
+                } else {
+                    prev_end = result.last().map_or(0, |&(_, end)| end);
+                }
+            }
+
+            while idx < flat.len() && flat[idx].0 < prev_end {
+                idx += 1;
+            }
+        }
+
+        if idx < flat.len() && flat[idx].0 > prev_end {
+            let remaining = &flat[idx..];
+            let mut pos = prev_end;
+
+            loop {
+                match find_from(pos)? {
+                    Some((start, end)) => {
+                        let limit = remaining.len().min(64);
+                        if let Some(offset) =
+                            remaining[..limit].iter().position(|&m| m == (start, end))
+                        {
+                            idx += offset;
+                            break;
+                        }
+                        result.push((start, end));
+                        prev_end = end;
+                        pos = end;
+                    }
+                    None => {
+                        idx = flat.len();
+                        break;
+                    }
+                }
+            }
+        }
+    }
+
+    Ok(result)
+}
+
+fn matches_to_splits(matches: &[(usize, usize)], input_len: usize) -> Vec<(Offsets, bool)> {
+    let mut prev = 0;
+    let mut splits = Vec::with_capacity(matches.len() * 2 + 1);
+    for &(start, end) in matches {
+        if prev != start {
+            splits.push(((prev, start), false));
+        }
+        splits.push(((start, end), true));
+        prev = end;
+    }
+    if prev != input_len {
+        splits.push(((prev, input_len), false));
+    }
+    splits
+}
+
+#[inline]
+fn advance_after_match(bytes: &[u8], current: usize, end: usize) -> usize {
+    if end != current {
+        return end;
+    }
+
+    let mut next = current + 1;
+    while next < bytes.len() && (bytes[next] & 0xC0) == 0x80 {
+        next += 1;
+    }
+    next
+}
+
+#[inline]
+fn snap_char_ceil(text: &str, mut pos: usize) -> usize {
+    while pos < text.len() && !text.is_char_boundary(pos) {
+        pos += 1;
+    }
+    pos
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn long_input_parallel_matches_sequential() {
+        let regex = SysRegex::new(
+            r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+",
+        )
+        .unwrap();
+
+        let input = format!(
+            "{} {} {}{}",
+            "a".repeat(MIN_CHUNK_SIZE + 321),
+            "123".repeat(MIN_CHUNK_SIZE / 3),
+            "!".repeat(MIN_CHUNK_SIZE + 137),
+            " tail"
+        );
+
+        let sequential = match &regex.pcre2_pool {
+            Some(pool) => matches_to_splits(
+                &find_matches_pcre2(&input, 0, &pool[0]).unwrap(),
+                input.len(),
+            ),
+            None => panic!("PCRE2 compilation failed, cannot run parallel match test"),
+        };
+
+        assert_eq!(regex.find_matches(&input).unwrap(), sequential);
+    }
+}
diff --git a/tokenizers/tests/stream.rs b/tokenizers/tests/stream.rs
index c4cfee3ddf..58082f74f3 100644
--- a/tokenizers/tests/stream.rs
+++ b/tokenizers/tests/stream.rs
@@ -1,9 +1,25 @@
 use tokenizers::{
     normalizers,
+    parallelism::{get_parallelism, set_parallelism},
     pre_tokenizers::split::{Split, SplitPattern},
     AddedToken, NormalizerWrapper, PreTokenizerWrapper, SplitDelimiterBehavior, Tokenizer,
 };
+
+#[cfg(feature = "pcre2")]
+use std::sync::{LazyLock, Mutex};
+
+#[cfg(feature = "pcre2")]
+static PARALLELISM_LOCK: LazyLock<Mutex<()>> = LazyLock::new(|| Mutex::new(()));
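+
+// `set_parallelism` mutates process-wide state, so the tests below hold
+// PARALLELISM_LOCK for their whole body and restore the previous setting
+// via `with_parallelism` to avoid interleaving with each other.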
"Hello, y'all! How are you 😁 ? 12345 -- Καλημέρα.\n"; + let input = chunk.repeat(600); + assert!(input.len() > 16_384); + + let sequential = with_parallelism(false, || tokenizer.encode(input.as_str(), false).unwrap()); + let parallel = with_parallelism(true, || tokenizer.encode(input.as_str(), false).unwrap()); + + assert_eq!(sequential.get_ids(), parallel.get_ids()); + assert_eq!(sequential.get_tokens(), parallel.get_tokens()); +} + +#[cfg(feature = "pcre2")] +#[test] +fn test_long_context_char_offsets_match_sequential() { + let _guard = PARALLELISM_LOCK.lock().unwrap(); + let tokenizer = Tokenizer::from_file("data/llama-3-tokenizer.json").unwrap(); + let chunk = "Hello, y'all! How are you 😁 ? 12345 -- Καλημέρα.\n"; + let input = chunk.repeat(600); + assert!(input.len() > 16_384); + + let sequential = with_parallelism(false, || { + tokenizer + .encode_char_offsets(input.as_str(), false) + .unwrap() + }); + let parallel = with_parallelism(true, || { + tokenizer + .encode_char_offsets(input.as_str(), false) + .unwrap() + }); + + assert_eq!(sequential.get_ids(), parallel.get_ids()); + assert_eq!(sequential.get_tokens(), parallel.get_tokens()); + assert_eq!(sequential.get_offsets(), parallel.get_offsets()); +}