Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,22 @@
# Changelog

## [Unreleased]

### Added

- **`count_tokens()`**: new convenience method on `Tokenizer` that returns the number of tokens in a text directly; equivalent to `len(tok.encode(text))`, but states the intent more clearly
- **`get_model_info()`**: promoted to public API — returns vocab size, family, description, regex pattern, special token metadata, and the byte-remap flag for any built-in model

### Fixed

- **Version string**: `__version__` corrected from `1.0.0` to `1.1.0` (was missed in the v1.1.0 release)
- **Special tokens for qwen35**: `from_pretrained("qwen35")` now correctly applies special tokens (`<|endoftext|>`, `<|im_start|>`, `<|im_end|>`). Models whose special token IDs overlap with byte or merge IDs (deepseek-v4, llama4, minicpm5) now emit a clear warning explaining why special tokens cannot be applied
- **Streaming decode performance**: byte-remap models (cl100k_base, o200k_base, p50k_base, r50k_base) now use a cached O(1) vocab lookup instead of a full batch decode per token, making streaming decode ~100× faster for GPT-family models

### Changed

- **`encode_ordinary` docs**: improved docstring to clearly explain the difference from `encode()` and the behaviour with special tokens

## [1.1.0] — 2026-06-13

### Added
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ class Tokenizer:
def __init__(self, merges, *, bytes_maps=None, pat_str=None, special_tokens=None)
def encode(self, text: str) -> list[int]
def encode_ordinary(self, text: str) -> list[int]
def count_tokens(self, text: str) -> int
def decode(self, ids: list[int]) -> str
def stream_decode(self, callback: Callable[[str], None]) -> Callable[[int], None]
def stream_decode_reset(self) -> None
Expand Down Expand Up @@ -169,6 +170,7 @@ class Trainer(bpe.Trainer):

```python
def list_models() -> list[str]
def get_model_info(name: str) -> dict # returns vocab_size, family, description, pat_str, special_tokens, has_byte_remap
```

### File I/O
Expand Down
2 changes: 2 additions & 0 deletions README_zh.md
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ class Tokenizer:
def __init__(self, merges, *, bytes_maps=None, pat_str=None, special_tokens=None)
def encode(self, text: str) -> list[int]
def encode_ordinary(self, text: str) -> list[int]
def count_tokens(self, text: str) -> int
def decode(self, ids: list[int]) -> str
def stream_decode(self, callback: Callable[[str], None]) -> Callable[[int], None]
def stream_decode_reset(self) -> None
Expand Down Expand Up @@ -169,6 +170,7 @@ class Trainer(bpe.Trainer):

```python
def list_models() -> list[str]
def get_model_info(name: str) -> dict # 返回 vocab_size、family、description、pat_str、special_tokens、has_byte_remap
```

### 文件 I/O
Expand Down
12 changes: 12 additions & 0 deletions tinybpe/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
special token handling, byte remapping, and streaming decode.
- :func:`list_models` — list built-in models available via
:meth:`Tokenizer.from_pretrained`.
- :func:`get_model_info` — get detailed metadata for a built-in model
(vocab size, description, family, regex pattern, special tokens).
- :class:`Trainer` — train BPE models from text corpora.
- :func:`load_model` / :func:`save_model` — ``.tbm`` model file I/O.
- :func:`load_vocab` / :func:`save_vocab` — ``.vocab`` vocabulary file I/O.
Expand Down Expand Up @@ -41,12 +43,21 @@
>>> import tinybpe
>>> tinybpe.list_models()
['cl100k_base', 'deepseek-v4', 'llama4', 'minicpm5', 'o200k_base', 'p50k_base', 'qwen35', 'r50k_base']

Get model metadata::

>>> info = tinybpe.get_model_info("cl100k_base")
>>> info["vocab_size"]
100277
>>> info["family"]
'GPT-4'
"""

__all__ = [
"Tokenizer",
"Trainer",
"__version__",
"get_model_info",
"list_models",
"load_model",
"load_vocab",
Expand All @@ -58,6 +69,7 @@
from tinybpe._model_io import load_vocab as load_vocab
from tinybpe._model_io import save_model as save_model
from tinybpe._model_io import save_vocab as save_vocab
from tinybpe._registry import get_model_info as get_model_info
from tinybpe._registry import list_models as list_models
from tinybpe._version import __version__ as __version__
from tinybpe.tokenizer import Tokenizer as Tokenizer
Expand Down
38 changes: 32 additions & 6 deletions tinybpe/_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,16 +72,42 @@ def _load_registry() -> tuple[dict[str, ModelInfo], dict[str, str | None]]:
pat_ref: str = entry.get("pattern", "none")
pat_str = pattern_map.get(pat_ref)

# Special tokens are only valid for byte-remap models (tiktoken)
# where token IDs are preserved. For ID-remapped models, the
# remapped IDs differ from the originals.
raw_special: dict[str, int] | None = entry.get("special_tokens")
special_tokens: dict[str, int] | None
if entry.get("has_byte_remap", False) and raw_special:
has_remap = entry.get("has_byte_remap", False)

if has_remap and raw_special:
# Byte-remap models (tiktoken): special token IDs are the
# original model IDs and are always safe to apply — they
# sit outside the 0-255 byte range and the merge range.
special_tokens = raw_special
elif raw_special:
# ID-remapped: store for reference but don't apply
special_tokens = None
# Non-remap models: special token IDs must not overlap with
# byte values (0-255) or merge-derived IDs (256 .. vocab_size-1).
# If any special token ID falls inside the vocab range,
# decoding would be ambiguous — the C tokenizer cannot tell
# whether that ID means a vocab token or a special token.
max_vocab_id = entry["vocab_size"] - 1
conflicting = [
(tok, tid) for tok, tid in raw_special.items() if tid <= max_vocab_id
]
if conflicting:
import warnings

conflicting_repr = ", ".join(
f"{tok!r}→{tid}" for tok, tid in conflicting
)
warnings.warn(
f"Model {entry['name']!r}: special tokens overlap with byte or "
f"merge IDs ({conflicting_repr}). "
f"Special tokens will not be applied for this model. "
f"To fix, re-convert the model so special token IDs start "
f"at or above {max_vocab_id + 1}.",
stacklevel=2,
)
special_tokens = None
else:
special_tokens = raw_special
else:
special_tokens = None

Expand Down
2 changes: 1 addition & 1 deletion tinybpe/_version.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
"""TinyBPE version."""

__version__ = "1.0.0"
__version__ = "1.1.0"
73 changes: 65 additions & 8 deletions tinybpe/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,8 @@ class Tokenizer:
>>> tok = Tokenizer(merges, pat_str=r"\\w+|\\s+")
>>> ids = tok.encode("hello world")
>>> text = tok.decode(ids)
>>> tok.count_tokens("hello world")
2
"""

def __init__(
Expand Down Expand Up @@ -145,16 +147,41 @@ def __init__(
# ---- streaming decode state ----
self._stream_cache: bytes = b""

# ---- cached inverse-remapped vocab for fast streaming decode ----
# When bytes_maps is set, _decode_remap needs O(1) single-token
# lookup. self._enc.vocab rebuilds the dict on every access,
# so we build it once here and cache the inverse-remapped result.
self._vocab_cache: dict[int, bytes] | None = None
if bytes_maps is not None:
self._vocab_cache = {
k: self._inv_map(v) for k, v in self._enc.vocab.items() # type: ignore[union-attr]
}

# ------------------------------------------------------------------
# Encoding
# ------------------------------------------------------------------

def encode_ordinary(self, text: str) -> list[int]:
"""Encode text without pre-splitting on special tokens.
"""Encode text without regex-splitting on special token patterns.

Unlike :meth:`encode`, this method does **not** scan the input for
special token strings (e.g. ``"<|endoftext|>"``) before encoding.
Instead, the entire text is split into chunks via the pre-tokenizer
regex pattern and each chunk is BPE-encoded directly.

.. note::

The underlying C tokenizer is still configured with the same
vocabulary and special tokens. If a pre-tokenizer chunk happens
to consist entirely of bytes that match a special token, that
chunk will still be encoded as the special token ID. In
practice this is rare — special token strings like
``"<|endoftext|>"`` span multiple pre-tokenizer chunks and are
only matched as a whole by :meth:`encode`.

Unlike :meth:`encode`, this method does not use the special token
regex pattern to split text before encoding. Note that special
tokens may still be produced if the BPE merges produce them.
This method is named after the ``encode_ordinary`` convention in
tiktoken and is useful when you want consistent encoding behaviour
regardless of whether the tokenizer has special tokens configured.

Parameters
----------
Expand All @@ -165,6 +192,11 @@ def encode_ordinary(self, text: str) -> list[int]:
-------
list[int]
Token ID sequence.

See Also
--------
encode : Encode with special-token-aware regex splitting.
count_tokens : Count the tokens produced by :meth:`encode` (same as ``len(encode(text))``).
"""
chunks = re.findall(self._compiled_pattern, text)
chunk_bytes = [ch.encode("utf-8") for ch in chunks]
Expand Down Expand Up @@ -202,6 +234,24 @@ def encode(self, text: str) -> list[int]:
ids.extend(self.encode_ordinary(part))
return ids

def count_tokens(self, text: str) -> int:
    """Count the tokens that :meth:`encode` yields for ``text``.

    Shorthand for ``len(self.encode(text))``: the text is fully encoded
    and only the length of the resulting ID list is returned, so any
    special tokens emitted by :meth:`encode` are counted as well.

    Parameters
    ----------
    text : str
        Text whose token count is wanted.

    Returns
    -------
    int
        Length of the ID sequence returned by :meth:`encode`.
    """
    token_ids = self.encode(text)
    return len(token_ids)

# ------------------------------------------------------------------
# Decoding
# ------------------------------------------------------------------
Expand Down Expand Up @@ -262,10 +312,17 @@ def _decode(token_id: int) -> None:
self._stream_cache = b""

def _decode_remap(token_id: int) -> None:
assert self._inv_map is not None
text_bytes = self._enc.decode([token_id])
text_bytes = self._inv_map(text_bytes)
text_bytes = self._stream_cache + text_bytes
assert self._vocab_cache is not None
# Fast path: O(1) lookup in cached inverse-remapped vocab.
# Falls back to batch decode only for special token IDs
# (which sit outside the normal vocab range).
token_bytes = self._vocab_cache.get(token_id)
if token_bytes is None:
# Special token (or unknown ID) — use batch decode
assert self._inv_map is not None
token_bytes = self._enc.decode([token_id])
token_bytes = self._inv_map(token_bytes)
text_bytes = self._stream_cache + token_bytes
try:
text = text_bytes.decode("utf-8")
self._stream_cache = b""
Expand Down
Loading