diff --git a/tinybpe/_registry.py b/tinybpe/_registry.py index eb77e07..bf13d9b 100644 --- a/tinybpe/_registry.py +++ b/tinybpe/_registry.py @@ -88,15 +88,11 @@ def _load_registry() -> tuple[dict[str, ModelInfo], dict[str, str | None]]: # decoding would be ambiguous — the C tokenizer cannot tell # whether that ID means a vocab token or a special token. max_vocab_id = entry["vocab_size"] - 1 - conflicting = [ - (tok, tid) for tok, tid in raw_special.items() if tid <= max_vocab_id - ] + conflicting = [(tok, tid) for tok, tid in raw_special.items() if tid <= max_vocab_id] if conflicting: import warnings - conflicting_repr = ", ".join( - f"{tok!r}→{tid}" for tok, tid in conflicting - ) + conflicting_repr = ", ".join(f"{tok!r}→{tid}" for tok, tid in conflicting) warnings.warn( f"Model {entry['name']!r}: special tokens overlap with byte or " f"merge IDs ({conflicting_repr}). " diff --git a/tinybpe/tokenizer.py b/tinybpe/tokenizer.py index f83a10f..ce03adb 100644 --- a/tinybpe/tokenizer.py +++ b/tinybpe/tokenizer.py @@ -154,7 +154,8 @@ def __init__( self._vocab_cache: dict[int, bytes] | None = None if bytes_maps is not None: self._vocab_cache = { - k: self._inv_map(v) for k, v in self._enc.vocab.items() # type: ignore[union-attr] + k: self._inv_map(v) + for k, v in self._enc.vocab.items() # type: ignore[union-attr] } # ------------------------------------------------------------------