From 96736c7af6c20ca38886305af78424eae067365a Mon Sep 17 00:00:00 2001 From: Patrice Billaut Date: Sun, 25 May 2025 10:56:56 +0200 Subject: [PATCH] feat: tokenizer with special tokens --- llm/tokenizer/__init__.py | 6 ++ llm/tokenizer/naive.py | 8 +++ llm/{tokenizer.py => tokenizer/simple_v1.py} | 16 +---- llm/tokenizer/simple_v2.py | 30 +++++++++ tests/test_tokenizer.py | 57 ----------------- tests/test_tokenizer_naive.py | 10 +++ tests/test_tokenizer_simple_v1.py | 56 ++++++++++++++++ tests/test_tokenizer_simple_v2.py | 67 ++++++++++++++++++++ 8 files changed, 179 insertions(+), 71 deletions(-) create mode 100644 llm/tokenizer/__init__.py create mode 100644 llm/tokenizer/naive.py rename llm/{tokenizer.py => tokenizer/simple_v1.py} (70%) create mode 100644 llm/tokenizer/simple_v2.py delete mode 100644 tests/test_tokenizer.py create mode 100644 tests/test_tokenizer_naive.py create mode 100644 tests/test_tokenizer_simple_v1.py create mode 100644 tests/test_tokenizer_simple_v2.py diff --git a/llm/tokenizer/__init__.py b/llm/tokenizer/__init__.py new file mode 100644 index 0000000..c0734d5 --- /dev/null +++ b/llm/tokenizer/__init__.py @@ -0,0 +1,6 @@ +import re +from typing import TypeAlias + +Vocab: TypeAlias = dict[str, int] + +DECODE_PATTERN = re.compile(r"""\s+([,.?!"'()])""") diff --git a/llm/tokenizer/naive.py b/llm/tokenizer/naive.py new file mode 100644 index 0000000..b64573c --- /dev/null +++ b/llm/tokenizer/naive.py @@ -0,0 +1,8 @@ +import re + +NAIVE_TOKENS = re.compile(r"""([_,.;:?!"'()]|--|\s)""") + + +def naive_tokenization(text: str) -> list[str]: + tokens = NAIVE_TOKENS.split(text) + return [text.strip() for text in tokens if text.strip()] diff --git a/llm/tokenizer.py b/llm/tokenizer/simple_v1.py similarity index 70% rename from llm/tokenizer.py rename to llm/tokenizer/simple_v1.py index fa1c927..82d21c0 100644 --- a/llm/tokenizer.py +++ b/llm/tokenizer/simple_v1.py @@ -1,9 +1,5 @@ -import re -from typing import TypeAlias - -Vocab: TypeAlias = dict[str, int] - -NAIVE_TOKENS = re.compile(r"""([_,.;:?!"'()]|--|\s)""") +from llm.tokenizer import DECODE_PATTERN, Vocab +from llm.tokenizer.naive import naive_tokenization def generate_vocab(tokens: list[str]) -> Vocab: @@ -11,14 +7,6 @@ def generate_vocab(tokens: list[str]) -> Vocab: return {token: token_id for token_id, token in enumerate(all_words)} -def naive_tokenization(text: str) -> list[str]: - tokens = NAIVE_TOKENS.split(text) - return [text.strip() for text in tokens if text.strip()] - - -DECODE_PATTERN = re.compile(r"""\s+([,.?!"'()])""") - - class SimpleTokenizerV1: def __init__(self, vocab: Vocab): self.str_to_int = vocab diff --git a/llm/tokenizer/simple_v2.py b/llm/tokenizer/simple_v2.py new file mode 100644 index 0000000..a3c3d60 --- /dev/null +++ b/llm/tokenizer/simple_v2.py @@ -0,0 +1,30 @@ +from collections import defaultdict + +from llm.tokenizer import DECODE_PATTERN, Vocab +from llm.tokenizer.naive import naive_tokenization + + +def generate_vocab(tokens: list[str]) -> Vocab: + all_words = sorted(set(tokens)) + all_words.extend([TOKEN_UNKNOWN, TOKEN_END_OF_TEXT]) + return {token: token_id for token_id, token in enumerate(all_words)} + + +class SimpleTokenizerV2: + def __init__(self, vocab: Vocab): + self.str_to_int = defaultdict(lambda: vocab[TOKEN_UNKNOWN], vocab) + self.int_to_str = {i: s for s, i in vocab.items()} + + def encode(self, text: str) -> list[int]: + pre_processed = naive_tokenization(text) + token_ids = [self.str_to_int[s] for s in pre_processed] + return token_ids + + def decode(self, token_ids: list[int]) -> str: + text = " ".join([self.int_to_str[i] for i in token_ids]) + text = DECODE_PATTERN.sub(r"\1", text) + return text + + +TOKEN_UNKNOWN = "<|unk|>" +TOKEN_END_OF_TEXT = "<|endoftext|>" diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py deleted file mode 100644 index 2d764a8..0000000 --- a/tests/test_tokenizer.py +++ /dev/null @@ -1,57 +0,0 @@ -from pathlib import Path - -import pytest - -from llm.tokenizer import SimpleTokenizerV1, Vocab, generate_vocab, naive_tokenization - - -def test_naive_tokenizer_the_verdict(resource: Path): - path = resource / "the-verdict.txt" - raw_text = path.read_text() - tokens = naive_tokenization(raw_text) - assert len(tokens) == 4690 - - -def test_generate_vocabulary(): - text = "The quick brown fox jumps over the lazy dog" - vocab_tokens = naive_tokenization(text) - vocab = generate_vocab(vocab_tokens) - - assert vocab == { - "The": 0, - "brown": 1, - "dog": 2, - "fox": 3, - "jumps": 4, - "lazy": 5, - "over": 6, - "quick": 7, - "the": 8, - } - - -class TestSimpleTokenizerV1: - TEXT = "The quick brown fox jumps over the lazy dog" - - @pytest.fixture() - def vocab(self) -> Vocab: - vocab_tokens = naive_tokenization(self.TEXT) - return generate_vocab(vocab_tokens) - - def test_encode(self, vocab: Vocab): - tokenizer = SimpleTokenizerV1(vocab) - encoded = tokenizer.encode(self.TEXT) - - assert encoded == [0, 7, 1, 3, 4, 6, 8, 5, 2] - - def test_encode_fails_for_token_not_in_vocab(self, vocab: Vocab): - tokenizer = SimpleTokenizerV1(vocab) - unknown_token = "elephant" - with pytest.raises(KeyError, match=unknown_token): - tokenizer.encode(f"The quick brown {unknown_token} jumps over the lazy dog") - - def test_decode(self, vocab: Vocab): - tokenizer = SimpleTokenizerV1(vocab) - decoded = tokenizer.decode([0, 7, 1, 3, 4, 6, 8, 5, 2]) - - assert decoded == self.TEXT diff --git a/tests/test_tokenizer_naive.py b/tests/test_tokenizer_naive.py new file mode 100644 index 0000000..8549874 --- /dev/null +++ b/tests/test_tokenizer_naive.py @@ -0,0 +1,10 @@ +from pathlib import Path + +from llm.tokenizer.naive import naive_tokenization + + +def test_naive_tokenization(resource: Path): + path = resource / "the-verdict.txt" + raw_text = path.read_text() + tokens = naive_tokenization(raw_text) + assert len(tokens) == 4690 diff --git a/tests/test_tokenizer_simple_v1.py b/tests/test_tokenizer_simple_v1.py new file mode 100644 index 0000000..21b3c2a --- /dev/null +++ b/tests/test_tokenizer_simple_v1.py @@ -0,0 +1,56 @@ +import pytest + +from llm.tokenizer import ( + Vocab, +) +from llm.tokenizer.naive import naive_tokenization +from llm.tokenizer.simple_v1 import SimpleTokenizerV1, generate_vocab + + +@pytest.fixture(scope="module") +def text() -> str: + return "The quick brown fox jumps over the lazy dog" + + +@pytest.fixture() +def vocab(text: str) -> Vocab: + vocab_tokens = naive_tokenization(text) + return generate_vocab(vocab_tokens) + + +def test_generate_vocabulary(text: str): + vocab_tokens = naive_tokenization(text) + vocab = generate_vocab(vocab_tokens) + + assert vocab == { + "The": 0, + "brown": 1, + "dog": 2, + "fox": 3, + "jumps": 4, + "lazy": 5, + "over": 6, + "quick": 7, + "the": 8, + } + + +def test_encode(vocab: Vocab, text: str): + tokenizer = SimpleTokenizerV1(vocab) + encoded = tokenizer.encode(text) + + assert encoded == [0, 7, 1, 3, 4, 6, 8, 5, 2] + + +def test_encode_fails_for_token_not_in_vocab(vocab: Vocab): + tokenizer = SimpleTokenizerV1(vocab) + unknown_token = "elephant" + with pytest.raises(KeyError, match=unknown_token): + tokenizer.encode(f"The quick brown {unknown_token} jumps over the lazy dog") + + +def test_decode(vocab: Vocab, text: str): + tokenizer = SimpleTokenizerV1(vocab) + decoded = tokenizer.decode([0, 7, 1, 3, 4, 6, 8, 5, 2]) + + assert decoded == text diff --git a/tests/test_tokenizer_simple_v2.py b/tests/test_tokenizer_simple_v2.py new file mode 100644 index 0000000..3bb98ae --- /dev/null +++ b/tests/test_tokenizer_simple_v2.py @@ -0,0 +1,67 @@ +import pytest + +from llm.tokenizer import ( + Vocab, +) +from llm.tokenizer.naive import naive_tokenization +from llm.tokenizer.simple_v2 import ( + SimpleTokenizerV2, + TOKEN_END_OF_TEXT, + TOKEN_UNKNOWN, + generate_vocab, +) + + +@pytest.fixture(scope="module") +def text() -> str: + return "The quick brown fox jumps over the lazy dog" + + +@pytest.fixture() +def vocab(text: str) -> Vocab: + vocab_tokens = naive_tokenization(text) + return generate_vocab(vocab_tokens) + + +def test_generate_vocabulary(): + text = "The quick brown fox jumps over the lazy dog" + vocab_tokens = naive_tokenization(text) + vocab = generate_vocab(vocab_tokens) + + assert vocab == { + "The": 0, + "brown": 1, + "dog": 2, + "fox": 3, + "jumps": 4, + "lazy": 5, + "over": 6, + "quick": 7, + "the": 8, + TOKEN_UNKNOWN: 9, + TOKEN_END_OF_TEXT: 10, + } + + +def test_encode(vocab: Vocab, text: str): + tokenizer = SimpleTokenizerV2(vocab) + encoded = tokenizer.encode(text) + + assert encoded == [0, 7, 1, 3, 4, 6, 8, 5, 2] + + +def test_encode_substitutes_unknown_token(vocab: Vocab): + tokenizer = SimpleTokenizerV2(vocab) + unknown_token = "elephant" + encoded = tokenizer.encode( + f"The quick brown {unknown_token} jumps over the lazy dog" + ) + + assert encoded == [0, 7, 1, 9, 4, 6, 8, 5, 2] + + +def test_decode(vocab: Vocab): + tokenizer = SimpleTokenizerV2(vocab) + decoded = tokenizer.decode([0, 7, 1, 9, 4, 6, 8, 5, 2]) + + assert decoded == f"The quick brown {TOKEN_UNKNOWN} jumps over the lazy dog"