pbillaut · pbillaut · May 25, 2025 · May 25, 2025
diff --git a/llm/tokenizer/__init__.py b/llm/tokenizer/__init__.py
@@ -0,0 +1,6 @@
+import re
+from typing import TypeAlias
+
+# Vocabulary type: maps each token string to its integer id.
+Vocab: TypeAlias = dict[str, int]
+
+# Matches a run of whitespace followed by one of the listed punctuation
+# characters and captures the punctuation character. Substituting group 1 for
+# a match removes the whitespace in front of the punctuation.
+DECODE_PATTERN = re.compile(r"""\s+([,.?!"'()])""")
diff --git a/llm/tokenizer/naive.py b/llm/tokenizer/naive.py
@@ -0,0 +1,8 @@
+import re
+
+# Split points: one of the listed punctuation characters, a double dash, or a
+# single whitespace character. The capture group makes split() keep the
+# delimiters in its result, so punctuation survives as tokens.
+NAIVE_TOKENS = re.compile(r"""([_,.;:?!"'()]|--|\s)""")
+
+
+def naive_tokenization(text: str) -> list[str]:
+    """Split *text* into word and punctuation tokens.
+
+    The text is split at every match of ``NAIVE_TOKENS``. Each resulting piece
+    is stripped of surrounding whitespace, and pieces that are empty after
+    stripping are dropped.
+
+    Args:
+        text: The text to tokenize.
+
+    Returns:
+        The non-empty, stripped tokens in their original order.
+    """
+    # Use a loop name distinct from the ``text`` parameter and strip each
+    # piece only once.
+    return [
+        stripped
+        for piece in NAIVE_TOKENS.split(text)
+        if (stripped := piece.strip())
+    ]
diff --git a/llm/tokenizer.py → llm/tokenizer/simple_v1.py b/llm/tokenizer.py → llm/tokenizer/simple_v1.py
@@ -1,24 +1,12 @@
-import re
-from typing import TypeAlias
-
-Vocab: TypeAlias = dict[str, int]
-
-NAIVE_TOKENS = re.compile(r"""([_,.;:?!"'()]|--|\s)""")
+from llm.tokenizer import DECODE_PATTERN, Vocab
+from llm.tokenizer.naive import naive_tokenization
 
 
 def generate_vocab(tokens: list[str]) -> Vocab:
+    """Build a vocabulary from *tokens*.
+
+    Duplicates are removed, the distinct tokens are sorted, and each token is
+    mapped to its position in that sorted order, starting at 0.
+    """
     all_words = sorted(set(tokens))
     return {token: token_id for token_id, token in enumerate(all_words)}
 
 
-def naive_tokenization(text: str) -> list[str]:
-    tokens = NAIVE_TOKENS.split(text)
-    return [text.strip() for text in tokens if text.strip()]
-
-
-DECODE_PATTERN = re.compile(r"""\s+([,.?!"'()])""")
-
-
 class SimpleTokenizerV1:
     def __init__(self, vocab: Vocab):
         self.str_to_int = vocab

diff --git a/llm/tokenizer/simple_v2.py b/llm/tokenizer/simple_v2.py
@@ -0,0 +1,30 @@
+from collections import defaultdict
+
+from llm.tokenizer import DECODE_PATTERN, Vocab
+from llm.tokenizer.naive import naive_tokenization
+
+
+def generate_vocab(tokens: list[str]) -> Vocab:
+    """Build a vocabulary from *tokens* and append the special tokens.
+
+    The distinct tokens are sorted and numbered from 0. ``TOKEN_UNKNOWN`` and
+    ``TOKEN_END_OF_TEXT`` receive the two ids that follow, in that order.
+
+    Args:
+        tokens: Token strings; duplicates are allowed.
+
+    Returns:
+        Mapping from token string to a unique id in ``range(len(vocab))``.
+    """
+    special_tokens = [TOKEN_UNKNOWN, TOKEN_END_OF_TEXT]
+    # Drop special tokens that already occur in the input. Otherwise they
+    # would be listed twice, the later id would overwrite the earlier one in
+    # the dict, and the ids would no longer be contiguous.
+    all_words = sorted(set(tokens).difference(special_tokens))
+    all_words.extend(special_tokens)
+    return {token: token_id for token_id, token in enumerate(all_words)}
+
+
+class SimpleTokenizerV2:
+    """Tokenizer that maps tokens missing from the vocabulary to ``TOKEN_UNKNOWN``.
+
+    Attributes are documented inline in ``__init__``.
+    """
+
+    def __init__(self, vocab: Vocab):
+        """Create a tokenizer for *vocab*.
+
+        Args:
+            vocab: Mapping from token string to token id. It needs an entry
+                for ``TOKEN_UNKNOWN`` if text with unknown tokens is encoded.
+        """
+        # token string -> id; a copy of vocab that falls back to the id of
+        # TOKEN_UNKNOWN when indexed with a missing key.
+        self.str_to_int = defaultdict(lambda: vocab[TOKEN_UNKNOWN], vocab)
+        # id -> token string, the inverse of vocab.
+        self.int_to_str = {i: s for s, i in vocab.items()}
+
+    def encode(self, text: str) -> list[int]:
+        """Convert *text* to a list of token ids.
+
+        Args:
+            text: The text to encode; it is split with ``naive_tokenization``.
+
+        Returns:
+            One id per token. Tokens that are not in the vocabulary get the id
+            of ``TOKEN_UNKNOWN``.
+
+        Raises:
+            KeyError: If an unknown token is found and the vocabulary has no
+                entry for ``TOKEN_UNKNOWN``.
+        """
+        lookup = self.str_to_int
+        # Test membership instead of indexing blindly: indexing a defaultdict
+        # with a missing key stores that key, so every unknown token would be
+        # added to the mapping permanently.
+        return [
+            lookup[token] if token in lookup else lookup[TOKEN_UNKNOWN]
+            for token in naive_tokenization(text)
+        ]
+
+    def decode(self, token_ids: list[int]) -> str:
+        """Convert *token_ids* back to text.
+
+        Args:
+            token_ids: Ids to look up in the vocabulary.
+
+        Returns:
+            The tokens joined by single spaces, with the whitespace in front
+            of the punctuation matched by ``DECODE_PATTERN`` removed.
+
+        Raises:
+            KeyError: If an id is not part of the vocabulary.
+        """
+        joined = " ".join(self.int_to_str[i] for i in token_ids)
+        return DECODE_PATTERN.sub(r"\1", joined)
+
+
+# Placeholder token whose id stands in for tokens missing from the vocabulary.
+TOKEN_UNKNOWN = "<|unk|>"
+# Special token appended to every vocabulary built by generate_vocab.
+# NOTE(review): presumably marks the boundary between texts; it is not used
+# for that anywhere in this module - confirm.
+TOKEN_END_OF_TEXT = "<|endoftext|>"
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
diff --git a/tests/test_tokenizer_naive.py b/tests/test_tokenizer_naive.py
@@ -0,0 +1,10 @@
+from pathlib import Path
+
+from llm.tokenizer.naive import naive_tokenization
+
+
+def test_naive_tokenization(resource: Path):
+    """The sample text file is split into the expected number of tokens."""
+    raw_text = (resource / "the-verdict.txt").read_text()
+
+    assert len(naive_tokenization(raw_text)) == 4690
diff --git a/tests/test_tokenizer_simple_v1.py b/tests/test_tokenizer_simple_v1.py
@@ -0,0 +1,56 @@
+import pytest
+
+from llm.tokenizer import (
+    Vocab,
+)
+from llm.tokenizer.naive import naive_tokenization
+from llm.tokenizer.simple_v1 import SimpleTokenizerV1, generate_vocab
+
+
+@pytest.fixture(scope="module")
+def text() -> str:
+    """Sample sentence shared by the tests in this module."""
+    sentence = "The quick brown fox jumps over the lazy dog"
+    return sentence
+
+
+@pytest.fixture()
+def vocab(text: str) -> Vocab:
+    """Vocabulary generated from the naively tokenized sample sentence."""
+    return generate_vocab(naive_tokenization(text))
+
+
+def test_generate_vocabulary(text: str):
+    """Distinct tokens are numbered in sorted order, starting at 0."""
+    expected = {
+        "The": 0,
+        "brown": 1,
+        "dog": 2,
+        "fox": 3,
+        "jumps": 4,
+        "lazy": 5,
+        "over": 6,
+        "quick": 7,
+        "the": 8,
+    }
+
+    assert generate_vocab(naive_tokenization(text)) == expected
+
+
+def test_encode(vocab: Vocab, text: str):
+    """Encoding the sample sentence yields the id of each word in order."""
+    expected_ids = [0, 7, 1, 3, 4, 6, 8, 5, 2]
+
+    assert SimpleTokenizerV1(vocab).encode(text) == expected_ids
+
+
+def test_encode_fails_for_token_not_in_vocab(vocab: Vocab):
+    """Encoding a word that is missing from the vocabulary raises KeyError."""
+    unknown_token = "elephant"
+    sentence = f"The quick brown {unknown_token} jumps over the lazy dog"
+    tokenizer = SimpleTokenizerV1(vocab)
+
+    with pytest.raises(KeyError, match=unknown_token):
+        tokenizer.encode(sentence)
+
+
+def test_decode(vocab: Vocab, text: str):
+    """Decoding the ids of the sample sentence restores the sentence."""
+    token_ids = [0, 7, 1, 3, 4, 6, 8, 5, 2]
+
+    assert SimpleTokenizerV1(vocab).decode(token_ids) == text
diff --git a/tests/test_tokenizer_simple_v2.py b/tests/test_tokenizer_simple_v2.py
@@ -0,0 +1,67 @@
+import pytest
+
+from llm.tokenizer import (
+    Vocab,
+)
+from llm.tokenizer.naive import naive_tokenization
+from llm.tokenizer.simple_v2 import (
+    SimpleTokenizerV2,
+    TOKEN_END_OF_TEXT,
+    TOKEN_UNKNOWN,
+    generate_vocab,
+)
+
+
+@pytest.fixture(scope="module")
+def text() -> str:
+    """Sample sentence shared by the tests in this module."""
+    sentence = "The quick brown fox jumps over the lazy dog"
+    return sentence
+
+
+@pytest.fixture()
+def vocab(text: str) -> Vocab:
+    """Vocabulary, including special tokens, for the sample sentence."""
+    return generate_vocab(naive_tokenization(text))
+
+
+def test_generate_vocabulary(text: str):
+    """Words are numbered in sorted order, followed by the special tokens.
+
+    The sample sentence comes from the module's ``text`` fixture rather than a
+    second copy of the literal, so every test here uses the same input.
+    """
+    vocab_tokens = naive_tokenization(text)
+    vocab = generate_vocab(vocab_tokens)
+
+    assert vocab == {
+        "The": 0,
+        "brown": 1,
+        "dog": 2,
+        "fox": 3,
+        "jumps": 4,
+        "lazy": 5,
+        "over": 6,
+        "quick": 7,
+        "the": 8,
+        TOKEN_UNKNOWN: 9,
+        TOKEN_END_OF_TEXT: 10,
+    }
+
+
+def test_encode(vocab: Vocab, text: str):
+    """Encoding the sample sentence yields the id of each word in order."""
+    expected_ids = [0, 7, 1, 3, 4, 6, 8, 5, 2]
+
+    assert SimpleTokenizerV2(vocab).encode(text) == expected_ids
+
+
+def test_encode_substitutes_unknown_token(vocab: Vocab):
+    """A word missing from the vocabulary is encoded as id 9."""
+    unknown_token = "elephant"
+    sentence = f"The quick brown {unknown_token} jumps over the lazy dog"
+    expected_ids = [0, 7, 1, 9, 4, 6, 8, 5, 2]
+
+    assert SimpleTokenizerV2(vocab).encode(sentence) == expected_ids
+
+
+def test_decode(vocab: Vocab):
+    """Id 9 is decoded to the text of ``TOKEN_UNKNOWN``."""
+    token_ids = [0, 7, 1, 9, 4, 6, 8, 5, 2]
+    expected = f"The quick brown {TOKEN_UNKNOWN} jumps over the lazy dog"
+
+    assert SimpleTokenizerV2(vocab).decode(token_ids) == expected