From 96736c7af6c20ca38886305af78424eae067365a Mon Sep 17 00:00:00 2001
From: Patrice Billaut <billaut@proton.me>
Date: Sun, 25 May 2025 10:56:56 +0200
Subject: [PATCH] feat: tokenizer with special tokens

---
 llm/tokenizer/__init__.py                    |  6 ++
 llm/tokenizer/naive.py                       |  8 +++
 llm/{tokenizer.py => tokenizer/simple_v1.py} | 16 +----
 llm/tokenizer/simple_v2.py                   | 30 +++++++++
 tests/test_tokenizer.py                      | 57 -----------------
 tests/test_tokenizer_naive.py                | 10 +++
 tests/test_tokenizer_simple_v1.py            | 56 ++++++++++++++++
 tests/test_tokenizer_simple_v2.py            | 67 ++++++++++++++++++++
 8 files changed, 179 insertions(+), 71 deletions(-)
 create mode 100644 llm/tokenizer/__init__.py
 create mode 100644 llm/tokenizer/naive.py
 rename llm/{tokenizer.py => tokenizer/simple_v1.py} (70%)
 create mode 100644 llm/tokenizer/simple_v2.py
 delete mode 100644 tests/test_tokenizer.py
 create mode 100644 tests/test_tokenizer_naive.py
 create mode 100644 tests/test_tokenizer_simple_v1.py
 create mode 100644 tests/test_tokenizer_simple_v2.py

diff --git a/llm/tokenizer/__init__.py b/llm/tokenizer/__init__.py
new file mode 100644
index 0000000..c0734d5
--- /dev/null
+++ b/llm/tokenizer/__init__.py
@@ -0,0 +1,6 @@
+import re
+from typing import TypeAlias
+
+Vocab: TypeAlias = dict[str, int]
+
+DECODE_PATTERN = re.compile(r"""\s+([,.?!"'()])""")
diff --git a/llm/tokenizer/naive.py b/llm/tokenizer/naive.py
new file mode 100644
index 0000000..b64573c
--- /dev/null
+++ b/llm/tokenizer/naive.py
@@ -0,0 +1,17 @@
+"""Regex-based tokenization shared by the simple tokenizers."""
+
+import re
+
+# Splits on punctuation, double dashes and whitespace; the capturing group
+# makes ``split`` return the delimiters as well as the text between them.
+NAIVE_TOKENS = re.compile(r"""([_,.;:?!"'()]|--|\s)""")
+
+
+def naive_tokenization(text: str) -> list[str]:
+    """Split *text* into word and punctuation tokens.
+
+    Whitespace delimiters and empty fragments are dropped; punctuation
+    delimiters are kept as tokens of their own.
+    """
+    stripped = (piece.strip() for piece in NAIVE_TOKENS.split(text))
+    return [piece for piece in stripped if piece]
diff --git a/llm/tokenizer.py b/llm/tokenizer/simple_v1.py
similarity index 70%
rename from llm/tokenizer.py
rename to llm/tokenizer/simple_v1.py
index fa1c927..82d21c0 100644
--- a/llm/tokenizer.py
+++ b/llm/tokenizer/simple_v1.py
@@ -1,9 +1,5 @@
-import re
-from typing import TypeAlias
-
-Vocab: TypeAlias = dict[str, int]
-
-NAIVE_TOKENS = re.compile(r"""([_,.;:?!"'()]|--|\s)""")
+from llm.tokenizer import DECODE_PATTERN, Vocab
+from llm.tokenizer.naive import naive_tokenization
 
 
 def generate_vocab(tokens: list[str]) -> Vocab:
@@ -11,14 +7,6 @@ def generate_vocab(tokens: list[str]) -> Vocab:
     return {token: token_id for token_id, token in enumerate(all_words)}
 
 
-def naive_tokenization(text: str) -> list[str]:
-    tokens = NAIVE_TOKENS.split(text)
-    return [text.strip() for text in tokens if text.strip()]
-
-
-DECODE_PATTERN = re.compile(r"""\s+([,.?!"'()])""")
-
-
 class SimpleTokenizerV1:
     def __init__(self, vocab: Vocab):
         self.str_to_int = vocab
diff --git a/llm/tokenizer/simple_v2.py b/llm/tokenizer/simple_v2.py
new file mode 100644
index 0000000..a3c3d60
--- /dev/null
+++ b/llm/tokenizer/simple_v2.py
@@ -0,0 +1,58 @@
+"""Vocabulary-based tokenizer with unknown and end-of-text special tokens."""
+
+from llm.tokenizer import DECODE_PATTERN, Vocab
+from llm.tokenizer.naive import naive_tokenization
+
+# Defined ahead of their first use so the module reads top to bottom.
+TOKEN_UNKNOWN = "<|unk|>"
+TOKEN_END_OF_TEXT = "<|endoftext|>"
+
+
+def generate_vocab(tokens: list[str]) -> Vocab:
+    """Build a vocabulary from *tokens* plus the special tokens.
+
+    Regular tokens get ids in sorted order; ``TOKEN_UNKNOWN`` and
+    ``TOKEN_END_OF_TEXT`` always receive the last two ids.
+    """
+    special_tokens = [TOKEN_UNKNOWN, TOKEN_END_OF_TEXT]
+    # Drop special tokens already present in the corpus (e.g. documents
+    # joined with the end-of-text marker): listing them twice would leave
+    # an unassigned id in the middle of the vocabulary.
+    all_words = sorted(set(tokens).difference(special_tokens))
+    all_words.extend(special_tokens)
+    return {token: token_id for token_id, token in enumerate(all_words)}
+
+
+class SimpleTokenizerV2:
+    """Tokenizer mapping out-of-vocabulary tokens to ``TOKEN_UNKNOWN``."""
+
+    def __init__(self, vocab: Vocab):
+        """Store a copy of *vocab* and its inverse mapping."""
+        # A plain dict: a defaultdict would insert every unknown token it is
+        # asked for, so the mapping would grow with the encoded inputs.
+        self.str_to_int = dict(vocab)
+        self.int_to_str = {i: s for s, i in vocab.items()}
+
+    def encode(self, text: str) -> list[int]:
+        """Return the token ids of *text*.
+
+        Tokens missing from the vocabulary map to the id of
+        ``TOKEN_UNKNOWN``; if the vocabulary lacks that token as well, a
+        ``KeyError`` is raised.
+        """
+        token_ids = []
+        for token in naive_tokenization(text):
+            token_id = self.str_to_int.get(token)
+            if token_id is None:
+                token_id = self.str_to_int[TOKEN_UNKNOWN]
+            token_ids.append(token_id)
+        return token_ids
+
+    def decode(self, token_ids: list[int]) -> str:
+        """Return the text for *token_ids*.
+
+        Tokens are joined with single spaces, then the whitespace in front
+        of the punctuation matched by ``DECODE_PATTERN`` is removed.
+        """
+        text = " ".join(self.int_to_str[i] for i in token_ids)
+        return DECODE_PATTERN.sub(r"\1", text)
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
deleted file mode 100644
index 2d764a8..0000000
--- a/tests/test_tokenizer.py
+++ /dev/null
@@ -1,57 +0,0 @@
-from pathlib import Path
-
-import pytest
-
-from llm.tokenizer import SimpleTokenizerV1, Vocab, generate_vocab, naive_tokenization
-
-
-def test_naive_tokenizer_the_verdict(resource: Path):
-    path = resource / "the-verdict.txt"
-    raw_text = path.read_text()
-    tokens = naive_tokenization(raw_text)
-    assert len(tokens) == 4690
-
-
-def test_generate_vocabulary():
-    text = "The quick brown fox jumps over the lazy dog"
-    vocab_tokens = naive_tokenization(text)
-    vocab = generate_vocab(vocab_tokens)
-
-    assert vocab == {
-        "The": 0,
-        "brown": 1,
-        "dog": 2,
-        "fox": 3,
-        "jumps": 4,
-        "lazy": 5,
-        "over": 6,
-        "quick": 7,
-        "the": 8,
-    }
-
-
-class TestSimpleTokenizerV1:
-    TEXT = "The quick brown fox jumps over the lazy dog"
-
-    @pytest.fixture()
-    def vocab(self) -> Vocab:
-        vocab_tokens = naive_tokenization(self.TEXT)
-        return generate_vocab(vocab_tokens)
-
-    def test_encode(self, vocab: Vocab):
-        tokenizer = SimpleTokenizerV1(vocab)
-        encoded = tokenizer.encode(self.TEXT)
-
-        assert encoded == [0, 7, 1, 3, 4, 6, 8, 5, 2]
-
-    def test_encode_fails_for_token_not_in_vocab(self, vocab: Vocab):
-        tokenizer = SimpleTokenizerV1(vocab)
-        unknown_token = "elephant"
-        with pytest.raises(KeyError, match=unknown_token):
-            tokenizer.encode(f"The quick brown {unknown_token} jumps over the lazy dog")
-
-    def test_decode(self, vocab: Vocab):
-        tokenizer = SimpleTokenizerV1(vocab)
-        decoded = tokenizer.decode([0, 7, 1, 3, 4, 6, 8, 5, 2])
-
-        assert decoded == self.TEXT
diff --git a/tests/test_tokenizer_naive.py b/tests/test_tokenizer_naive.py
new file mode 100644
index 0000000..8549874
--- /dev/null
+++ b/tests/test_tokenizer_naive.py
@@ -0,0 +1,11 @@
+"""Tests for ``llm.tokenizer.naive``."""
+
+from pathlib import Path
+
+from llm.tokenizer.naive import naive_tokenization
+
+
+def test_naive_tokenization(resource: Path):
+    """The ``the-verdict.txt`` resource splits into 4690 tokens."""
+    raw_text = (resource / "the-verdict.txt").read_text()
+    assert len(naive_tokenization(raw_text)) == 4690
diff --git a/tests/test_tokenizer_simple_v1.py b/tests/test_tokenizer_simple_v1.py
new file mode 100644
index 0000000..21b3c2a
--- /dev/null
+++ b/tests/test_tokenizer_simple_v1.py
@@ -0,0 +1,59 @@
+"""Tests for ``SimpleTokenizerV1`` and its vocabulary builder."""
+
+import pytest
+
+from llm.tokenizer import Vocab
+from llm.tokenizer.naive import naive_tokenization
+from llm.tokenizer.simple_v1 import SimpleTokenizerV1, generate_vocab
+
+# Ids of the words of the sample sentence, in sentence order.
+SENTENCE_IDS = [0, 7, 1, 3, 4, 6, 8, 5, 2]
+
+
+@pytest.fixture(scope="module")
+def text() -> str:
+    """Sample sentence the vocabulary is built from."""
+    return "The quick brown fox jumps over the lazy dog"
+
+
+@pytest.fixture()
+def vocab(text: str) -> Vocab:
+    """Vocabulary generated from the sample sentence."""
+    return generate_vocab(naive_tokenization(text))
+
+
+def test_generate_vocabulary(text: str):
+    """The sample sentence yields the expected token-to-id mapping."""
+    generated = generate_vocab(naive_tokenization(text))
+
+    assert generated == {
+        "The": 0,
+        "brown": 1,
+        "dog": 2,
+        "fox": 3,
+        "jumps": 4,
+        "lazy": 5,
+        "over": 6,
+        "quick": 7,
+        "the": 8,
+    }
+
+
+def test_encode(vocab: Vocab, text: str):
+    """Encoding the sample sentence yields the id of each word."""
+    assert SimpleTokenizerV1(vocab).encode(text) == SENTENCE_IDS
+
+
+def test_encode_fails_for_token_not_in_vocab(vocab: Vocab):
+    """A word outside the vocabulary makes ``encode`` raise ``KeyError``."""
+    tokenizer = SimpleTokenizerV1(vocab)
+    unknown_token = "elephant"
+    sentence = f"The quick brown {unknown_token} jumps over the lazy dog"
+
+    with pytest.raises(KeyError, match=unknown_token):
+        tokenizer.encode(sentence)
+
+
+def test_decode(vocab: Vocab, text: str):
+    """Decoding the ids of the sample sentence gives the sentence back."""
+    assert SimpleTokenizerV1(vocab).decode(SENTENCE_IDS) == text
diff --git a/tests/test_tokenizer_simple_v2.py b/tests/test_tokenizer_simple_v2.py
new file mode 100644
index 0000000..3bb98ae
--- /dev/null
+++ b/tests/test_tokenizer_simple_v2.py
@@ -0,0 +1,80 @@
+"""Tests for ``SimpleTokenizerV2`` and its vocabulary builder."""
+
+import pytest
+
+from llm.tokenizer import Vocab
+from llm.tokenizer.naive import naive_tokenization
+from llm.tokenizer.simple_v2 import (
+    SimpleTokenizerV2,
+    TOKEN_END_OF_TEXT,
+    TOKEN_UNKNOWN,
+    generate_vocab,
+)
+
+
+@pytest.fixture(scope="module")
+def text() -> str:
+    """Sample sentence the vocabulary is built from."""
+    return "The quick brown fox jumps over the lazy dog"
+
+
+@pytest.fixture()
+def vocab(text: str) -> Vocab:
+    """Vocabulary generated from the sample sentence."""
+    vocab_tokens = naive_tokenization(text)
+    return generate_vocab(vocab_tokens)
+
+
+def test_generate_vocabulary(text: str):
+    """The special tokens get the two ids after the regular tokens."""
+    vocab_tokens = naive_tokenization(text)
+    vocab = generate_vocab(vocab_tokens)
+
+    assert vocab == {
+        "The": 0,
+        "brown": 1,
+        "dog": 2,
+        "fox": 3,
+        "jumps": 4,
+        "lazy": 5,
+        "over": 6,
+        "quick": 7,
+        "the": 8,
+        TOKEN_UNKNOWN: 9,
+        TOKEN_END_OF_TEXT: 10,
+    }
+
+
+def test_encode(vocab: Vocab, text: str):
+    """Encoding the sample sentence yields the id of each word."""
+    tokenizer = SimpleTokenizerV2(vocab)
+    encoded = tokenizer.encode(text)
+
+    assert encoded == [0, 7, 1, 3, 4, 6, 8, 5, 2]
+
+
+def test_encode_substitutes_unknown_token(vocab: Vocab):
+    """A word outside the vocabulary is encoded as ``TOKEN_UNKNOWN``."""
+    tokenizer = SimpleTokenizerV2(vocab)
+    unknown_token = "elephant"
+    encoded = tokenizer.encode(
+        f"The quick brown {unknown_token} jumps over the lazy dog"
+    )
+
+    assert encoded == [0, 7, 1, 9, 4, 6, 8, 5, 2]
+
+
+def test_encode_keeps_end_of_text_token(vocab: Vocab):
+    """The end-of-text marker is encoded as a single token of its own."""
+    tokenizer = SimpleTokenizerV2(vocab)
+    encoded = tokenizer.encode(f"The quick {TOKEN_END_OF_TEXT} lazy dog")
+
+    assert encoded == [0, 7, 10, 5, 2]
+
+
+def test_decode(vocab: Vocab):
+    """The unknown id decodes to the ``TOKEN_UNKNOWN`` text."""
+    tokenizer = SimpleTokenizerV2(vocab)
+    decoded = tokenizer.decode([0, 7, 1, 9, 4, 6, 8, 5, 2])
+
+    assert decoded == f"The quick brown {TOKEN_UNKNOWN} jumps over the lazy dog"