pbillaut · pbillaut · May 24, 2025 · May 24, 2025
diff --git a/llm/tokenizer.py b/llm/tokenizer.py
@@ -3,7 +3,7 @@
 
 Vocab: TypeAlias = dict[str, int]
 
-NAIVE_TOKENS = re.compile(r'''([_,.;:?!"'()]|--|\s)''')
+NAIVE_TOKENS = re.compile(r"""([_,.;:?!"'()]|--|\s)""")
 
 
 def generate_vocab(tokens: list[str]) -> Vocab:
@@ -16,7 +16,7 @@ def naive_tokenization(text: str) -> list[str]:
     return [text.strip() for text in tokens if text.strip()]
 
 
-DECODE_PATTERN = re.compile(r'''\s+([,.?!"'()])''')
+DECODE_PATTERN = re.compile(r"""\s+([,.?!"'()])""")
 
 
 class SimpleTokenizerV1:
@@ -34,5 +34,5 @@ def encode(self, text: str) -> list[int]:
 
     def decode(self, token_ids: list[int]) -> str:
         text = " ".join([self.int_to_str[i] for i in token_ids])
-        text = DECODE_PATTERN.sub(r'\1', text)
+        text = DECODE_PATTERN.sub(r"\1", text)
         return text
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -3,6 +3,6 @@
 import pytest
 
 
-@pytest.fixture(scope='session')
+@pytest.fixture(scope="session")
 def resource() -> Path:
-    return Path(__file__).parent / 'resources'
+    return Path(__file__).parent / "resources"
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
@@ -6,7 +6,7 @@
 
 
 def test_naive_tokenizer_the_verdict(resource: Path):
-    path = resource / 'the-verdict.txt'
+    path = resource / "the-verdict.txt"
     raw_text = path.read_text()
     tokens = naive_tokenization(raw_text)
     assert len(tokens) == 4690
@@ -17,7 +17,17 @@ def test_generate_vocabulary():
     vocab_tokens = naive_tokenization(text)
     vocab = generate_vocab(vocab_tokens)
 
-    assert vocab == {'The': 0, 'brown': 1, 'dog': 2, 'fox': 3, 'jumps': 4, 'lazy': 5, 'over': 6, 'quick': 7, 'the': 8}
+    assert vocab == {
+        "The": 0,
+        "brown": 1,
+        "dog": 2,
+        "fox": 3,
+        "jumps": 4,
+        "lazy": 5,
+        "over": 6,
+        "quick": 7,
+        "the": 8,
+    }
 
 
 class TestSimpleTokenizerV1:
@@ -36,7 +46,7 @@ def test_encode(self, vocab: Vocab):
 
     def test_encode_fails_for_token_not_in_vocab(self, vocab: Vocab):
         tokenizer = SimpleTokenizerV1(vocab)
-        unknown_token = 'elephant'
+        unknown_token = "elephant"
         with pytest.raises(KeyError, match=unknown_token):
             tokenizer.encode(f"The quick brown {unknown_token} jumps over the lazy dog")