diff --git a/llm/tokenizer.py b/llm/tokenizer.py index d837305..fa1c927 100644 --- a/llm/tokenizer.py +++ b/llm/tokenizer.py @@ -3,7 +3,7 @@ Vocab: TypeAlias = dict[str, int] -NAIVE_TOKENS = re.compile(r'''([_,.;:?!"'()]|--|\s)''') +NAIVE_TOKENS = re.compile(r"""([_,.;:?!"'()]|--|\s)""") def generate_vocab(tokens: list[str]) -> Vocab: @@ -16,7 +16,7 @@ def naive_tokenization(text: str) -> list[str]: return [text.strip() for text in tokens if text.strip()] -DECODE_PATTERN = re.compile(r'''\s+([,.?!"'()])''') +DECODE_PATTERN = re.compile(r"""\s+([,.?!"'()])""") class SimpleTokenizerV1: @@ -34,5 +34,5 @@ def encode(self, text: str) -> list[int]: def decode(self, token_ids: list[int]) -> str: text = " ".join([self.int_to_str[i] for i in token_ids]) - text = DECODE_PATTERN.sub(r'\1', text) + text = DECODE_PATTERN.sub(r"\1", text) return text diff --git a/tests/conftest.py b/tests/conftest.py index a6e7e22..9e6f776 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,6 +3,6 @@ import pytest -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def resource() -> Path: - return Path(__file__).parent / 'resources' + return Path(__file__).parent / "resources" diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index db4a04c..2d764a8 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -6,7 +6,7 @@ def test_naive_tokenizer_the_verdict(resource: Path): - path = resource / 'the-verdict.txt' + path = resource / "the-verdict.txt" raw_text = path.read_text() tokens = naive_tokenization(raw_text) assert len(tokens) == 4690 @@ -17,7 +17,17 @@ def test_generate_vocabulary(): vocab_tokens = naive_tokenization(text) vocab = generate_vocab(vocab_tokens) - assert vocab == {'The': 0, 'brown': 1, 'dog': 2, 'fox': 3, 'jumps': 4, 'lazy': 5, 'over': 6, 'quick': 7, 'the': 8} + assert vocab == { + "The": 0, + "brown": 1, + "dog": 2, + "fox": 3, + "jumps": 4, + "lazy": 5, + "over": 6, + "quick": 7, + "the": 8, + } class TestSimpleTokenizerV1: @@ -36,7 +46,7 @@ def test_encode(self, vocab: Vocab): def test_encode_fails_for_token_not_in_vocab(self, vocab: Vocab): tokenizer = SimpleTokenizerV1(vocab) - unknown_token = 'elephant' + unknown_token = "elephant" with pytest.raises(KeyError, match=unknown_token): tokenizer.encode(f"The quick brown {unknown_token} jumps over the lazy dog")