Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions llm/tokenizer/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
import re
from typing import TypeAlias

# Vocabulary type: maps a token string to its integer token id.
Vocab: TypeAlias = dict[str, int]

# Matches a run of whitespace followed by one of , . ? ! " ' ( ) and captures
# the punctuation character as group 1, so substituting r"\1" drops the
# whitespace in front of it. Characters such as ; and : are not in the class,
# so whitespace before them is left in place.
DECODE_PATTERN = re.compile(r"""\s+([,.?!"'()])""")
8 changes: 8 additions & 0 deletions llm/tokenizer/naive.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import re

# Split pattern: a single punctuation character (_ , . ; : ? ! " ' ( )),
# a double hyphen, or one whitespace character. The capturing group makes
# ``re.split`` keep the matched delimiters in its result.
NAIVE_TOKENS = re.compile(r"""([_,.;:?!"'()]|--|\s)""")


def naive_tokenization(text: str) -> list[str]:
    """Split *text* into word and punctuation tokens.

    The text is split on ``NAIVE_TOKENS``; the delimiters themselves are
    kept as tokens, while empty and whitespace-only pieces are discarded.

    Args:
        text: The raw text to tokenize.

    Returns:
        The non-empty, whitespace-stripped tokens in their original order.
    """
    # Strip each piece exactly once and use a name that does not shadow the
    # ``text`` parameter.
    stripped_pieces = (piece.strip() for piece in NAIVE_TOKENS.split(text))
    return [token for token in stripped_pieces if token]
16 changes: 2 additions & 14 deletions llm/tokenizer.py → llm/tokenizer/simple_v1.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,12 @@
import re
from typing import TypeAlias

# Vocabulary type: maps a token string to its integer token id.
Vocab: TypeAlias = dict[str, int]

# Split pattern: a single punctuation character (_ , . ; : ? ! " ' ( )),
# a double hyphen, or one whitespace character. The capturing group makes
# re.split keep the matched delimiters in its result.
NAIVE_TOKENS = re.compile(r"""([_,.;:?!"'()]|--|\s)""")
from llm.tokenizer import DECODE_PATTERN, Vocab
from llm.tokenizer.naive import naive_tokenization


def generate_vocab(tokens: list[str]) -> Vocab:
    """Build a vocabulary from *tokens*.

    Duplicates are removed and the distinct tokens are sorted; each token is
    then mapped to its position in that sorted order (starting at 0).
    """
    ordered = list(set(tokens))
    ordered.sort()
    return dict(zip(ordered, range(len(ordered))))


def naive_tokenization(text: str) -> list[str]:
    """Split *text* into word and punctuation tokens.

    The text is split on ``NAIVE_TOKENS``; the delimiters themselves are
    kept as tokens, while empty and whitespace-only pieces are discarded.

    Args:
        text: The raw text to tokenize.

    Returns:
        The non-empty, whitespace-stripped tokens in their original order.
    """
    # Strip each piece exactly once and use a name that does not shadow the
    # ``text`` parameter.
    stripped_pieces = (piece.strip() for piece in NAIVE_TOKENS.split(text))
    return [token for token in stripped_pieces if token]


# Matches a run of whitespace followed by one of , . ? ! " ' ( ) and captures
# the punctuation character as group 1, so substituting r"\1" drops the
# whitespace in front of it.
DECODE_PATTERN = re.compile(r"""\s+([,.?!"'()])""")


class SimpleTokenizerV1:
def __init__(self, vocab: Vocab):
self.str_to_int = vocab
Expand Down
30 changes: 30 additions & 0 deletions llm/tokenizer/simple_v2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from collections import defaultdict

from llm.tokenizer import DECODE_PATTERN, Vocab
from llm.tokenizer.naive import naive_tokenization


def generate_vocab(tokens: list[str]) -> Vocab:
    """Build a vocabulary from *tokens* with the special tokens appended.

    The distinct tokens are sorted and numbered from 0; ``TOKEN_UNKNOWN`` and
    ``TOKEN_END_OF_TEXT`` then receive the next two ids, in that order.

    Args:
        tokens: Tokens to build the vocabulary from; duplicates are ignored.

    Returns:
        A mapping from token string to a unique, contiguous integer id.
    """
    special_tokens = [TOKEN_UNKNOWN, TOKEN_END_OF_TEXT]
    # Drop special tokens that already occur in the input. Otherwise they
    # would be listed twice, the dict would keep only the later id, and the
    # id range would contain a gap (largest id >= len(vocab)).
    all_words = sorted(set(tokens).difference(special_tokens))
    all_words.extend(special_tokens)
    return {token: token_id for token_id, token in enumerate(all_words)}


class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``TOKEN_UNKNOWN``."""

    def __init__(self, vocab: Vocab):
        """Create a tokenizer for *vocab*.

        Args:
            vocab: Mapping from token string to token id. It should contain
                ``TOKEN_UNKNOWN``; if it does not, looking up an unknown
                token raises ``KeyError``.
        """
        # The fallback id is resolved lazily, on the first missing lookup.
        self.str_to_int = defaultdict(lambda: vocab[TOKEN_UNKNOWN], vocab)
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text: str) -> list[int]:
        """Tokenize *text* and return the id of each token.

        Tokens that are not in the vocabulary are encoded as the id of
        ``TOKEN_UNKNOWN``.

        Raises:
            KeyError: If an unknown token is met and the vocabulary has no
                ``TOKEN_UNKNOWN`` entry.
        """
        str_to_int = self.str_to_int
        # Indexing a defaultdict with a missing key stores that key for good,
        # so every unseen word would grow the mapping. Test membership first
        # and look up the unknown token instead, which leaves it unchanged.
        return [
            str_to_int[token if token in str_to_int else TOKEN_UNKNOWN]
            for token in naive_tokenization(text)
        ]

    def decode(self, token_ids: list[int]) -> str:
        """Turn *token_ids* back into text.

        The tokens are joined with single spaces, then the whitespace in
        front of the punctuation matched by ``DECODE_PATTERN`` is removed.

        Raises:
            KeyError: If an id is not present in the vocabulary.
        """
        text = " ".join(self.int_to_str[i] for i in token_ids)
        return DECODE_PATTERN.sub(r"\1", text)


# Placeholder token whose id SimpleTokenizerV2 uses for tokens missing from the vocabulary.
TOKEN_UNKNOWN = "<|unk|>"
# End-of-text marker; generate_vocab() appends it after the sorted tokens.
TOKEN_END_OF_TEXT = "<|endoftext|>"
57 changes: 0 additions & 57 deletions tests/test_tokenizer.py

This file was deleted.

10 changes: 10 additions & 0 deletions tests/test_tokenizer_naive.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from pathlib import Path

from llm.tokenizer.naive import naive_tokenization


def test_naive_tokenization(resource: Path):
    """Tokenizing the sample text yields the expected number of tokens.

    ``resource`` is a fixture defined outside this module; it is used here as
    the directory that contains ``the-verdict.txt``.
    """
    path = resource / "the-verdict.txt"
    # Pin the encoding so the token count does not depend on the platform's
    # default locale encoding.
    raw_text = path.read_text(encoding="utf-8")
    tokens = naive_tokenization(raw_text)
    assert len(tokens) == 4690
56 changes: 56 additions & 0 deletions tests/test_tokenizer_simple_v1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import pytest

from llm.tokenizer import (
Vocab,
)
from llm.tokenizer.naive import naive_tokenization
from llm.tokenizer.simple_v1 import SimpleTokenizerV1, generate_vocab


@pytest.fixture(scope="module")
def text() -> str:
    """Provide the sample sentence shared by the tests in this module."""
    sentence = "The quick brown fox jumps over the lazy dog"
    return sentence


@pytest.fixture()
def vocab(text: str) -> Vocab:
    """Provide the vocabulary generated from the sample sentence."""
    return generate_vocab(naive_tokenization(text))


def test_generate_vocabulary(text: str):
    """The vocabulary numbers the distinct tokens in sorted order."""
    expected = {
        "The": 0,
        "brown": 1,
        "dog": 2,
        "fox": 3,
        "jumps": 4,
        "lazy": 5,
        "over": 6,
        "quick": 7,
        "the": 8,
    }

    generated = generate_vocab(naive_tokenization(text))

    assert generated == expected


def test_encode(vocab: Vocab, text: str):
    """Encoding the sample sentence yields the id of each word in order."""
    expected_ids = [0, 7, 1, 3, 4, 6, 8, 5, 2]

    assert SimpleTokenizerV1(vocab).encode(text) == expected_ids


def test_encode_fails_for_token_not_in_vocab(vocab: Vocab):
    """Encoding a word missing from the vocabulary raises ``KeyError``."""
    unknown_token = "elephant"
    sentence = f"The quick brown {unknown_token} jumps over the lazy dog"
    tokenizer = SimpleTokenizerV1(vocab)

    with pytest.raises(KeyError, match=unknown_token):
        tokenizer.encode(sentence)


def test_decode(vocab: Vocab, text: str):
    """Decoding the sample sentence's ids reproduces the sentence."""
    token_ids = [0, 7, 1, 3, 4, 6, 8, 5, 2]

    assert SimpleTokenizerV1(vocab).decode(token_ids) == text
67 changes: 67 additions & 0 deletions tests/test_tokenizer_simple_v2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import pytest

from llm.tokenizer import (
Vocab,
)
from llm.tokenizer.naive import naive_tokenization
from llm.tokenizer.simple_v2 import (
SimpleTokenizerV2,
TOKEN_END_OF_TEXT,
TOKEN_UNKNOWN,
generate_vocab,
)


@pytest.fixture(scope="module")
def text() -> str:
    """Provide the sample sentence shared by the tests in this module."""
    sentence = "The quick brown fox jumps over the lazy dog"
    return sentence


@pytest.fixture()
def vocab(text: str) -> Vocab:
    """Provide the vocabulary (special tokens included) for the sample sentence."""
    return generate_vocab(naive_tokenization(text))


def test_generate_vocabulary(text: str):
    """The vocabulary holds the sorted tokens followed by the special tokens."""
    # Use the module's ``text`` fixture rather than a second copy of the
    # sentence, so the sample text is defined in exactly one place.
    vocab_tokens = naive_tokenization(text)
    vocab = generate_vocab(vocab_tokens)

    assert vocab == {
        "The": 0,
        "brown": 1,
        "dog": 2,
        "fox": 3,
        "jumps": 4,
        "lazy": 5,
        "over": 6,
        "quick": 7,
        "the": 8,
        TOKEN_UNKNOWN: 9,
        TOKEN_END_OF_TEXT: 10,
    }


def test_encode(vocab: Vocab, text: str):
    """Encoding the sample sentence yields the id of each word in order."""
    expected_ids = [0, 7, 1, 3, 4, 6, 8, 5, 2]

    assert SimpleTokenizerV2(vocab).encode(text) == expected_ids


def test_encode_substitutes_unknown_token(vocab: Vocab):
    """A word missing from the vocabulary is encoded as the unknown-token id."""
    unknown_token = "elephant"
    sentence = f"The quick brown {unknown_token} jumps over the lazy dog"
    expected_ids = [0, 7, 1, 9, 4, 6, 8, 5, 2]

    assert SimpleTokenizerV2(vocab).encode(sentence) == expected_ids


def test_decode(vocab: Vocab):
    """Decoding ids that include the unknown id renders the unknown token."""
    token_ids = [0, 7, 1, 9, 4, 6, 8, 5, 2]
    expected = f"The quick brown {TOKEN_UNKNOWN} jumps over the lazy dog"

    assert SimpleTokenizerV2(vocab).decode(token_ids) == expected