-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathexample.py
More file actions
39 lines (29 loc) · 1.2 KB
/
Copy pathexample.py
File metadata and controls
39 lines (29 loc) · 1.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
#!/usr/bin/env python3
"""Example usage of mon_tokenizer"""
from mon_tokenizer import MonTokenizer
def main():
    """Run a short end-to-end demonstration of ``MonTokenizer``.

    Encodes a fixed sample of Mon text, prints the resulting pieces and
    token IDs, decodes them back (once from the pieces and once from the
    IDs), then reports the characters-per-token ratio and a small slice of
    the vocabulary. Everything is written to standard output; nothing is
    returned.
    """
    mon = MonTokenizer()

    # Fixed sample sentence used for the whole demo.
    sample = "ဂွံအခေါင်အရာမွဲသ္ဂောံဒုင်စသိုင်ကၠာကၠာရ။"
    print(f"Original text: {sample}")
    print(f"Vocab size: {mon.get_vocab_size()}")

    # Tokenize once; the result is read through its 'pieces' and 'ids' keys.
    encoded = mon.encode(sample)
    pieces = encoded['pieces']
    print(f"\nEncoded tokens: {pieces}")
    ids = encoded['ids']
    print(f"Token IDs: {ids}")

    # Round-trip from the string pieces.
    from_pieces = mon.decode(pieces)
    print(f"\nDecoded text: {from_pieces}")

    # Round-trip from the integer IDs.
    from_ids = mon.decode_ids(ids)
    print(f"Decoded from IDs: {from_ids}")

    # Characters per token, as a rough measure of compression.
    n_chars = len(sample)
    n_tokens = len(ids)
    ratio = n_chars / n_tokens
    print(f"\nCompression: {ratio:.2f}x ({n_chars} chars -> {n_tokens} tokens)")

    # Peek at ten vocabulary entries, skipping the first hundred.
    vocab_keys = list(mon.get_vocab().keys())
    print(f"Sample vocabulary items: {vocab_keys[100:110]}")
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()