diff --git a/LICENSE b/LICENSE
index 4fa2b63..12cc758 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
MIT License
-Copyright (c) 2024 Belikov Maxim
+Copyright (c) 2024 Bielikov Maksym
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
diff --git a/README.md b/README.md
index 029d606..e38ee2a 100644
--- a/README.md
+++ b/README.md
@@ -10,9 +10,9 @@
The cefrpy python module offers a comprehensive toolkit for analyzing linguistic data based on the Common European Framework of Reference for Languages (CEFR).
-Documentation: https://maximax67.github.io/cefrpy/
+Documentation: <https://maximax67.github.io/cefrpy/>
-HuggingFace demo: https://huggingface.co/spaces/Maximax67/cefrpy-demo
+HuggingFace demo: <https://huggingface.co/spaces/Maximax67/cefrpy-demo>
## Installation
@@ -48,7 +48,7 @@ else:
print(f"CEFR level not found for '{word}' as a {pos_tag}.")
```
-### Getting Average Level of a Word:
+### Getting average level of a word
```py
from cefrpy import CEFRAnalyzer
@@ -73,7 +73,8 @@ else:
### Recommended usage with [spaCy](https://spacy.io)
-#### Import spacy and load model:
+#### Import spacy and load model
+
```py
import spacy
@@ -148,185 +149,363 @@ for token in tokens:
Result (truncated):
-```
+```text
-------------------------------------------------------
- WORD POS LEVEL CEFR
+ WORD POS LEVEL CEFR
-------------------------------------------------------
- _SP Skip None
-In IN 1.00 A1
-the DT 1.00 A1
-heart NN 1.00 A1
-of IN 1.00 A1
-every DT 1.00 A1
-forest NN 2.00 A2
-, , Skip None
-a DT 1.00 A1
-hidden JJ 3.00 B1
-world NN 1.00 A1
-thrives VBZ 5.86 C2
-among IN 2.00 A2
-the DT 1.00 A1
-towering VBG 1.00 A1
-trees NNS 1.00 A1
-. . Skip None
-Trees NNS 1.00 A1
-, , Skip None
- _SP Skip None
-those DT 1.00 A1
-silent JJ 3.00 B1
+ _SP Skip None
+In IN 1.00 A1
+the DT 1.00 A1
+heart NN 1.00 A1
+of IN 1.00 A1
+every DT 1.00 A1
+forest NN 2.00 A2
+, , Skip None
+a DT 1.00 A1
+hidden JJ 3.00 B1
+world NN 1.00 A1
+thrives VBZ 5.86 C2
+among IN 2.00 A2
+the DT 1.00 A1
+towering VBG 1.00 A1
+trees NNS 1.00 A1
+. . Skip None
+Trees NNS 1.00 A1
+, , Skip None
+ _SP Skip None
+those DT 1.00 A1
+silent JJ 3.00 B1
```
#### Get more statistical information
1. Filter tokens by level:
-```py
-def filter_for_desired_level(level_tokens: list[tuple[str, str, bool, float, int, int]],
- min_level: float | int = 1.0, max_level: float | int = 6.0
- ) -> set[tuple[str, str, bool, float, int, int]]:
- filtered_tokens = set()
- for token in level_tokens:
- level = token[3]
+ ```py
+ def filter_for_desired_level(level_tokens: list[tuple[str, str, bool, float, int, int]],
+ min_level: float | int = 1.0, max_level: float | int = 6.0
+ ) -> set[tuple[str, str, bool, float, int, int]]:
+ filtered_tokens = set()
+ for token in level_tokens:
+ level = token[3]
+
+ if level and level >= min_level and level <= max_level:
+ filtered_tokens.add(token)
+
+ return filtered_tokens
+
+
+ # You can also set min/max level as an int or float in range from 1 to 6
+    desired_min_level = CEFRLevel.B2
+ desired_level_words_set = filter_for_desired_level(tokens, min_level=int(desired_min_level))
+
+ desired_level_words_list = list(desired_level_words_set)
+ desired_level_words_list.sort()
+
+ print(f'\tWords with level {desired_min_level} and higher: {len(desired_level_words_list)}')
+ for word_data in desired_level_words_list:
+ word, pos, _, level, _, _ = word_data
+ print(f"{word.ljust(26)} {pos.ljust(6)} {'{:.2f}'.format(level).ljust(6)} {CEFRLevel(round(level))}")
+ ```
+
+ ```text
+ Words with level B2 and higher: 16
+ benefactors NNS 6.00 C2
+ bristlecone NN 6.00 C2
+ evolved VBN 4.00 B2
+ fungi NNS 5.20 C1
+ living NN 4.00 B2
+ longevity NN 5.96 C2
+ masters NNS 4.00 B2
+ mighty JJ 4.00 B2
+ observers NNS 4.00 B2
+ pines NNS 4.00 B2
+ potential JJ 4.00 B2
+ sequoias NNS 6.00 C2
+ thrives VBZ 5.86 C2
+ underground RB 4.00 B2
+ wildfires NNS 6.00 C2
+ withstand VB 5.12 C1
+ ```
- if level and level >= min_level and level <= max_level:
- filtered_tokens.add(token)
+2. Get CEFR statistic of the text:
- return filtered_tokens
+ ```py
+ def get_word_level_count_statistic(level_tokens: list[tuple[str, str, bool, float, int, int]]) -> list[int]:
+ difficulty_levels_count = [0] * 6
+ for token in level_tokens:
+ level = token[3]
+ if not level:
+ continue
+ level_round = round(level)
+ difficulty_levels_count[level_round - 1] += 1
-# You can also set min/max level as an int or float in range from 1 to 6
-desired_min_level = CEFRLevel.C1
-desired_level_words_set = filter_for_desired_level(tokens, min_level=int(desired_min_level))
+ return difficulty_levels_count
-desired_level_words_list = list(desired_level_words_set)
-desired_level_words_list.sort()
+ difficulty_levels_count = get_word_level_count_statistic(tokens)
+ print('CEFR statistic (total words):')
+ for i in range(1, 7):
+ print(f'{CEFRLevel(i)}: {difficulty_levels_count[i - 1]}')
+ ```
-print(f'\tWords with level {desired_min_level} and higher: {len(desired_level_words_list)}')
-for word_data in desired_level_words_list:
- word, pos, _, level, _, _ = word_data
- print(f"{word.ljust(26)} {pos.ljust(6)} {'{:.2f}'.format(level).ljust(6)} {CEFRLevel(round(level))}")
-```
+ ```text
+ CEFR statistic (total words):
+ A1: 136
+ A2: 36
+ B1: 27
+ B2: 11
+ C1: 2
+ C2: 6
+ ```
-```
-Words with level B2 and higher: 16
-benefactors NNS 6.00 C2
-bristlecone NN 6.00 C2
-evolved VBN 4.00 B2
-fungi NNS 5.20 C1
-living NN 4.00 B2
-longevity NN 5.96 C2
-masters NNS 4.00 B2
-mighty JJ 4.00 B2
-observers NNS 4.00 B2
-pines NNS 4.00 B2
-potential JJ 4.00 B2
-sequoias NNS 6.00 C2
-thrives VBZ 5.86 C2
-underground RB 4.00 B2
-wildfires NNS 6.00 C2
-withstand VB 5.12 C1
-```
+3. Get CEFR statistic for unique words in the text:
-2. Get CEFR statistic of the text:
+ ```py
+ def get_word_level_count_statistic_unique(level_tokens: list[tuple[str, str, bool, float, int, int]]) -> list[int]:
+ processed_word_pos_set = set()
+ difficulty_levels_count = [0] * 6
+ for token in level_tokens:
+ level = token[3]
+ if not level:
+ continue
+
+ to_check_tuple = (token[0], token[1])
+ if not to_check_tuple in processed_word_pos_set:
+ level_round = round(token[3])
+ difficulty_levels_count[level_round - 1] += 1
+ processed_word_pos_set.add(to_check_tuple)
+
+ return difficulty_levels_count
+
+
+ difficulty_levels_count_unique = get_word_level_count_statistic_unique(tokens)
+ print('CEFR statistic (unique words):')
+ for i in range(1, 7):
+ print(f'{CEFRLevel(i)}: {difficulty_levels_count_unique[i - 1]}')
+ ```
+
+ ```text
+ CEFR statistic (unique words):
+ A1: 77
+ A2: 33
+ B1: 23
+ B2: 11
+ C1: 2
+ C2: 6
+ ```
+
+4. Get set of not found CEFR levels for words in text:
+
+ ```py
+ def get_not_found_words(level_tokens: list[tuple[str, str, bool, float, int, int]]) -> set[str]:
+ not_found_words = set()
+ for token in level_tokens:
+ if token[2]:
+ continue
+
+ if not token[3]:
+ not_found_words.add(token[0])
+
+ return not_found_words
+
+
+ not_found_words_set = get_not_found_words(tokens)
+ not_found_words_list = list(not_found_words_set)
+ not_found_words_list.sort()
+
+ print('Not found words:', len(not_found_words_list))
+ if len(not_found_words_list):
+ print('\n'.join(not_found_words_list))
+ ```
+
+ ```text
+ Not found words: 0
+ ```
+
+## Additional features
+
+### Get all possible part-of-speech tags for a word
```py
-def get_word_level_count_statistic(level_tokens: list[tuple[str, str, bool, float, int, int]]) -> list[int]:
- difficulty_levels_count = [0] * 6
- for token in level_tokens:
- level = token[3]
- if not level:
- continue
-
- level_round = round(level)
- difficulty_levels_count[level_round - 1] += 1
-
- return difficulty_levels_count
-
-difficulty_levels_count = get_word_level_count_statistic(tokens)
-print('CEFR statistic (total words):')
-for i in range(1, 7):
- print(f'{CEFRLevel(i)}: {difficulty_levels_count[i - 1]}')
+from cefrpy import CEFRAnalyzer
+
+analyzer = CEFRAnalyzer()
+
+print(analyzer.get_all_pos_for_word("test")) # [, , ]
+print(analyzer.get_all_pos_for_word_as_str("test")) # ['JJ', 'NN', 'VB']
+
+# {: , : , : }
+print(analyzer.get_pos_level_dict_for_word("test"))
+
+# {'JJ': 2.5, 'NN': 1.0, 'VB': 4.0}
+print(analyzer.get_pos_level_dict_for_word("test", pos_tag_as_string=True, word_level_as_float=True))
```
+### Checking if a word exists in the database
+
+```py
+from cefrpy import CEFRAnalyzer
+
+analyzer = CEFRAnalyzer()
+
+word = "apple"
+if analyzer.is_word_in_database(word):
+ print(f"'{word}' exists in the database.")
+else:
+ print(f"'{word}' does not exist in the database.")
```
-CEFR statistic (total words):
-A1: 136
-A2: 36
-B1: 27
-B2: 11
-C1: 2
-C2: 6
+
+### Checking if a word with a specific part-of-speech exists in the database
+
+```py
+from cefrpy import CEFRAnalyzer
+
+analyzer = CEFRAnalyzer()
+
+word = "run"
+pos_tag = "VB" # Verb
+if analyzer.is_word_pos_id_database(word, pos_tag):
+ print(f"'{word}' with part of speech '{pos_tag}' exists in the database.")
+else:
+ print(f"'{word}' with part of speech '{pos_tag}' does not exist in the database.")
```
-3. Get CEFR statistic for unique words in the text:
+### POSTag usage examples
```py
-def get_word_level_count_statistic_unique(level_tokens: list[tuple[str, str, bool, float, int, int]]) -> list[int]:
- processed_word_pos_set = set()
- difficulty_levels_count = [0] * 6
- for token in level_tokens:
- level = token[3]
- if not level:
- continue
-
- to_check_tuple = (token[0], token[1])
- if not to_check_tuple in processed_word_pos_set:
- level_round = round(token[3])
- difficulty_levels_count[level_round - 1] += 1
- processed_word_pos_set.add(to_check_tuple)
+from cefrpy import POSTag
- return difficulty_levels_count
+# Get list of all part-of-speech tag names
+print(POSTag.get_all_tags()) # ['CC', 'CD', 'DT', ...]
+# Print total tags
+print(POSTag.get_total_tags()) # 28
-difficulty_levels_count_unique = get_word_level_count_statistic_unique(tokens)
-print('CEFR statistic (unique words):')
-for i in range(1, 7):
- print(f'{CEFRLevel(i)}: {difficulty_levels_count_unique[i - 1]}')
-```
+# Get description for a tag
+print(POSTag.get_description_by_tag_name('NN')) # Noun, singular or mass
+tag = POSTag.VB
+print(tag) # VB
+print(POSTag.get_description(tag)) # Verb, base form
+print(int(tag)) # 19 (unique tag id)
+print(tag == POSTag.NN) # False
```
-CEFR statistic (unique words):
-A1: 77
-A2: 33
-B1: 23
-B2: 11
-C1: 2
-C2: 6
+
+### CEFRLevel usage examples
+
+```py
+from cefrpy import CEFRLevel
+
+level = CEFRLevel.A1
+print(level) # A1
+print(int(level)) # 1
+
+level2 = CEFRLevel.C2
+print(level2) # C2
+print(int(level2)) # 6
+
+# You can perform any comparisons:
+print(level2 > level) # True
+print(level2 == level) # False
+
+print(CEFRLevel.from_str("B1") == CEFRLevel.B1) # True
+print(CEFRLevel.from_str("B1") == CEFRLevel(3)) # True
```
-4. Get set of not found CEFR levels for words in text:
+### Yields CEFRAnalyzer methods
+
+For every example you should import and initialize `CEFRAnalyzer`:
```py
-def get_not_found_words(level_tokens: list[tuple[str, str, bool, float, int, int]]) -> set[str]:
- not_found_words = set()
- for token in level_tokens:
- if token[2]:
- continue
+from cefrpy import CEFRAnalyzer
- if not token[3]:
- not_found_words.add(token[0])
+analyzer = CEFRAnalyzer()
+```
- return not_found_words
+#### Iterating over words with a specific length (alphabetical order)
+```py
+iteration_limit = 10
+word_list = []
+for word in analyzer.yield_words_with_length(6):
+ if iteration_limit == 0:
+ break
+ word_list.append(word)
+ iteration_limit -= 1
+
+# ['aaberg', 'aachen', 'aahing', 'aargau', 'aarhus', 'abacus', 'abadan', 'abadia', 'abakan', 'abaris']
+print(word_list)
+```
-not_found_words_set = get_not_found_words(tokens)
-not_found_words_list = list(not_found_words_set)
-not_found_words_list.sort()
+#### Iterating over words with a specific length (reversed alphabetical order)
-print('Not found words:', len(not_found_words_list))
-if len(not_found_words_list):
- print('\n'.join(not_found_words_list))
+```py
+iteration_limit = 10
+word_list = []
+for word in analyzer.yield_words_with_length(6, reverse_order=True):
+ if iteration_limit == 0:
+ break
+ word_list.append(word)
+ iteration_limit -= 1
+
+# ['zymase', 'zygote', 'zygoma', 'zydeco', 'zwolle', 'zwicky', 'zuzana', 'zusman', 'zurvan', 'zurich']
+print(word_list)
```
+#### Iterating over words in alphabetical order
+
+```py
+iteration_limit = 10
+word_list = []
+for word in analyzer.yield_words():
+ if iteration_limit == 0:
+ break
+ word_list.append(word)
+ iteration_limit -= 1
+
+# ['a', 'aa', 'aaa', 'aaaa', 'aaas', 'aaberg', 'aachen', 'aae', 'aaee', 'aaf']
+print(word_list)
+```
+
+#### Iterating over words with their pos in alphabetical order with word length priority ascending
+
+```py
+iteration_limit = 6
+word_pos_list = []
+for word, pos_tag in analyzer.yield_word_pos(word_length_sort=True):
+ if iteration_limit == 0:
+ break
+ word_pos_list.append((word, pos_tag))
+ iteration_limit -= 1
+
+# [('a', ), ('a', ), ('a', ), ('a', ), ('a', ), ('b', )]
+print(word_pos_list)
```
-Not found words: 0
+
+#### Iterating over words with their pos as str and levels as float in reversed alphabetical order with word length priority descending
+
+```py
+iteration_limit = 3
+word_pos_list = []
+for word, pos_tag, level in analyzer.yield_word_pos_level(word_length_sort=True, reverse_order=True, pos_tag_as_string=True, word_level_as_float=True):
+ if iteration_limit == 0:
+ break
+ word_pos_list.append((word, pos_tag, level))
+ iteration_limit -= 1
+
+# [('demethylchlortetracycline', 'NN', 6.0), ('electrocardiographically', 'RB', 6.0), ('polytetrafluoroethylene', 'NN', 6.0)]
+print(word_pos_list)
```
## License
+
This project is licensed under the MIT License - see the LICENSE file for details.
## Acknowledgments
+
I would like to acknowledge the contributions of the following resources. I used them to create my initial SQLite version [Words-CEFR-Dataset](https://github.com/Maximax67/Words-CEFR-Dataset):
+
- [Spacy](https://spacy.io/)
- [CEFR-J](https://cefr-j.org/)
- [LemmInflect](https://github.com/bjascob/LemmInflect)
@@ -334,8 +513,8 @@ I would like to acknowledge the contributions of the following resources. I used
- [List of pos tags form Penn Treebank Project](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html)
Also I used these resources to create my [valid English words list](https://github.com/Maximax67/English-Valid-Words):
+
- [Word list by infochimps (archived)](https://web.archive.org/web/20131118073324/https://www.infochimps.com/datasets/word-list-350000-simple-english-words-excel-readable)
- [English words github repo by dwyl](https://github.com/dwyl/english-words)
- [NLTK (Natural Language Toolkit)](https://www.nltk.org/)
- [WordNet](https://wordnet.princeton.edu/)
-
diff --git a/docs/cefrpy.rst b/docs/cefrpy.rst
index 59303a9..0331a2a 100644
--- a/docs/cefrpy.rst
+++ b/docs/cefrpy.rst
@@ -67,3 +67,4 @@ Module contents
:members:
:undoc-members:
:show-inheritance:
+ :no-index:
diff --git a/docs/conf.py b/docs/conf.py
index 34dd33a..d2cccc1 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -5,29 +5,36 @@
import os
import sys
-sys.path.insert(0, os.path.abspath('../src/cefrpy/'))
+
+sys.path.insert(0, os.path.abspath("../src/cefrpy/"))
# -- Project information -----------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
-project = 'cefrpy'
-copyright = '2024, Maxim Belikov'
-author = 'Maxim Belikov'
-release = '1.0'
+project = "cefrpy"
+copyright = "2026, Maksym Bielikov"
+author = "Maksym Bielikov"
+version = "1.0"
+release = "1.0.2"
# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
-extensions = ['sphinx.ext.autodoc', 'sphinx_mdinclude', 'sphinx.ext.githubpages']
-
-templates_path = ['_templates']
-exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+extensions = ["sphinx.ext.autodoc", "sphinx_mdinclude", "sphinx.ext.githubpages"]
+templates_path = ["_templates"]
+exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
# -- Options for HTML output -------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
-html_theme = 'sphinx_rtd_theme'
-html_static_path = ['_static']
+html_theme = "sphinx_rtd_theme"
+html_context = {
+ "display_github": True,
+ "github_user": "Maximax67",
+ "github_repo": "cefrpy",
+ "github_version": "main",
+ "conf_py_path": "/docs/",
+}
diff --git a/docs/docs.md b/docs/docs.md
index 6695048..c0d2136 100644
--- a/docs/docs.md
+++ b/docs/docs.md
@@ -1,7 +1,17 @@
# About cefrpy
+
+
+
+[](https://huggingface.co/spaces/Maximax67/cefrpy-demo)
+[](https://github.com/Maximax67/cefrpy/actions/workflows/unittest.yml)
+
The cefrpy python module offers a comprehensive toolkit for analyzing linguistic data based on the Common European Framework of Reference for Languages (CEFR).
+Documentation: <https://maximax67.github.io/cefrpy/>
+
+HuggingFace demo: <https://huggingface.co/spaces/Maximax67/cefrpy-demo>
+
# Installation
You can install `cefrpy` for Python >= 3.6 via pip:
@@ -36,7 +46,7 @@ else:
print(f"CEFR level not found for '{word}' as a {pos_tag}.")
```
-## Getting average level of a word:
+## Getting average level of a word
```py
from cefrpy import CEFRAnalyzer
@@ -61,7 +71,8 @@ else:
## Recommended usage with [spaCy](https://spacy.io)
-### Import spacy and load model:
+### Import spacy and load model
+
```py
import spacy
@@ -136,179 +147,179 @@ for token in tokens:
Result (truncated):
-```
+```text
-------------------------------------------------------
- WORD POS LEVEL CEFR
+ WORD POS LEVEL CEFR
-------------------------------------------------------
- _SP Skip None
-In IN 1.00 A1
-the DT 1.00 A1
-heart NN 1.00 A1
-of IN 1.00 A1
-every DT 1.00 A1
-forest NN 2.00 A2
-, , Skip None
-a DT 1.00 A1
-hidden JJ 3.00 B1
-world NN 1.00 A1
-thrives VBZ 5.86 C2
-among IN 2.00 A2
-the DT 1.00 A1
-towering VBG 1.00 A1
-trees NNS 1.00 A1
-. . Skip None
-Trees NNS 1.00 A1
-, , Skip None
- _SP Skip None
-those DT 1.00 A1
-silent JJ 3.00 B1
+ _SP Skip None
+In IN 1.00 A1
+the DT 1.00 A1
+heart NN 1.00 A1
+of IN 1.00 A1
+every DT 1.00 A1
+forest NN 2.00 A2
+, , Skip None
+a DT 1.00 A1
+hidden JJ 3.00 B1
+world NN 1.00 A1
+thrives VBZ 5.86 C2
+among IN 2.00 A2
+the DT 1.00 A1
+towering VBG 1.00 A1
+trees NNS 1.00 A1
+. . Skip None
+Trees NNS 1.00 A1
+, , Skip None
+ _SP Skip None
+those DT 1.00 A1
+silent JJ 3.00 B1
```
### Get more statistical information
1. Filter tokens by level:
-```py
-def filter_for_desired_level(level_tokens: list[tuple[str, str, bool, float, int, int]],
- min_level: float | int = 1.0, max_level: float | int = 6.0
- ) -> set[tuple[str, str, bool, float, int, int]]:
- filtered_tokens = set()
- for token in level_tokens:
- level = token[3]
-
- if level and level >= min_level and level <= max_level:
- filtered_tokens.add(token)
-
- return filtered_tokens
-
-
-# You can also set min/max level as an int or float in range from 1 to 6
-desired_min_level = CEFRLevel.C1
-desired_level_words_set = filter_for_desired_level(tokens, min_level=int(desired_min_level))
-
-desired_level_words_list = list(desired_level_words_set)
-desired_level_words_list.sort()
-
-print(f'\tWords with level {desired_min_level} and higher: {len(desired_level_words_list)}')
-for word_data in desired_level_words_list:
- word, pos, _, level, _, _ = word_data
- print(f"{word.ljust(26)} {pos.ljust(6)} {'{:.2f}'.format(level).ljust(6)} {CEFRLevel(round(level))}")
-```
-
-```
-Words with level B2 and higher: 16
-benefactors NNS 6.00 C2
-bristlecone NN 6.00 C2
-evolved VBN 4.00 B2
-fungi NNS 5.20 C1
-living NN 4.00 B2
-longevity NN 5.96 C2
-masters NNS 4.00 B2
-mighty JJ 4.00 B2
-observers NNS 4.00 B2
-pines NNS 4.00 B2
-potential JJ 4.00 B2
-sequoias NNS 6.00 C2
-thrives VBZ 5.86 C2
-underground RB 4.00 B2
-wildfires NNS 6.00 C2
-withstand VB 5.12 C1
-```
+ ```py
+ def filter_for_desired_level(level_tokens: list[tuple[str, str, bool, float, int, int]],
+ min_level: float | int = 1.0, max_level: float | int = 6.0
+ ) -> set[tuple[str, str, bool, float, int, int]]:
+ filtered_tokens = set()
+ for token in level_tokens:
+ level = token[3]
+
+ if level and level >= min_level and level <= max_level:
+ filtered_tokens.add(token)
+
+ return filtered_tokens
+
+
+ # You can also set min/max level as an int or float in range from 1 to 6
+    desired_min_level = CEFRLevel.B2
+ desired_level_words_set = filter_for_desired_level(tokens, min_level=int(desired_min_level))
+
+ desired_level_words_list = list(desired_level_words_set)
+ desired_level_words_list.sort()
+
+ print(f'\tWords with level {desired_min_level} and higher: {len(desired_level_words_list)}')
+ for word_data in desired_level_words_list:
+ word, pos, _, level, _, _ = word_data
+ print(f"{word.ljust(26)} {pos.ljust(6)} {'{:.2f}'.format(level).ljust(6)} {CEFRLevel(round(level))}")
+ ```
+
+ ```text
+ Words with level B2 and higher: 16
+ benefactors NNS 6.00 C2
+ bristlecone NN 6.00 C2
+ evolved VBN 4.00 B2
+ fungi NNS 5.20 C1
+ living NN 4.00 B2
+ longevity NN 5.96 C2
+ masters NNS 4.00 B2
+ mighty JJ 4.00 B2
+ observers NNS 4.00 B2
+ pines NNS 4.00 B2
+ potential JJ 4.00 B2
+ sequoias NNS 6.00 C2
+ thrives VBZ 5.86 C2
+ underground RB 4.00 B2
+ wildfires NNS 6.00 C2
+ withstand VB 5.12 C1
+ ```
2. Get CEFR statistic of the text:
-```py
-def get_word_level_count_statistic(level_tokens: list[tuple[str, str, bool, float, int, int]]) -> list[int]:
- difficulty_levels_count = [0] * 6
- for token in level_tokens:
- level = token[3]
- if not level:
- continue
-
- level_round = round(level)
- difficulty_levels_count[level_round - 1] += 1
-
- return difficulty_levels_count
-
-difficulty_levels_count = get_word_level_count_statistic(tokens)
-print('CEFR statistic (total words):')
-for i in range(1, 7):
- print(f'{CEFRLevel(i)}: {difficulty_levels_count[i - 1]}')
-```
+ ```py
+ def get_word_level_count_statistic(level_tokens: list[tuple[str, str, bool, float, int, int]]) -> list[int]:
+ difficulty_levels_count = [0] * 6
+ for token in level_tokens:
+ level = token[3]
+ if not level:
+ continue
-```
-CEFR statistic (total words):
-A1: 136
-A2: 36
-B1: 27
-B2: 11
-C1: 2
-C2: 6
-```
-
-3. Get CEFR statistic for unique words in the text:
-
-```py
-def get_word_level_count_statistic_unique(level_tokens: list[tuple[str, str, bool, float, int, int]]) -> list[int]:
- processed_word_pos_set = set()
- difficulty_levels_count = [0] * 6
- for token in level_tokens:
- level = token[3]
- if not level:
- continue
-
- to_check_tuple = (token[0], token[1])
- if not to_check_tuple in processed_word_pos_set:
- level_round = round(token[3])
+ level_round = round(level)
difficulty_levels_count[level_round - 1] += 1
- processed_word_pos_set.add(to_check_tuple)
- return difficulty_levels_count
+ return difficulty_levels_count
+ difficulty_levels_count = get_word_level_count_statistic(tokens)
+ print('CEFR statistic (total words):')
+ for i in range(1, 7):
+ print(f'{CEFRLevel(i)}: {difficulty_levels_count[i - 1]}')
+ ```
-difficulty_levels_count_unique = get_word_level_count_statistic_unique(tokens)
-print('CEFR statistic (unique words):')
-for i in range(1, 7):
- print(f'{CEFRLevel(i)}: {difficulty_levels_count_unique[i - 1]}')
-```
+ ```text
+ CEFR statistic (total words):
+ A1: 136
+ A2: 36
+ B1: 27
+ B2: 11
+ C1: 2
+ C2: 6
+ ```
-```
-CEFR statistic (unique words):
-A1: 77
-A2: 33
-B1: 23
-B2: 11
-C1: 2
-C2: 6
-```
+3. Get CEFR statistic for unique words in the text:
+
+ ```py
+ def get_word_level_count_statistic_unique(level_tokens: list[tuple[str, str, bool, float, int, int]]) -> list[int]:
+ processed_word_pos_set = set()
+ difficulty_levels_count = [0] * 6
+ for token in level_tokens:
+ level = token[3]
+ if not level:
+ continue
+
+ to_check_tuple = (token[0], token[1])
+ if not to_check_tuple in processed_word_pos_set:
+ level_round = round(token[3])
+ difficulty_levels_count[level_round - 1] += 1
+ processed_word_pos_set.add(to_check_tuple)
+
+ return difficulty_levels_count
+
+
+ difficulty_levels_count_unique = get_word_level_count_statistic_unique(tokens)
+ print('CEFR statistic (unique words):')
+ for i in range(1, 7):
+ print(f'{CEFRLevel(i)}: {difficulty_levels_count_unique[i - 1]}')
+ ```
+
+ ```text
+ CEFR statistic (unique words):
+ A1: 77
+ A2: 33
+ B1: 23
+ B2: 11
+ C1: 2
+ C2: 6
+ ```
4. Get set of not found CEFR levels for words in text:
-```py
-def get_not_found_words(level_tokens: list[tuple[str, str, bool, float, int, int]]) -> set[str]:
- not_found_words = set()
- for token in level_tokens:
- if token[2]:
- continue
+ ```py
+ def get_not_found_words(level_tokens: list[tuple[str, str, bool, float, int, int]]) -> set[str]:
+ not_found_words = set()
+ for token in level_tokens:
+ if token[2]:
+ continue
- if not token[3]:
- not_found_words.add(token[0])
+ if not token[3]:
+ not_found_words.add(token[0])
- return not_found_words
+ return not_found_words
-not_found_words_set = get_not_found_words(tokens)
-not_found_words_list = list(not_found_words_set)
-not_found_words_list.sort()
+ not_found_words_set = get_not_found_words(tokens)
+ not_found_words_list = list(not_found_words_set)
+ not_found_words_list.sort()
-print('Not found words:', len(not_found_words_list))
-if len(not_found_words_list):
- print('\n'.join(not_found_words_list))
-```
+ print('Not found words:', len(not_found_words_list))
+ if len(not_found_words_list):
+ print('\n'.join(not_found_words_list))
+ ```
-```
-Not found words: 0
-```
+ ```text
+ Not found words: 0
+ ```
# Additional features
@@ -329,7 +340,7 @@ print(analyzer.get_pos_level_dict_for_word("test"))
print(analyzer.get_pos_level_dict_for_word("test", pos_tag_as_string=True, word_level_as_float=True))
```
-## Checking if a word exists in the database
+### Checking if a word exists in the database
```py
from cefrpy import CEFRAnalyzer
@@ -411,6 +422,7 @@ analyzer = CEFRAnalyzer()
```
### Iterating over words with a specific length (alphabetical order)
+
```py
iteration_limit = 10
word_list = []
@@ -469,7 +481,7 @@ for word, pos_tag in analyzer.yield_word_pos(word_length_sort=True):
print(word_pos_list)
```
-### Iterating over words with their pos as str and levels as float in reversed alphabetical order with word length priority descending
+### Iterating over words with their pos as str and levels as float in reversed alphabetical order with word length priority descending
```py
iteration_limit = 3
@@ -484,12 +496,14 @@ for word, pos_tag, level in analyzer.yield_word_pos_level(word_length_sort=True,
print(word_pos_list)
```
-
# License
+
This project is licensed under the MIT License - see the LICENSE file for details.
# Acknowledgments
+
I would like to acknowledge the contributions of the following resources. I used them to create my initial SQLite version [Words-CEFR-Dataset](https://github.com/Maximax67/Words-CEFR-Dataset):
+
- [Spacy](https://spacy.io/)
- [CEFR-J](https://cefr-j.org/)
- [LemmInflect](https://github.com/bjascob/LemmInflect)
@@ -497,6 +511,7 @@ I would like to acknowledge the contributions of the following resources. I used
- [List of pos tags form Penn Treebank Project](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html)
Also I used these resources to create my [valid English words list](https://github.com/Maximax67/English-Valid-Words):
+
- [Word list by infochimps (archived)](https://web.archive.org/web/20131118073324/https://www.infochimps.com/datasets/word-list-350000-simple-english-words-excel-readable)
- [English words github repo by dwyl](https://github.com/dwyl/english-words)
- [NLTK (Natural Language Toolkit)](https://www.nltk.org/)
diff --git a/docs/index.rst b/docs/index.rst
index 8eeeab8..d45f495 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -18,5 +18,3 @@ Indices and tables
* :ref:`genindex`
* :ref:`modindex`
-
-.. mdinclude:: docs.md
diff --git a/setup.cfg b/setup.cfg
index c587c0e..d8091fd 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,11 +1,14 @@
[metadata]
name = cefrpy
-version = 1.0.1
-author = Maxim Belikov
+version = 1.0.2
+author = Maksym Bielikov
author_email = maximax6767@gmail.com
description = Python package for analyzing words based on the CEFR level.
long_description = file: README.md, LICENSE
long_description_content_type = text/markdown
+project_urls =
+ Source = https://github.com/Maximax67/cefrpy
+ Bug Tracker = https://github.com/Maximax67/cefrpy/issues
classifiers =
Programming Language :: Python :: 3
License :: OSI Approved :: MIT License
@@ -19,4 +22,4 @@ python_requires = >=3.6
include_package_data = True
[options.packages.find]
-where = src
\ No newline at end of file
+where = src
diff --git a/src/cefrpy/CEFRAnalyzer.py b/src/cefrpy/CEFRAnalyzer.py
index ab9a0dc..59a1362 100644
--- a/src/cefrpy/CEFRAnalyzer.py
+++ b/src/cefrpy/CEFRAnalyzer.py
@@ -26,8 +26,12 @@ def __init__(self, data_processor: CEFRDataProcessor = CEFRDataProcessor()) -> N
"""
self._data_processor = data_processor
-
- def get_word_pos_level_float(self, word: str, pos_tag: Union[str, POSTag], avg_level_not_found_pos: bool = False) -> Union[float, None]:
+ def get_word_pos_level_float(
+ self,
+ word: str,
+ pos_tag: Union[str, POSTag],
+ avg_level_not_found_pos: bool = False,
+ ) -> Union[float, None]:
"""
Get the level of a word's part of speech.
@@ -46,10 +50,16 @@ def get_word_pos_level_float(self, word: str, pos_tag: Union[str, POSTag], avg_l
pos_tag_id = inf
- return self._data_processor.get_word_level_for_pos_id(word, pos_tag_id, avg_level_not_found_pos)
-
+ return self._data_processor.get_word_level_for_pos_id(
+ word, pos_tag_id, avg_level_not_found_pos
+ )
- def get_word_pos_level_CEFR(self, word: str, pos_tag: Union[str, POSTag], avg_level_not_found_pos: bool = False) -> Union[CEFRLevel, None]:
+ def get_word_pos_level_CEFR(
+ self,
+ word: str,
+ pos_tag: Union[str, POSTag],
+ avg_level_not_found_pos: bool = False,
+ ) -> Union[CEFRLevel, None]:
"""
Get the CEFR level of a word's part of speech.
@@ -61,13 +71,14 @@ def get_word_pos_level_CEFR(self, word: str, pos_tag: Union[str, POSTag], avg_le
Returns:
Union[CEFRLevel, None]: The level of the word's part of speech, or None if not found.
"""
- float_level = self.get_word_pos_level_float(word, pos_tag, avg_level_not_found_pos)
+ float_level = self.get_word_pos_level_float(
+ word, pos_tag, avg_level_not_found_pos
+ )
if float_level is None:
return
return CEFRLevel(round(float_level))
-
def get_average_word_level_float(self, word: str) -> Union[float, None]:
"""
Get the average level of the word.
@@ -80,7 +91,6 @@ def get_average_word_level_float(self, word: str) -> Union[float, None]:
"""
return self._data_processor.get_word_level_for_pos_id(word, inf, True)
-
def get_average_word_level_CEFR(self, word: str) -> Union[CEFRLevel, None]:
"""
Get the average CEFR level of the word.
@@ -97,7 +107,6 @@ def get_average_word_level_CEFR(self, word: str) -> Union[CEFRLevel, None]:
return CEFRLevel(round(float_level))
-
def get_all_pos_for_word_as_str(self, word: str) -> list[str]:
"""
Retrieves the names of all part-of-speech tags associated with a given word.
@@ -106,7 +115,7 @@ def get_all_pos_for_word_as_str(self, word: str) -> list[str]:
word (str): The word to retrieve part-of-speech tags for.
Returns:
- list[str]: A list of strings representing the names of the part-of-speech tags associated with the word.
+ list[str]: A list of strings representing the names of the part-of-speech tags associated with the word.
If the word is not found in the data, an empty list is returned.
"""
pos_tags = self._data_processor.get_all_pos_for_word(word)
@@ -118,7 +127,6 @@ def get_all_pos_for_word_as_str(self, word: str) -> list[str]:
return pos_tags_str_list
-
def get_all_pos_for_word(self, word: str) -> list[POSTag]:
"""
Retrieves all part-of-speech tags associated with a given word as POSTag enums.
@@ -127,7 +135,7 @@ def get_all_pos_for_word(self, word: str) -> list[POSTag]:
word (str): The word to retrieve part-of-speech tags for.
Returns:
- list[POSTag]: A list of POSTag enums representing the part-of-speech tags associated with the word.
+ list[POSTag]: A list of POSTag enums representing the part-of-speech tags associated with the word.
If the word is not found in the data, an empty list is returned.
"""
pos_tags = self._data_processor.get_all_pos_for_word(word)
@@ -138,9 +146,12 @@ def get_all_pos_for_word(self, word: str) -> list[POSTag]:
return pos_tags_list
-
- def get_pos_level_dict_for_word(self, word: str, pos_tag_as_string: bool = False,
- word_level_as_float: bool = False) -> dict[Union[str, POSTag], Union[float, CEFRLevel]]:
+ def get_pos_level_dict_for_word(
+ self,
+ word: str,
+ pos_tag_as_string: bool = False,
+ word_level_as_float: bool = False,
+ ) -> dict[Union[str, POSTag], Union[float, CEFRLevel]]:
"""
Retrieves a dictionary mapping part-of-speech tags to their associated CEFR levels for a given word.
@@ -174,7 +185,6 @@ def get_pos_level_dict_for_word(self, word: str, pos_tag_as_string: bool = False
return pos_and_levels_formatted
-
def get_max_word_len(self) -> int:
"""
Get the maximum word length available in the data.
@@ -184,7 +194,6 @@ def get_max_word_len(self) -> int:
"""
return self._data_processor.get_max_word_len()
-
def is_word_in_database(self, word: str) -> bool:
"""
Check if a word is in the DataReader database.
@@ -197,7 +206,6 @@ def is_word_in_database(self, word: str) -> bool:
"""
return self._data_processor.is_word_in_database(word)
-
def is_word_pos_id_database(self, word: str, pos_tag: Union[str, POSTag]) -> bool:
"""
Check if a word pos is in the database.
@@ -215,7 +223,6 @@ def is_word_pos_id_database(self, word: str, pos_tag: Union[str, POSTag]) -> boo
return self._data_processor.is_word_pos_id_database(word, pos_tag_id)
-
def yield_words_with_length(self, word_length: int, reverse_order: bool = False):
"""
Yield words of a specific length from the database.
@@ -229,7 +236,6 @@ def yield_words_with_length(self, word_length: int, reverse_order: bool = False)
"""
return self._data_processor.yield_words_with_length(word_length, reverse_order)
-
def yield_words(self, reverse_order: bool = False, word_length_sort: bool = False):
"""
Yield all words in the database.
@@ -243,8 +249,12 @@ def yield_words(self, reverse_order: bool = False, word_length_sort: bool = Fals
"""
return self._data_processor.yield_words(reverse_order, word_length_sort)
-
- def yield_word_pos_with_length(self, word_length: int, reverse_order: bool = False, pos_tag_as_string: bool = False):
+ def yield_word_pos_with_length(
+ self,
+ word_length: int,
+ reverse_order: bool = False,
+ pos_tag_as_string: bool = False,
+ ):
"""
Yield words of a specific length with their associated part-of-speech tag IDs from the database.
@@ -263,11 +273,17 @@ def yield_word_pos_with_length(self, word_length: int, reverse_order: bool = Fal
else:
pos_converter = lambda x: POSTag(x)
- for word, pos_tag_id in self._data_processor.yield_word_pos_id_with_length(word_length, reverse_order):
+ for word, pos_tag_id in self._data_processor.yield_word_pos_id_with_length(
+ word_length, reverse_order
+ ):
yield (word, pos_converter(pos_tag_id))
-
- def yield_word_pos(self, reverse_order: bool = False, pos_tag_as_string: bool = False, word_length_sort: bool = False):
+ def yield_word_pos(
+ self,
+ reverse_order: bool = False,
+ pos_tag_as_string: bool = False,
+ word_length_sort: bool = False,
+ ):
"""
Yield all words with their associated part-of-speech tag IDs from the database.
@@ -286,12 +302,18 @@ def yield_word_pos(self, reverse_order: bool = False, pos_tag_as_string: bool =
else:
pos_converter = lambda x: POSTag(x)
- for word, pos_tag_id in self._data_processor.yield_word_pos_id(reverse_order, word_length_sort):
+ for word, pos_tag_id in self._data_processor.yield_word_pos_id(
+ reverse_order, word_length_sort
+ ):
yield (word, pos_converter(pos_tag_id))
-
- def yield_word_pos_level_with_length(self, word_length: int, reverse_order: bool = False,
- pos_tag_as_string: bool = False, word_level_as_float: bool = False):
+ def yield_word_pos_level_with_length(
+ self,
+ word_length: int,
+ reverse_order: bool = False,
+ pos_tag_as_string: bool = False,
+ word_level_as_float: bool = False,
+ ):
"""
Yield words of a specific length, their part-of-speech tags, and their CEFR levels from the database based on the specified criteria.
@@ -302,7 +324,7 @@ def yield_word_pos_level_with_length(self, word_length: int, reverse_order: bool
word_level_as_float (bool, optional): If True, yield CEFR levels as floats instead of CEFRLevel enums. Defaults to False.
Yields:
- tuple: A tuple containing the word, its part-of-speech tag, and its CEFR level. If `pos_tag_as_string` is True, the part-of-speech tag is a string,
+ tuple: A tuple containing the word, its part-of-speech tag, and its CEFR level. If `pos_tag_as_string` is True, the part-of-speech tag is a string,
otherwise, it's a POSTag enum. If `word_level_as_float` is True, the level is a float, otherwise, it's a CEFRLevel enum.
"""
if pos_tag_as_string:
@@ -311,17 +333,33 @@ def yield_word_pos_level_with_length(self, word_length: int, reverse_order: bool
pos_converter = lambda x: POSTag(x)
if word_level_as_float:
- for word, pos_tag_id, level in self._data_processor.yield_word_pos_level_with_length(word_length, reverse_order):
+ for (
+ word,
+ pos_tag_id,
+ level,
+ ) in self._data_processor.yield_word_pos_level_with_length(
+ word_length, reverse_order
+ ):
yield (word, pos_converter(pos_tag_id), level)
return
- for word, pos_tag_id, level in self._data_processor.yield_word_pos_level_with_length(word_length, reverse_order):
+ for (
+ word,
+ pos_tag_id,
+ level,
+ ) in self._data_processor.yield_word_pos_level_with_length(
+ word_length, reverse_order
+ ):
yield (word, pos_converter(pos_tag_id), CEFRLevel(round(level)))
-
- def yield_word_pos_level(self, reverse_order: bool = False, pos_tag_as_string: bool = False,
- word_level_as_float: bool = False, word_length_sort: bool = False):
+ def yield_word_pos_level(
+ self,
+ reverse_order: bool = False,
+ pos_tag_as_string: bool = False,
+ word_level_as_float: bool = False,
+ word_length_sort: bool = False,
+ ):
"""
Yield all words, their part-of-speech tags, and their CEFR levels from the database based on the specified criteria.
@@ -332,7 +370,7 @@ def yield_word_pos_level(self, reverse_order: bool = False, pos_tag_as_string: b
word_length_sort (bool): If True, yields data sorted by word length.
Yields:
- tuple: A tuple containing the word, its part-of-speech tag, and its CEFR level. If `pos_tag_as_string` is True, the part-of-speech tag is a string,
+ tuple: A tuple containing the word, its part-of-speech tag, and its CEFR level. If `pos_tag_as_string` is True, the part-of-speech tag is a string,
otherwise, it's a POSTag enum. If `word_level_as_float` is True, the level is a float, otherwise, it's a CEFRLevel enum.
"""
if pos_tag_as_string:
@@ -341,15 +379,18 @@ def yield_word_pos_level(self, reverse_order: bool = False, pos_tag_as_string: b
pos_converter = lambda x: POSTag(x)
if word_level_as_float:
- for word, pos_tag_id, level in self._data_processor.yield_word_pos_level(reverse_order, word_length_sort):
+ for word, pos_tag_id, level in self._data_processor.yield_word_pos_level(
+ reverse_order, word_length_sort
+ ):
yield (word, pos_converter(pos_tag_id), level)
return
- for word, pos_tag_id, level in self._data_processor.yield_word_pos_level(reverse_order, word_length_sort):
+ for word, pos_tag_id, level in self._data_processor.yield_word_pos_level(
+ reverse_order, word_length_sort
+ ):
yield (word, pos_converter(pos_tag_id), CEFRLevel(round(level)))
-
def get_word_count_for_length(self, word_length: int) -> int:
"""
Count the number of words of a specific length in the data.
@@ -362,7 +403,6 @@ def get_word_count_for_length(self, word_length: int) -> int:
"""
return self._data_processor.get_word_count_for_length(word_length)
-
def get_total_words(self) -> int:
"""
Get the total count of words in the data.
@@ -372,7 +412,6 @@ def get_total_words(self) -> int:
"""
return self._data_processor.get_total_words()
-
def get_word_pos_count_for_length(self, word_length: int) -> int:
"""
Count the number of positions in the data where words of a specific length start.
@@ -385,7 +424,6 @@ def get_word_pos_count_for_length(self, word_length: int) -> int:
"""
return self._data_processor.get_word_pos_count_for_length(word_length)
-
def get_word_pos_count(self) -> int:
"""
Get the total count of positions in the data where words start, across all word lengths.
@@ -395,7 +433,6 @@ def get_word_pos_count(self) -> int:
"""
return self._data_processor.get_word_pos_count()
-
@staticmethod
def get_pos_tag_id(pos_tag: Union[str, POSTag]) -> Union[int, None]:
"""
diff --git a/src/cefrpy/CEFRDataProcessor.py b/src/cefrpy/CEFRDataProcessor.py
index ce7e529..ab0105f 100644
--- a/src/cefrpy/CEFRDataProcessor.py
+++ b/src/cefrpy/CEFRDataProcessor.py
@@ -4,9 +4,10 @@
from heapq import heapify, heappush, heappop
from .CEFRDataReader import CEFRDataReader
+from .CEFRDataValidator import VALID_WORD_CHARACTERS
-class HeapqReverseDataWrapper():
+class HeapqReverseDataWrapper:
"""
Wrapper class to reverse the ordering of data when using heapq.
@@ -22,6 +23,7 @@ class HeapqReverseDataWrapper():
Methods:
__lt__(self, other): Less-than comparison method used to determine the ordering of the wrapped data.
"""
+
def __init__(self, data) -> None:
"""
Initialize the HeapqReverseDataWrapper instance.
@@ -61,7 +63,6 @@ def __init__(self, data_reader: CEFRDataReader = CEFRDataReader()) -> None:
"""
self._data_reader = data_reader
-
def get_max_word_len(self) -> int:
"""
Get the maximum word length available in the data.
@@ -71,7 +72,6 @@ def get_max_word_len(self) -> int:
"""
return self._data_reader.get_wlp_len() - 1
-
def is_word_len_valid(self, word_len: int) -> bool:
"""
Check if the word length is valid.
@@ -84,7 +84,6 @@ def is_word_len_valid(self, word_len: int) -> bool:
"""
return 0 < word_len < self._data_reader.get_wlp_len()
-
def _get_first_word_match_pos(self, word_packed: bytes) -> int:
"""
Get the position of the first occurrence of a word in the data.
@@ -123,8 +122,9 @@ def _get_first_word_match_pos(self, word_packed: bytes) -> int:
return -1
-
- def _get_int_word_level_for_pos_id(self, word_packed: bytes, pos_tag_id: int, avg_level_not_found_pos: bool = False) -> Union[int, None]:
+ def _get_int_word_level_for_pos_id(
+ self, word_packed: bytes, pos_tag_id: int, avg_level_not_found_pos: bool = False
+ ) -> Union[int, None]:
"""
Get the packed level of a word's part of speech.
@@ -183,7 +183,6 @@ def _get_int_word_level_for_pos_id(self, word_packed: bytes, pos_tag_id: int, av
m = first_match
-
else:
while True:
m += data_block_len
@@ -224,7 +223,9 @@ def _get_int_word_level_for_pos_id(self, word_packed: bytes, pos_tag_id: int, av
i += 1
else:
founded_pos += 1
- level_accumulator += self._data_reader.get_data_array_value_at(i + 1)
+ level_accumulator += self._data_reader.get_data_array_value_at(
+ i + 1
+ )
continue
break
@@ -232,7 +233,6 @@ def _get_int_word_level_for_pos_id(self, word_packed: bytes, pos_tag_id: int, av
if avg_level_not_found_pos:
return round(level_accumulator / founded_pos)
-
def _get_word_data_range(self, word: str) -> Union[range, None]:
"""
Determines the range of data associated with a given word.
@@ -247,6 +247,9 @@ def _get_word_data_range(self, word: str) -> Union[range, None]:
if not self.is_word_len_valid(len(word)):
return
+ if not self._is_word_chars_valid(word):
+ return
+
word_packed = self.pack_word(word)
first_match = self._get_first_word_match_pos(word_packed)
if first_match == -1:
@@ -300,7 +303,6 @@ def _get_word_data_range(self, word: str) -> Union[range, None]:
return range(start_range, end_range, data_block_len)
-
def get_all_pos_for_word(self, word: str) -> list[int]:
"""
Retrieves the IDs of all part-of-speech tags associated with a given word.
@@ -309,7 +311,7 @@ def get_all_pos_for_word(self, word: str) -> list[int]:
word (str): The word to retrieve part-of-speech tags for.
Returns:
- list[int]: A list of IDs representing the part-of-speech tags associated with the word.
+ list[int]: A list of IDs representing the part-of-speech tags associated with the word.
If the word is not found in the data, an empty list is returned.
"""
data_range = self._get_word_data_range(word)
@@ -323,7 +325,6 @@ def get_all_pos_for_word(self, word: str) -> list[int]:
return pos_list
-
def get_pos_level_dict_for_word(self, word: str) -> dict[int, float]:
"""
Retrieves a dictionary mapping part-of-speech tag IDs to their associated CEFR levels for a given word.
@@ -349,8 +350,9 @@ def get_pos_level_dict_for_word(self, word: str) -> dict[int, float]:
return result
-
- def get_word_level_for_pos_id(self, word: str, pos_tag_id: int, avg_level_not_found_pos: bool = False) -> Union[float, None]:
+ def get_word_level_for_pos_id(
+ self, word: str, pos_tag_id: int, avg_level_not_found_pos: bool = False
+ ) -> Union[float, None]:
"""
Get the level of a word's part of speech.
@@ -365,13 +367,17 @@ def get_word_level_for_pos_id(self, word: str, pos_tag_id: int, avg_level_not_fo
if not self.is_word_len_valid(len(word)):
return
+ if not self._is_word_chars_valid(word):
+ return
+
word_packed = self.pack_word(word)
- level = self._get_int_word_level_for_pos_id(word_packed, pos_tag_id, avg_level_not_found_pos)
+ level = self._get_int_word_level_for_pos_id(
+ word_packed, pos_tag_id, avg_level_not_found_pos
+ )
if level is not None:
return self.byte_int_level_to_float(level)
-
def is_word_in_database(self, word: str) -> bool:
"""
Check if a word is in the database.
@@ -385,11 +391,13 @@ def is_word_in_database(self, word: str) -> bool:
if not self.is_word_len_valid(len(word)):
return False
+ if not self._is_word_chars_valid(word):
+ return False
+
word_packed = self.pack_word(word)
return self._get_first_word_match_pos(word_packed) != -1
-
def is_word_pos_id_database(self, word: str, pos_tag_id: int) -> bool:
"""
Check if a word pos is in the database.
@@ -403,7 +411,6 @@ def is_word_pos_id_database(self, word: str, pos_tag_id: int) -> bool:
"""
return self.get_word_level_for_pos_id(word, pos_tag_id) is not None
-
def _unpack_word_in_data_array(self, i: int, word_length: int) -> str:
"""
Unpack a word in the data array starting from index 'i' with a given length.
@@ -425,8 +432,9 @@ def _unpack_word_in_data_array(self, i: int, word_length: int) -> str:
return word
-
- def _get_word_yield_start_block_range(self, word_length: int, reverse_order: bool = False):
+ def _get_word_yield_start_block_range(
+ self, word_length: int, reverse_order: bool = False
+ ):
"""
Get the range of block indices to start yielding words of a specific length.
@@ -444,11 +452,14 @@ def _get_word_yield_start_block_range(self, word_length: int, reverse_order: boo
if reverse_order:
# This approach should be faster than reversed(range(...)):
# https://stackoverflow.com/a/7286465/15070145
- return range(segment_end - data_block_len, segment_start - data_block_len, -data_block_len)
+ return range(
+ segment_end - data_block_len,
+ segment_start - data_block_len,
+ -data_block_len,
+ )
return range(segment_start, segment_end, data_block_len)
-
def yield_words_with_length(self, word_length: int, reverse_order: bool = False):
"""
Yield words of a specific length from the database.
@@ -463,7 +474,9 @@ def yield_words_with_length(self, word_length: int, reverse_order: bool = False)
if not self.is_word_len_valid(word_length):
return
- start_block_range = self._get_word_yield_start_block_range(word_length, reverse_order)
+ start_block_range = self._get_word_yield_start_block_range(
+ word_length, reverse_order
+ )
last_word = None
for i in start_block_range:
@@ -473,8 +486,9 @@ def yield_words_with_length(self, word_length: int, reverse_order: bool = False)
yield word
last_word = word
-
- def yield_word_pos_id_with_length(self, word_length: int, reverse_order: bool = False):
+ def yield_word_pos_id_with_length(
+ self, word_length: int, reverse_order: bool = False
+ ):
"""
Yield words of a specific length with their associated part-of-speech tag IDs from the database.
@@ -489,7 +503,9 @@ def yield_word_pos_id_with_length(self, word_length: int, reverse_order: bool =
if not self.is_word_len_valid(word_length):
return
- start_block_range = self._get_word_yield_start_block_range(word_length, reverse_order)
+ start_block_range = self._get_word_yield_start_block_range(
+ word_length, reverse_order
+ )
for i in start_block_range:
word = self._unpack_word_in_data_array(i, word_length)
@@ -497,8 +513,9 @@ def yield_word_pos_id_with_length(self, word_length: int, reverse_order: bool =
yield (word, word_pos)
-
- def yield_word_pos_level_with_length(self, word_length: int, reverse_order: bool = False):
+ def yield_word_pos_level_with_length(
+ self, word_length: int, reverse_order: bool = False
+ ):
"""
Yield words of a specific length with their part-of-speech tag IDs and levels from the database.
@@ -513,7 +530,9 @@ def yield_word_pos_level_with_length(self, word_length: int, reverse_order: bool
if not self.is_word_len_valid(word_length):
return
- start_block_range = self._get_word_yield_start_block_range(word_length, reverse_order)
+ start_block_range = self._get_word_yield_start_block_range(
+ word_length, reverse_order
+ )
for i in start_block_range:
word = self._unpack_word_in_data_array(i, word_length)
@@ -525,8 +544,12 @@ def yield_word_pos_level_with_length(self, word_length: int, reverse_order: bool
yield (word, word_pos, word_level_float)
-
- def _yield_all_data(self, yield_method_with_word_length: callable, reverse_order: bool, word_lenght_sort: bool):
+ def _yield_all_data(
+ self,
+ yield_method_with_word_length: callable,
+ reverse_order: bool,
+ word_lenght_sort: bool,
+ ):
"""
Yields data from various generators based on word length.
@@ -552,7 +575,10 @@ def _yield_all_data(self, yield_method_with_word_length: callable, reverse_order
return
- generators = [yield_method_with_word_length(i, reverse_order) for i in range(1, max_word_len + 1)]
+ generators = [
+ yield_method_with_word_length(i, reverse_order)
+ for i in range(1, max_word_len + 1)
+ ]
words_heap = []
heapify(words_heap)
@@ -593,7 +619,6 @@ def _yield_all_data(self, yield_method_with_word_length: callable, reverse_order
except StopIteration:
pass
-
def yield_words(self, reverse_order: bool = False, word_lenght_sort: bool = False):
"""
Yield all words in the database.
@@ -605,10 +630,13 @@ def yield_words(self, reverse_order: bool = False, word_lenght_sort: bool = Fals
Yields:
str: A word from the database.
"""
- return self._yield_all_data(self.yield_words_with_length, reverse_order, word_lenght_sort)
-
+ return self._yield_all_data(
+ self.yield_words_with_length, reverse_order, word_lenght_sort
+ )
- def yield_word_pos_id(self, reverse_order: bool = False, word_lenght_sort: bool = False):
+ def yield_word_pos_id(
+ self, reverse_order: bool = False, word_lenght_sort: bool = False
+ ):
"""
Yield words with their part-of-speech tag IDs from the database.
@@ -619,10 +647,13 @@ def yield_word_pos_id(self, reverse_order: bool = False, word_lenght_sort: bool
Yields:
tuple[str, int]: A tuple containing a word from the database and its associated part-of-speech tag ID.
"""
- return self._yield_all_data(self.yield_word_pos_id_with_length, reverse_order, word_lenght_sort)
+ return self._yield_all_data(
+ self.yield_word_pos_id_with_length, reverse_order, word_lenght_sort
+ )
-
- def yield_word_pos_level(self, reverse_order: bool = False, word_lenght_sort: bool = False):
+ def yield_word_pos_level(
+ self, reverse_order: bool = False, word_lenght_sort: bool = False
+ ):
"""
Yield words with their part-of-speech tag IDs and levels from the database.
@@ -634,8 +665,9 @@ def yield_word_pos_level(self, reverse_order: bool = False, word_lenght_sort: bo
tuple[str, int, float]: A tuple containing a word from the database, its associated part-of-speech tag ID,
and its level.
"""
- return self._yield_all_data(self.yield_word_pos_level_with_length, reverse_order, word_lenght_sort)
-
+ return self._yield_all_data(
+ self.yield_word_pos_level_with_length, reverse_order, word_lenght_sort
+ )
def get_word_count_for_length(self, word_length: int) -> int:
"""
@@ -664,13 +696,14 @@ def get_word_count_for_length(self, word_length: int) -> int:
for k in range(j + 1, word_length):
array_pos += 1
- last_word[k] = self._data_reader.get_data_array_value_at(array_pos)
+ last_word[k] = self._data_reader.get_data_array_value_at(
+ array_pos
+ )
break
return unique_words_counter
-
def get_total_words(self) -> int:
"""
Get the total count of words in the data.
@@ -686,7 +719,6 @@ def get_total_words(self) -> int:
return counter
-
def get_word_pos_count_for_length(self, word_length: int) -> int:
"""
Count the number of positions in the data where words of a specific length start.
@@ -706,7 +738,6 @@ def get_word_pos_count_for_length(self, word_length: int) -> int:
return (segment_end - segment_start) // data_block_len
-
def get_word_pos_count(self) -> int:
"""
Get the total count of positions in the data where words start, across all word lengths.
@@ -722,7 +753,6 @@ def get_word_pos_count(self) -> int:
return counter
-
@staticmethod
def pack_word(word: str) -> bytes:
"""
@@ -734,8 +764,24 @@ def pack_word(word: str) -> bytes:
Returns:
bytes: The packed representation of the word.
"""
- return struct.pack('B' * len(word), *map(ord, word))
+ return struct.pack("B" * len(word), *map(ord, word))
+
+ @staticmethod
+ def _is_word_chars_valid(word: str) -> bool:
+ """
+ Check whether every character in the word is a valid lowercase ASCII letter.
+
+ Characters with code points above 255 (e.g. 'あ', '中') would make
+ struct.pack("B", ...) raise struct.error, so any word containing a character
+ outside VALID_WORD_CHARACTERS is rejected early and callers return None/False.
+ Args:
+ word (str): The word to validate.
+
+ Returns:
+ bool: True if all characters are valid, False otherwise.
+ """
+ return all(c in VALID_WORD_CHARACTERS for c in word)
@staticmethod
def byte_int_level_to_float(level: int) -> float:
diff --git a/src/cefrpy/CEFRDataReader.py b/src/cefrpy/CEFRDataReader.py
index 2b24609..6cb0a70 100644
--- a/src/cefrpy/CEFRDataReader.py
+++ b/src/cefrpy/CEFRDataReader.py
@@ -31,13 +31,16 @@ def __init__(self, data_path: Union[str, None] = None) -> None:
Exception: If the CEFR database file content is invalid.
"""
- self.data_path = os.path.join(os.path.dirname(__file__), 'data.bin') if data_path is None else data_path
- self._wlp = array.array('I')
+ self.data_path = (
+ os.path.join(os.path.dirname(__file__), "data.bin")
+ if data_path is None
+ else data_path
+ )
+ self._wlp = array.array("I")
self._data_array = bytearray()
if not self._read_data():
- raise Exception(f'CEFR database file content is invalid: {self.data_path}')
-
+ raise Exception(f"CEFR database file content is invalid: {self.data_path}")
def _read_data(self) -> bool:
"""
@@ -46,18 +49,17 @@ def _read_data(self) -> bool:
Returns:
bool: True if the data is successfully read and valid, False otherwise.
"""
- with open(self.data_path, 'rb') as file:
- wlp_len = struct.unpack('B', file.read(1))[0]
+ with open(self.data_path, "rb") as file:
+ wlp_len = struct.unpack("B", file.read(1))[0]
if not is_wlp_length_valid(wlp_len):
return False
- wlp_data = file.read(wlp_len * struct.calcsize('I'))
+ wlp_data = file.read(wlp_len * struct.calcsize("I"))
self._wlp.frombytes(wlp_data)
self._data_array = bytearray(file.read())
return is_data_valid(self._wlp, self._data_array)
-
def get_wlp_value_at(self, i: int) -> int:
"""
Get the value at index i in the word length positions array.
@@ -76,7 +78,6 @@ def get_wlp_value_at(self, i: int) -> int:
raise IndexError("Index out of range for _wlp")
-
def get_data_array_value_at(self, i: int) -> int:
"""
Get the value at index i in the data array.
@@ -95,7 +96,6 @@ def get_data_array_value_at(self, i: int) -> int:
raise IndexError("Index out of range for _data_array")
-
def get_wlp_len(self) -> int:
"""
Get the length of the word length positions array.
@@ -105,7 +105,6 @@ def get_wlp_len(self) -> int:
"""
return len(self._wlp)
-
def get_data_array_len(self) -> int:
"""
Get the length of the data array.
diff --git a/src/cefrpy/CEFRDataValidator.py b/src/cefrpy/CEFRDataValidator.py
index cc78c14..8b1d531 100644
--- a/src/cefrpy/CEFRDataValidator.py
+++ b/src/cefrpy/CEFRDataValidator.py
@@ -72,7 +72,7 @@ def validate_data_block(data: bytearray, start_pos: int, block_length: int) -> b
"""
word_len = block_length - 2
for i in range(start_pos, start_pos + word_len):
- if not chr(data[i]) in VALID_WORD_CHARACTERS:
+ if chr(data[i]) not in VALID_WORD_CHARACTERS:
return False
if data[i + 1] > MAX_POS_TAG_ID:
diff --git a/src/cefrpy/CEFRLevel.py b/src/cefrpy/CEFRLevel.py
index 1b6dc0f..cdcce4c 100644
--- a/src/cefrpy/CEFRLevel.py
+++ b/src/cefrpy/CEFRLevel.py
@@ -1,5 +1,6 @@
from enum import Enum, unique
+
@unique
class CEFRLevel(Enum):
"""
diff --git a/src/cefrpy/CEFRSpaCyAnalyzer.py b/src/cefrpy/CEFRSpaCyAnalyzer.py
index 6f04210..405ec5e 100644
--- a/src/cefrpy/CEFRSpaCyAnalyzer.py
+++ b/src/cefrpy/CEFRSpaCyAnalyzer.py
@@ -4,7 +4,8 @@
from .CEFRAnalyzer import CEFRAnalyzer
-class CEFRSpaCyAnalyzer():
+
+class CEFRSpaCyAnalyzer:
"""
Analyze text for CEFR levels, considering provided entity types to skip and abbreviation mapping.
@@ -15,8 +16,12 @@ class CEFRSpaCyAnalyzer():
tokens (list[tuple[str, str, bool, float, int, int]]): List of token tuples containing word, POS tag, skip status, CEFR level, start index, and end index.
"""
- def __init__(self, analyzer: CEFRAnalyzer = CEFRAnalyzer(), entity_types_to_skip: Union[set[str], list[str], None] = None,
- abbreviation_mapping: Union[dict[str, str], None] = None) -> None:
+ def __init__(
+ self,
+ analyzer: CEFRAnalyzer = CEFRAnalyzer(),
+ entity_types_to_skip: Union[set[str], list[str], None] = None,
+ abbreviation_mapping: Union[dict[str, str], None] = None,
+ ) -> None:
"""
Initialize the CEFRSpaCyAnalyzer instance.
@@ -26,8 +31,12 @@ def __init__(self, analyzer: CEFRAnalyzer = CEFRAnalyzer(), entity_types_to_skip
abbreviation_mapping (Union[dict[str, str], None], optional): A dictionary mapping abbreviations to their full forms. Defaults to None.
"""
self._analyzer = analyzer
- self.entity_types_to_skip = set() if entity_types_to_skip is None else set(entity_types_to_skip)
- self.abbreviation_mapping = dict() if abbreviation_mapping is None else abbreviation_mapping
+ self.entity_types_to_skip = (
+ set() if entity_types_to_skip is None else set(entity_types_to_skip)
+ )
+ self.abbreviation_mapping = (
+ dict() if abbreviation_mapping is None else abbreviation_mapping
+ )
def _get_next_entity(self, entities_iter: Iterator):
"""
@@ -38,7 +47,9 @@ def _get_next_entity(self, entities_iter: Iterator):
except StopIteration:
return None
- def _get_word_pos_tokens_set(self, tokens: list[tuple[str, str, str, bool, int, int]]) -> set[tuple[str, str]]:
+ def _get_word_pos_tokens_set(
+ self, tokens: list[tuple[str, str, str, bool, int, int]]
+ ) -> set[tuple[str, str]]:
"""
Get unique word and POS tag tuples from tokens.
@@ -50,7 +61,9 @@ def _get_word_pos_tokens_set(self, tokens: list[tuple[str, str, str, bool, int,
"""
return {(token[1], token[2]) for token in tokens if not token[3]}
- def _fetch_word_pos_level_tokens(self, word_pos_tokens_set: set[tuple[str, str]]) -> dict[tuple[str, str], float]:
+ def _fetch_word_pos_level_tokens(
+ self, word_pos_tokens_set: set[tuple[str, str]]
+ ) -> dict[tuple[str, str], float]:
"""
Fetch CEFR levels for unique word and POS tag tuples.
@@ -62,7 +75,9 @@ def _fetch_word_pos_level_tokens(self, word_pos_tokens_set: set[tuple[str, str]]
"""
result_dict = dict()
for word, pos_tag in word_pos_tokens_set:
- level = self._analyzer.get_word_pos_level_float(word, pos_tag, avg_level_not_found_pos=True)
+ level = self._analyzer.get_word_pos_level_float(
+ word, pos_tag, avg_level_not_found_pos=True
+ )
result_dict[(word, pos_tag)] = level if level is not None else 0
return result_dict
@@ -95,15 +110,20 @@ def analyze_doc(self, doc) -> list[tuple[str, str, bool, float, int, int]]:
while current_entity and token_start > current_entity.end_char:
current_entity = self._get_next_entity(entities_iter)
- if current_entity and current_entity.label_ in self.entity_types_to_skip \
- and current_entity.start_char <= token_start < current_entity.end_char:
+ if (
+ current_entity
+ and current_entity.label_ in self.entity_types_to_skip
+ and current_entity.start_char
+ <= token_start
+ < current_entity.end_char
+ ):
to_skip = True
word = token.text.strip()
word_lower = word.lower()
word_pos = token.tag_
- if word_pos == 'POS' and word_lower == "'s":
+ if word_pos == "POS" and word_lower == "'s":
to_skip = True
else:
abbreviation_form = self.abbreviation_mapping.get(word_lower)
@@ -114,18 +134,29 @@ def analyze_doc(self, doc) -> list[tuple[str, str, bool, float, int, int]]:
if not to_skip and not word.isalpha():
to_skip = True
- nlp_tokens.append((word, word_lower, word_pos, to_skip, token_start, token_end))
+ nlp_tokens.append(
+ (word, word_lower, word_pos, to_skip, token_start, token_end)
+ )
word_pos_set = self._get_word_pos_tokens_set(nlp_tokens)
word_pos_unique_level_tokens = self._fetch_word_pos_level_tokens(word_pos_set)
self.tokens = []
- for word, word_lower, word_pos, is_skipped, token_start, token_end in nlp_tokens:
+ for (
+ word,
+ word_lower,
+ word_pos,
+ is_skipped,
+ token_start,
+ token_end,
+ ) in nlp_tokens:
if is_skipped:
level = None
else:
level = word_pos_unique_level_tokens.get((word_lower, word_pos))
- self.tokens.append((word, word_pos, is_skipped, level, token_start, token_end))
+ self.tokens.append(
+ (word, word_pos, is_skipped, level, token_start, token_end)
+ )
return self.tokens
diff --git a/src/cefrpy/POSTag.py b/src/cefrpy/POSTag.py
index 4f6ae9a..8830ed5 100644
--- a/src/cefrpy/POSTag.py
+++ b/src/cefrpy/POSTag.py
@@ -1,36 +1,37 @@
from enum import Enum, unique
POS_TAGS_DESCRIPTIONS = [
- 'Coordinating conjunction',
- 'Cardinal number',
- 'Determiner',
- 'Preposition or subordinating conjunction',
- 'Adjective',
- 'Adjective, comparative',
- 'Adjective, superlative',
- 'Modal',
- 'Noun, singular or mass',
- 'Noun, plural',
- 'Proper noun, singular',
- 'Proper noun, plural',
- 'Personal/Posessive pronoun',
- 'Adverb',
- 'Adverb, comparative',
- 'Adverb, superlative',
- 'Particle',
- 'To',
- 'Interjection',
- 'Verb, base form',
- 'Verb, past tense',
- 'Verb, gerund or present participle',
- 'Verb, past participle',
- 'Verb, non-3rd person singular present',
- 'Verb, 3rd person singular present',
- 'Wh-determiner',
- 'Wh-pronoun',
- 'Wh-adverb'
+ "Coordinating conjunction",
+ "Cardinal number",
+ "Determiner",
+ "Preposition or subordinating conjunction",
+ "Adjective",
+ "Adjective, comparative",
+ "Adjective, superlative",
+ "Modal",
+ "Noun, singular or mass",
+ "Noun, plural",
+ "Proper noun, singular",
+ "Proper noun, plural",
+ "Personal/Posessive pronoun",
+ "Adverb",
+ "Adverb, comparative",
+ "Adverb, superlative",
+ "Particle",
+ "To",
+ "Interjection",
+ "Verb, base form",
+ "Verb, past tense",
+ "Verb, gerund or present participle",
+ "Verb, past participle",
+ "Verb, non-3rd person singular present",
+ "Verb, 3rd person singular present",
+ "Wh-determiner",
+ "Wh-pronoun",
+ "Wh-adverb",
]
+
@unique
class POSTag(Enum):
"""
@@ -66,21 +67,18 @@ class POSTag(Enum):
WP = 26
WRB = 27
-
def __str__(self) -> str:
"""
Returns a string representation of the POS tag.
"""
return self.name
-
def __int__(self) -> int:
"""
Returns an integer representation of the POS tag.
"""
return self.value
-
def __eq__(self, other) -> bool:
"""
Checks if this POS tag is equal to another POS tag.
@@ -90,21 +88,18 @@ def __eq__(self, other) -> bool:
return NotImplemented
-
def __hash__(self) -> int:
"""
Returns the hash value of the POS tag.
"""
return self.value
-
def get_description(self) -> str:
"""
Retrieve the description of a POS tag.
"""
return POS_TAGS_DESCRIPTIONS[self.value]
-
@classmethod
def from_tag_name(cls, tag_name: str):
"""
@@ -125,7 +120,6 @@ def from_tag_name(cls, tag_name: str):
return tag
-
@staticmethod
def get_id_by_tag_name(tag_name: str) -> int:
"""
@@ -145,7 +139,6 @@ def get_id_by_tag_name(tag_name: str) -> int:
return POSTag[tag_name].value
-
@staticmethod
def get_tag_name_by_id(tag_id: int) -> str:
"""
@@ -165,7 +158,6 @@ def get_tag_name_by_id(tag_id: int) -> str:
raise ValueError(f"Invalid tag id: {tag_id}")
-
@staticmethod
def get_description_by_tag_name(tag_name: str) -> str:
"""
@@ -184,7 +176,6 @@ def get_description_by_tag_name(tag_name: str) -> str:
return POS_TAGS_DESCRIPTIONS[tag_id]
-
@staticmethod
def get_description_by_tag_id(tag_id: int) -> str:
"""
@@ -204,7 +195,6 @@ def get_description_by_tag_id(tag_id: int) -> str:
return POS_TAGS_DESCRIPTIONS[tag_id]
-
@staticmethod
def get_total_tags() -> int:
"""
@@ -215,7 +205,6 @@ def get_total_tags() -> int:
"""
return len(POSTag.__members__)
-
@staticmethod
def get_all_tags() -> list[str]:
"""
diff --git a/src/cefrpy/__init__.py b/src/cefrpy/__init__.py
index a580585..e92b138 100644
--- a/src/cefrpy/__init__.py
+++ b/src/cefrpy/__init__.py
@@ -5,5 +5,5 @@
from .CEFRAnalyzer import CEFRAnalyzer
from .CEFRSpaCyAnalyzer import CEFRSpaCyAnalyzer
-__version__ = "1.0.1"
+__version__ = "1.0.2"
__all__ = ["POSTag", "CEFRDataReader", "CEFRDataProcessor", "CEFRLevel", "CEFRAnalyzer", "CEFRSpaCyAnalyzer"]
diff --git a/tests/test_CEFRAnalyzer.py b/tests/test_CEFRAnalyzer.py
index 3c2fbf9..61305c1 100644
--- a/tests/test_CEFRAnalyzer.py
+++ b/tests/test_CEFRAnalyzer.py
@@ -1,7 +1,7 @@
import unittest
from random import randint
-from cefrpy import CEFRAnalyzer, CEFRDataReader, POSTag, CEFRLevel
+from cefrpy import CEFRAnalyzer, POSTag, CEFRLevel
class TestCEFRAnalyzer(unittest.TestCase):
@@ -11,7 +11,12 @@ def setUpClass(cls):
cls.valid_word_pos = POSTag.NN
cls.valid_word_unknown_pos = POSTag.CD
cls.not_valid_words_test_pos_tag = POSTag.CC
- cls.not_valid_words = ("", "@test@", "notvalidword", "toolongwordtoolongwordtoolongwordtoolongwordtoolongword")
+ cls.not_valid_words = (
+ "",
+ "@test@",
+ "notvalidword",
+ "toolongwordtoolongwordtoolongwordtoolongwordtoolongword",
+ )
cls.analyzer = CEFRAnalyzer()
def test_get_max_word_len(self):
@@ -26,30 +31,58 @@ def test_get_pos_tag_id(self):
self.assertEqual(CEFRAnalyzer.get_pos_tag_id(tag_str), tag_id)
def test_get_word_pos_level_float(self):
- valid_word_pos_level = self.analyzer.get_word_pos_level_float(self.valid_word, self.valid_word_pos, False)
- valid_avg_word_pos_level = self.analyzer.get_word_pos_level_float(self.valid_word, self.valid_word_unknown_pos, True)
- none_level = self.analyzer.get_word_pos_level_float(self.valid_word, self.valid_word_unknown_pos, False)
+ valid_word_pos_level = self.analyzer.get_word_pos_level_float(
+ self.valid_word, self.valid_word_pos, False
+ )
+ valid_avg_word_pos_level = self.analyzer.get_word_pos_level_float(
+ self.valid_word, self.valid_word_unknown_pos, True
+ )
+ none_level = self.analyzer.get_word_pos_level_float(
+ self.valid_word, self.valid_word_unknown_pos, False
+ )
self.assertIsNotNone(valid_word_pos_level)
self.assertIsNotNone(valid_avg_word_pos_level)
self.assertIsNone(none_level)
for word in self.not_valid_words:
- self.assertIsNone(self.analyzer.get_word_pos_level_float(word, self.not_valid_words_test_pos_tag, False))
- self.assertIsNone(self.analyzer.get_word_pos_level_float(word, self.not_valid_words_test_pos_tag, True))
+ self.assertIsNone(
+ self.analyzer.get_word_pos_level_float(
+ word, self.not_valid_words_test_pos_tag, False
+ )
+ )
+ self.assertIsNone(
+ self.analyzer.get_word_pos_level_float(
+ word, self.not_valid_words_test_pos_tag, True
+ )
+ )
def test_get_word_pos_level_CEFR(self):
- valid_word_pos_level = self.analyzer.get_word_pos_level_CEFR(self.valid_word, self.valid_word_pos, False)
- valid_avg_word_pos_level = self.analyzer.get_word_pos_level_CEFR(self.valid_word, self.valid_word_unknown_pos, True)
- none_level = self.analyzer.get_word_pos_level_CEFR(self.valid_word, self.valid_word_unknown_pos, False)
+ valid_word_pos_level = self.analyzer.get_word_pos_level_CEFR(
+ self.valid_word, self.valid_word_pos, False
+ )
+ valid_avg_word_pos_level = self.analyzer.get_word_pos_level_CEFR(
+ self.valid_word, self.valid_word_unknown_pos, True
+ )
+ none_level = self.analyzer.get_word_pos_level_CEFR(
+ self.valid_word, self.valid_word_unknown_pos, False
+ )
self.assertIsInstance(valid_word_pos_level, CEFRLevel)
self.assertIsInstance(valid_avg_word_pos_level, CEFRLevel)
self.assertIsNone(none_level)
for word in self.not_valid_words:
- self.assertIsNone(self.analyzer.get_word_pos_level_float(word, self.not_valid_words_test_pos_tag, False))
- self.assertIsNone(self.analyzer.get_word_pos_level_float(word, self.not_valid_words_test_pos_tag, True))
+ self.assertIsNone(
+ self.analyzer.get_word_pos_level_float(
+ word, self.not_valid_words_test_pos_tag, False
+ )
+ )
+ self.assertIsNone(
+ self.analyzer.get_word_pos_level_float(
+ word, self.not_valid_words_test_pos_tag, True
+ )
+ )
def test_get_avg_word_level_float(self):
valid_word_level = self.analyzer.get_average_word_level_float(self.valid_word)
@@ -72,11 +105,21 @@ def test_is_word_in_database(self):
self.assertFalse(self.analyzer.is_word_in_database(word))
def test_is_word_pos_in_database(self):
- self.assertTrue(self.analyzer.is_word_pos_id_database(self.valid_word, self.valid_word_pos))
- self.assertFalse(self.analyzer.is_word_pos_id_database(self.valid_word, self.valid_word_unknown_pos))
+ self.assertTrue(
+ self.analyzer.is_word_pos_id_database(self.valid_word, self.valid_word_pos)
+ )
+ self.assertFalse(
+ self.analyzer.is_word_pos_id_database(
+ self.valid_word, self.valid_word_unknown_pos
+ )
+ )
for word in self.not_valid_words:
- self.assertFalse(self.analyzer.is_word_pos_id_database(word, self.not_valid_words_test_pos_tag))
+ self.assertFalse(
+ self.analyzer.is_word_pos_id_database(
+ word, self.not_valid_words_test_pos_tag
+ )
+ )
def test_yields(self):
valid_word_len = len(self.valid_word)
@@ -94,7 +137,9 @@ def test_yields(self):
self.assertEqual(len(valid_words), total_words)
valid_words_iter = reversed(valid_words)
- for word in self.analyzer.yield_words_with_length(valid_word_len, reverse_order=True):
+ for word in self.analyzer.yield_words_with_length(
+ valid_word_len, reverse_order=True
+ ):
self.assertEqual(next(valid_words_iter), word)
with self.assertRaises(StopIteration):
@@ -104,8 +149,14 @@ def test_yields(self):
word = next(valid_words_iter)
word_pos_counter = 0
- for data1, data2 in zip(self.analyzer.yield_word_pos_with_length(valid_word_len, pos_tag_as_string=False),
- self.analyzer.yield_word_pos_level_with_length(valid_word_len, pos_tag_as_string=True)):
+ for data1, data2 in zip(
+ self.analyzer.yield_word_pos_with_length(
+ valid_word_len, pos_tag_as_string=False
+ ),
+ self.analyzer.yield_word_pos_level_with_length(
+ valid_word_len, pos_tag_as_string=True
+ ),
+ ):
word1, pos1 = data1
word2, pos2, level = data2
@@ -130,8 +181,17 @@ def test_yields(self):
word = next(valid_words_iter)
word_pos_counter = 0
- for data1, data2 in zip(self.analyzer.yield_word_pos_with_length(valid_word_len, pos_tag_as_string=True, reverse_order=True),
- self.analyzer.yield_word_pos_level_with_length(valid_word_len, pos_tag_as_string=False, word_level_as_float=True, reverse_order=True)):
+ for data1, data2 in zip(
+ self.analyzer.yield_word_pos_with_length(
+ valid_word_len, pos_tag_as_string=True, reverse_order=True
+ ),
+ self.analyzer.yield_word_pos_level_with_length(
+ valid_word_len,
+ pos_tag_as_string=False,
+ word_level_as_float=True,
+ reverse_order=True,
+ ),
+ ):
word1, pos1 = data1
word2, pos2, level = data2
@@ -159,7 +219,9 @@ def test_yields_alphabetical(self):
word_counter = 0
last_word = ""
- for word in self.analyzer.yield_words(reverse_order=False, word_length_sort=False):
+ for word in self.analyzer.yield_words(
+ reverse_order=False, word_length_sort=False
+ ):
self.assertGreater(word, last_word)
last_word = word
word_counter += 1
@@ -167,7 +229,9 @@ def test_yields_alphabetical(self):
self.assertEqual(word_counter, total_words)
word_counter = 1
- generator = self.analyzer.yield_words(reverse_order=True, word_length_sort=False)
+ generator = self.analyzer.yield_words(
+ reverse_order=True, word_length_sort=False
+ )
last_word = next(generator)
for word in generator:
@@ -185,7 +249,9 @@ def test_yields_word_length_sort(self):
last_len = 0
last_word = ""
- for word in self.analyzer.yield_words(reverse_order=False, word_length_sort=True):
+ for word in self.analyzer.yield_words(
+ reverse_order=False, word_length_sort=True
+ ):
word_len = len(word)
self.assertGreaterEqual(word_len, last_len)
@@ -219,5 +285,5 @@ def test_yields_word_length_sort(self):
self.assertEqual(word_counter, total_words)
-if __name__ == '__main__':
+if __name__ == "__main__":
unittest.main()
diff --git a/tests/test_CEFRDataProcessor.py b/tests/test_CEFRDataProcessor.py
index 49f43f5..9db1f5a 100644
--- a/tests/test_CEFRDataProcessor.py
+++ b/tests/test_CEFRDataProcessor.py
@@ -3,6 +3,7 @@
from math import inf
from cefrpy import CEFRDataProcessor, POSTag
+
class TestCEFRDataProcessor(unittest.TestCase):
@classmethod
def setUpClass(cls):
@@ -10,7 +11,12 @@ def setUpClass(cls):
cls.valid_word_pos_id = int(POSTag.NN)
cls.valid_word_unknown_pos_id = int(POSTag.CD)
cls.not_valid_words_test_pos_tag = int(POSTag.CC)
- cls.not_valid_words = ("", "@test@", "notvalidword", "toolongwordtoolongwordtoolongwordtoolongwordtoolongword")
+ cls.not_valid_words = (
+ "",
+ "@test@",
+ "notvalidword",
+ "toolongwordtoolongwordtoolongwordtoolongwordtoolongword",
+ )
cls.processor = CEFRDataProcessor()
def test_get_wlp_and_max_word_len(self):
@@ -33,7 +39,7 @@ def test_word_len_valid(self):
self.assertTrue(self.processor.is_word_len_valid(max_valid_word_len))
def test_pack_word(self):
- self.assertEqual(CEFRDataProcessor.pack_word("test"), b'test')
+ self.assertEqual(CEFRDataProcessor.pack_word("test"), b"test")
def test_byte_int_level_to_float(self):
self.assertAlmostEqual(CEFRDataProcessor.byte_int_level_to_float(0), 1)
@@ -65,35 +71,71 @@ def test_is_word_in_database(self):
self.assertFalse(self.processor.is_word_in_database(word))
def test_is_word_pos_in_database(self):
- self.assertTrue(self.processor.is_word_pos_id_database(self.valid_word, self.valid_word_pos_id))
- self.assertFalse(self.processor.is_word_pos_id_database(self.valid_word, self.valid_word_unknown_pos_id))
+ self.assertTrue(
+ self.processor.is_word_pos_id_database(
+ self.valid_word, self.valid_word_pos_id
+ )
+ )
+ self.assertFalse(
+ self.processor.is_word_pos_id_database(
+ self.valid_word, self.valid_word_unknown_pos_id
+ )
+ )
for word in self.not_valid_words:
- self.assertFalse(self.processor.is_word_pos_id_database(word, self.not_valid_words_test_pos_tag))
+ self.assertFalse(
+ self.processor.is_word_pos_id_database(
+ word, self.not_valid_words_test_pos_tag
+ )
+ )
def test_get_word_level_for_pos_id(self):
- self.assertIsNotNone(self.processor.get_word_level_for_pos_id(self.valid_word, self.valid_word_pos_id, False))
- self.assertIsNone(self.processor.get_word_level_for_pos_id(self.valid_word, inf, False))
-
- self.assertIsNone(self.processor.get_word_level_for_pos_id(self.valid_word, inf, False))
- self.assertIsNotNone(self.processor.get_word_level_for_pos_id(self.valid_word, inf, True))
+ self.assertIsNotNone(
+ self.processor.get_word_level_for_pos_id(
+ self.valid_word, self.valid_word_pos_id, False
+ )
+ )
+ self.assertIsNone(
+ self.processor.get_word_level_for_pos_id(self.valid_word, inf, False)
+ )
+
+ self.assertIsNone(
+ self.processor.get_word_level_for_pos_id(self.valid_word, inf, False)
+ )
+ self.assertIsNotNone(
+ self.processor.get_word_level_for_pos_id(self.valid_word, inf, True)
+ )
for word in self.not_valid_words:
- self.assertIsNone(self.processor.get_word_level_for_pos_id(word, self.not_valid_words_test_pos_tag, True))
- self.assertIsNone(self.processor.get_word_level_for_pos_id(word, self.not_valid_words_test_pos_tag, False))
+ self.assertIsNone(
+ self.processor.get_word_level_for_pos_id(
+ word, self.not_valid_words_test_pos_tag, True
+ )
+ )
+ self.assertIsNone(
+ self.processor.get_word_level_for_pos_id(
+ word, self.not_valid_words_test_pos_tag, False
+ )
+ )
def test_get_word_count_for_length(self):
self.assertTrue(0 <= self.processor.get_word_count_for_length(1) <= 26)
valid_word_len = len(self.valid_word)
- self.assertTrue(1 <= self.processor.get_word_count_for_length(valid_word_len) <= pow(26, valid_word_len))
+ self.assertTrue(
+ 1
+ <= self.processor.get_word_count_for_length(valid_word_len)
+ <= pow(26, valid_word_len)
+ )
def test_word_pos_count_for_length(self):
self.assertGreaterEqual(self.processor.get_word_pos_count_for_length(1), 0)
valid_word_len = len(self.valid_word)
- self.assertGreater(self.processor.get_word_pos_count_for_length(valid_word_len), 0)
+ self.assertGreater(
+ self.processor.get_word_pos_count_for_length(valid_word_len), 0
+ )
-if __name__ == '__main__':
+if __name__ == "__main__":
unittest.main()
diff --git a/tests/test_CEFRDataValidator.py b/tests/test_CEFRDataValidator.py
index f927c57..515d3f5 100644
--- a/tests/test_CEFRDataValidator.py
+++ b/tests/test_CEFRDataValidator.py
@@ -9,11 +9,7 @@ def setUp(self):
self.valid_wlp_lengths = [2, 3, 100, 254, 255]
self.invalid_wlp_lengths = [-inf, -1, 0, 1, 256, 500, inf]
- self.valid_wlp_arrays = [
- [0, 9],
- [0, 6, 10],
- [3, 6, 6, 6, 12]
- ]
+ self.valid_wlp_arrays = [[0, 9], [0, 6, 10], [3, 6, 6, 6, 12]]
self.invalid_wlp_arrays = [
[],
@@ -21,22 +17,22 @@ def setUp(self):
[0, -1],
[1, 2, 3, 4, 5],
[0, 3, 5, 12],
- [3, 12, 9, 12, 17]
+ [3, 12, 9, 12, 17],
]
self.valid_data = [
- bytearray(b'a\x00\x00d\x03\x05z\x02\x10'),
- bytearray(b'g\x10\x05y\x04\x89kk\x05\x12'),
- bytearray(b'---c\x06\x15qwer\x10\x35----')
+ bytearray(b"a\x00\x00d\x03\x05z\x02\x10"),
+ bytearray(b"g\x10\x05y\x04\x89kk\x05\x12"),
+ bytearray(b"---c\x06\x15qwer\x10\x35----"),
]
self.invalid_data = [
- bytearray(b'something\x00\x02test\x00\x01'),
- bytearray(b'hello'),
- bytearray(b'c\x06qwer\x10\x35'),
- bytearray(b'a\x99\x99d\x03\x05z\x02\x10'),
- bytearray(b'testsomething'),
- bytearray(b'#\x00\x00@\x03\x05#\x02\x10')
+ bytearray(b"something\x00\x02test\x00\x01"),
+ bytearray(b"hello"),
+ bytearray(b"c\x06qwer\x10\x35"),
+ bytearray(b"a\x99\x99d\x03\x05z\x02\x10"),
+ bytearray(b"testsomething"),
+ bytearray(b"#\x00\x00@\x03\x05#\x02\x10"),
]
def test_wlp_length_valid(self):
@@ -67,5 +63,5 @@ def test_cefr_data_invalid(self):
self.assertFalse(CEFRDataValidator.is_data_valid(wlp_array, data))
-if __name__ == '__main__':
+if __name__ == "__main__":
unittest.main()
diff --git a/tests/test_CEFRLevel.py b/tests/test_CEFRLevel.py
index 39dd0dd..0bbef60 100644
--- a/tests/test_CEFRLevel.py
+++ b/tests/test_CEFRLevel.py
@@ -2,9 +2,17 @@
from cefrpy import CEFRLevel
+
class TestCEFRLevel(unittest.TestCase):
def setUp(self):
- self.levels = [CEFRLevel.A1, CEFRLevel.A2, CEFRLevel.B1, CEFRLevel.B2, CEFRLevel.C1, CEFRLevel.C2]
+ self.levels = [
+ CEFRLevel.A1,
+ CEFRLevel.A2,
+ CEFRLevel.B1,
+ CEFRLevel.B2,
+ CEFRLevel.C1,
+ CEFRLevel.C2,
+ ]
def test_equality(self):
for level in self.levels:
@@ -48,5 +56,5 @@ def test_from_string_method(self):
self.assertEqual(level_from_str, level)
-if __name__ == '__main__':
+if __name__ == "__main__":
unittest.main()
diff --git a/tests/test_POSTag.py b/tests/test_POSTag.py
index 31712f5..855ca48 100644
--- a/tests/test_POSTag.py
+++ b/tests/test_POSTag.py
@@ -3,6 +3,7 @@
from math import inf
from cefrpy import POSTag
+
class TestPOSTag(unittest.TestCase):
def setUp(self):
self.total_tags = POSTag.get_total_tags()
@@ -83,5 +84,5 @@ def test_get_all_tags(self):
self.assertIsNotNone(POSTag.__members__.get(pos_tag))
-if __name__ == '__main__':
+if __name__ == "__main__":
unittest.main()