From 3df25298fad0389209ea9bd912314fa490c1c304 Mon Sep 17 00:00:00 2001 From: Bielikov Maksym Date: Tue, 5 May 2026 09:11:25 +0300 Subject: [PATCH 1/2] refactor: lint code, fix docs build warnings, update package config --- LICENSE | 2 +- docs/cefrpy.rst | 1 + docs/conf.py | 29 ++++--- docs/index.rst | 2 - setup.cfg | 9 +- src/cefrpy/CEFRAnalyzer.py | 123 ++++++++++++++++++---------- src/cefrpy/CEFRDataProcessor.py | 140 +++++++++++++++++++++----------- src/cefrpy/CEFRDataReader.py | 21 +++-- src/cefrpy/CEFRDataValidator.py | 2 +- src/cefrpy/CEFRLevel.py | 1 + src/cefrpy/CEFRSpaCyAnalyzer.py | 59 ++++++++++---- src/cefrpy/POSTag.py | 69 +++++++--------- src/cefrpy/__init__.py | 2 +- tests/test_CEFRAnalyzer.py | 114 ++++++++++++++++++++------ tests/test_CEFRDataProcessor.py | 72 ++++++++++++---- tests/test_CEFRDataValidator.py | 28 +++---- tests/test_CEFRLevel.py | 12 ++- tests/test_POSTag.py | 3 +- 18 files changed, 457 insertions(+), 232 deletions(-) diff --git a/LICENSE b/LICENSE index 4fa2b63..12cc758 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2024 Belikov Maxim +Copyright (c) 2024 Bielikov Maksym Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/docs/cefrpy.rst b/docs/cefrpy.rst index 59303a9..15388e0 100644 --- a/docs/cefrpy.rst +++ b/docs/cefrpy.rst @@ -67,3 +67,4 @@ Module contents :members: :undoc-members: :show-inheritance: + :no-index: diff --git a/docs/conf.py b/docs/conf.py index 34dd33a..d2cccc1 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -5,29 +5,36 @@ import os import sys -sys.path.insert(0, os.path.abspath('../src/cefrpy/')) + +sys.path.insert(0, os.path.abspath("../src/cefrpy/")) # -- Project information ----------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information -project = 'cefrpy' -copyright = '2024, Maxim Belikov' 
-author = 'Maxim Belikov' -release = '1.0' +project = "cefrpy" +copyright = "2026, Maksym Bielikov" +author = "Maksym Bielikov" +version = "1.0" +release = "1.0.2" # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration -extensions = ['sphinx.ext.autodoc', 'sphinx_mdinclude', 'sphinx.ext.githubpages'] - -templates_path = ['_templates'] -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] +extensions = ["sphinx.ext.autodoc", "sphinx_mdinclude", "sphinx.ext.githubpages"] +templates_path = ["_templates"] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] # -- Options for HTML output ------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output -html_theme = 'sphinx_rtd_theme' -html_static_path = ['_static'] +html_theme = "sphinx_rtd_theme" +html_context = { + "display_github": True, + "github_user": "Maximax67", + "github_repo": "cefrpy", + "github_version": "main", + "conf_py_path": "/docs/", +} diff --git a/docs/index.rst b/docs/index.rst index 8eeeab8..d45f495 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -18,5 +18,3 @@ Indices and tables * :ref:`genindex` * :ref:`modindex` - -.. mdinclude:: docs.md diff --git a/setup.cfg b/setup.cfg index c587c0e..d8091fd 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,11 +1,14 @@ [metadata] name = cefrpy -version = 1.0.1 -author = Maxim Belikov +version = 1.0.2 +author = Maksym Bielikov author_email = maximax6767@gmail.com description = Python package for analyzing words based on the CEFR level. 
long_description = file: README.md, LICENSE long_description_content_type = text/markdown +project_urls = + Source = https://github.com/Maximax67/cefrpy + Bug Tracker = https://github.com/Maximax67/cefrpy/issues classifiers = Programming Language :: Python :: 3 License :: OSI Approved :: MIT License @@ -19,4 +22,4 @@ python_requires = >=3.6 include_package_data = True [options.packages.find] -where = src \ No newline at end of file +where = src diff --git a/src/cefrpy/CEFRAnalyzer.py b/src/cefrpy/CEFRAnalyzer.py index ab9a0dc..59a1362 100644 --- a/src/cefrpy/CEFRAnalyzer.py +++ b/src/cefrpy/CEFRAnalyzer.py @@ -26,8 +26,12 @@ def __init__(self, data_processor: CEFRDataProcessor = CEFRDataProcessor()) -> N """ self._data_processor = data_processor - - def get_word_pos_level_float(self, word: str, pos_tag: Union[str, POSTag], avg_level_not_found_pos: bool = False) -> Union[float, None]: + def get_word_pos_level_float( + self, + word: str, + pos_tag: Union[str, POSTag], + avg_level_not_found_pos: bool = False, + ) -> Union[float, None]: """ Get the level of a word's part of speech. @@ -46,10 +50,16 @@ def get_word_pos_level_float(self, word: str, pos_tag: Union[str, POSTag], avg_l pos_tag_id = inf - return self._data_processor.get_word_level_for_pos_id(word, pos_tag_id, avg_level_not_found_pos) - + return self._data_processor.get_word_level_for_pos_id( + word, pos_tag_id, avg_level_not_found_pos + ) - def get_word_pos_level_CEFR(self, word: str, pos_tag: Union[str, POSTag], avg_level_not_found_pos: bool = False) -> Union[CEFRLevel, None]: + def get_word_pos_level_CEFR( + self, + word: str, + pos_tag: Union[str, POSTag], + avg_level_not_found_pos: bool = False, + ) -> Union[CEFRLevel, None]: """ Get the CEFR level of a word's part of speech. @@ -61,13 +71,14 @@ def get_word_pos_level_CEFR(self, word: str, pos_tag: Union[str, POSTag], avg_le Returns: Union[CEFRLevel, None]: The level of the word's part of speech, or None if not found. 
""" - float_level = self.get_word_pos_level_float(word, pos_tag, avg_level_not_found_pos) + float_level = self.get_word_pos_level_float( + word, pos_tag, avg_level_not_found_pos + ) if float_level is None: return return CEFRLevel(round(float_level)) - def get_average_word_level_float(self, word: str) -> Union[float, None]: """ Get the average level of the word. @@ -80,7 +91,6 @@ def get_average_word_level_float(self, word: str) -> Union[float, None]: """ return self._data_processor.get_word_level_for_pos_id(word, inf, True) - def get_average_word_level_CEFR(self, word: str) -> Union[CEFRLevel, None]: """ Get the average CEFR level of the word. @@ -97,7 +107,6 @@ def get_average_word_level_CEFR(self, word: str) -> Union[CEFRLevel, None]: return CEFRLevel(round(float_level)) - def get_all_pos_for_word_as_str(self, word: str) -> list[str]: """ Retrieves the names of all part-of-speech tags associated with a given word. @@ -106,7 +115,7 @@ def get_all_pos_for_word_as_str(self, word: str) -> list[str]: word (str): The word to retrieve part-of-speech tags for. Returns: - list[str]: A list of strings representing the names of the part-of-speech tags associated with the word. + list[str]: A list of strings representing the names of the part-of-speech tags associated with the word. If the word is not found in the data, an empty list is returned. """ pos_tags = self._data_processor.get_all_pos_for_word(word) @@ -118,7 +127,6 @@ def get_all_pos_for_word_as_str(self, word: str) -> list[str]: return pos_tags_str_list - def get_all_pos_for_word(self, word: str) -> list[POSTag]: """ Retrieves all part-of-speech tags associated with a given word as POSTag enums. @@ -127,7 +135,7 @@ def get_all_pos_for_word(self, word: str) -> list[POSTag]: word (str): The word to retrieve part-of-speech tags for. Returns: - list[POSTag]: A list of POSTag enums representing the part-of-speech tags associated with the word. 
+ list[POSTag]: A list of POSTag enums representing the part-of-speech tags associated with the word. If the word is not found in the data, an empty list is returned. """ pos_tags = self._data_processor.get_all_pos_for_word(word) @@ -138,9 +146,12 @@ def get_all_pos_for_word(self, word: str) -> list[POSTag]: return pos_tags_list - - def get_pos_level_dict_for_word(self, word: str, pos_tag_as_string: bool = False, - word_level_as_float: bool = False) -> dict[Union[str, POSTag], Union[float, CEFRLevel]]: + def get_pos_level_dict_for_word( + self, + word: str, + pos_tag_as_string: bool = False, + word_level_as_float: bool = False, + ) -> dict[Union[str, POSTag], Union[float, CEFRLevel]]: """ Retrieves a dictionary mapping part-of-speech tags to their associated CEFR levels for a given word. @@ -174,7 +185,6 @@ def get_pos_level_dict_for_word(self, word: str, pos_tag_as_string: bool = False return pos_and_levels_formatted - def get_max_word_len(self) -> int: """ Get the maximum word length available in the data. @@ -184,7 +194,6 @@ def get_max_word_len(self) -> int: """ return self._data_processor.get_max_word_len() - def is_word_in_database(self, word: str) -> bool: """ Check if a word is in the DataReader database. @@ -197,7 +206,6 @@ def is_word_in_database(self, word: str) -> bool: """ return self._data_processor.is_word_in_database(word) - def is_word_pos_id_database(self, word: str, pos_tag: Union[str, POSTag]) -> bool: """ Check if a word pos is in the database. @@ -215,7 +223,6 @@ def is_word_pos_id_database(self, word: str, pos_tag: Union[str, POSTag]) -> boo return self._data_processor.is_word_pos_id_database(word, pos_tag_id) - def yield_words_with_length(self, word_length: int, reverse_order: bool = False): """ Yield words of a specific length from the database. 
@@ -229,7 +236,6 @@ def yield_words_with_length(self, word_length: int, reverse_order: bool = False) """ return self._data_processor.yield_words_with_length(word_length, reverse_order) - def yield_words(self, reverse_order: bool = False, word_length_sort: bool = False): """ Yield all words in the database. @@ -243,8 +249,12 @@ def yield_words(self, reverse_order: bool = False, word_length_sort: bool = Fals """ return self._data_processor.yield_words(reverse_order, word_length_sort) - - def yield_word_pos_with_length(self, word_length: int, reverse_order: bool = False, pos_tag_as_string: bool = False): + def yield_word_pos_with_length( + self, + word_length: int, + reverse_order: bool = False, + pos_tag_as_string: bool = False, + ): """ Yield words of a specific length with their associated part-of-speech tag IDs from the database. @@ -263,11 +273,17 @@ def yield_word_pos_with_length(self, word_length: int, reverse_order: bool = Fal else: pos_converter = lambda x: POSTag(x) - for word, pos_tag_id in self._data_processor.yield_word_pos_id_with_length(word_length, reverse_order): + for word, pos_tag_id in self._data_processor.yield_word_pos_id_with_length( + word_length, reverse_order + ): yield (word, pos_converter(pos_tag_id)) - - def yield_word_pos(self, reverse_order: bool = False, pos_tag_as_string: bool = False, word_length_sort: bool = False): + def yield_word_pos( + self, + reverse_order: bool = False, + pos_tag_as_string: bool = False, + word_length_sort: bool = False, + ): """ Yield all words with their associated part-of-speech tag IDs from the database. 
@@ -286,12 +302,18 @@ def yield_word_pos(self, reverse_order: bool = False, pos_tag_as_string: bool = else: pos_converter = lambda x: POSTag(x) - for word, pos_tag_id in self._data_processor.yield_word_pos_id(reverse_order, word_length_sort): + for word, pos_tag_id in self._data_processor.yield_word_pos_id( + reverse_order, word_length_sort + ): yield (word, pos_converter(pos_tag_id)) - - def yield_word_pos_level_with_length(self, word_length: int, reverse_order: bool = False, - pos_tag_as_string: bool = False, word_level_as_float: bool = False): + def yield_word_pos_level_with_length( + self, + word_length: int, + reverse_order: bool = False, + pos_tag_as_string: bool = False, + word_level_as_float: bool = False, + ): """ Yield words of a specific length, their part-of-speech tags, and their CEFR levels from the database based on the specified criteria. @@ -302,7 +324,7 @@ def yield_word_pos_level_with_length(self, word_length: int, reverse_order: bool word_level_as_float (bool, optional): If True, yield CEFR levels as floats instead of CEFRLevel enums. Defaults to False. Yields: - tuple: A tuple containing the word, its part-of-speech tag, and its CEFR level. If `pos_tag_as_string` is True, the part-of-speech tag is a string, + tuple: A tuple containing the word, its part-of-speech tag, and its CEFR level. If `pos_tag_as_string` is True, the part-of-speech tag is a string, otherwise, it's a POSTag enum. If `word_level_as_float` is True, the level is a float, otherwise, it's a CEFRLevel enum. 
""" if pos_tag_as_string: @@ -311,17 +333,33 @@ def yield_word_pos_level_with_length(self, word_length: int, reverse_order: bool pos_converter = lambda x: POSTag(x) if word_level_as_float: - for word, pos_tag_id, level in self._data_processor.yield_word_pos_level_with_length(word_length, reverse_order): + for ( + word, + pos_tag_id, + level, + ) in self._data_processor.yield_word_pos_level_with_length( + word_length, reverse_order + ): yield (word, pos_converter(pos_tag_id), level) return - for word, pos_tag_id, level in self._data_processor.yield_word_pos_level_with_length(word_length, reverse_order): + for ( + word, + pos_tag_id, + level, + ) in self._data_processor.yield_word_pos_level_with_length( + word_length, reverse_order + ): yield (word, pos_converter(pos_tag_id), CEFRLevel(round(level))) - - def yield_word_pos_level(self, reverse_order: bool = False, pos_tag_as_string: bool = False, - word_level_as_float: bool = False, word_length_sort: bool = False): + def yield_word_pos_level( + self, + reverse_order: bool = False, + pos_tag_as_string: bool = False, + word_level_as_float: bool = False, + word_length_sort: bool = False, + ): """ Yield all words, their part-of-speech tags, and their CEFR levels from the database based on the specified criteria. @@ -332,7 +370,7 @@ def yield_word_pos_level(self, reverse_order: bool = False, pos_tag_as_string: b word_length_sort (bool): If True, yields data sorted by word length. Yields: - tuple: A tuple containing the word, its part-of-speech tag, and its CEFR level. If `pos_tag_as_string` is True, the part-of-speech tag is a string, + tuple: A tuple containing the word, its part-of-speech tag, and its CEFR level. If `pos_tag_as_string` is True, the part-of-speech tag is a string, otherwise, it's a POSTag enum. If `word_level_as_float` is True, the level is a float, otherwise, it's a CEFRLevel enum. 
""" if pos_tag_as_string: @@ -341,15 +379,18 @@ def yield_word_pos_level(self, reverse_order: bool = False, pos_tag_as_string: b pos_converter = lambda x: POSTag(x) if word_level_as_float: - for word, pos_tag_id, level in self._data_processor.yield_word_pos_level(reverse_order, word_length_sort): + for word, pos_tag_id, level in self._data_processor.yield_word_pos_level( + reverse_order, word_length_sort + ): yield (word, pos_converter(pos_tag_id), level) return - for word, pos_tag_id, level in self._data_processor.yield_word_pos_level(reverse_order, word_length_sort): + for word, pos_tag_id, level in self._data_processor.yield_word_pos_level( + reverse_order, word_length_sort + ): yield (word, pos_converter(pos_tag_id), CEFRLevel(round(level))) - def get_word_count_for_length(self, word_length: int) -> int: """ Count the number of words of a specific length in the data. @@ -362,7 +403,6 @@ def get_word_count_for_length(self, word_length: int) -> int: """ return self._data_processor.get_word_count_for_length(word_length) - def get_total_words(self) -> int: """ Get the total count of words in the data. @@ -372,7 +412,6 @@ def get_total_words(self) -> int: """ return self._data_processor.get_total_words() - def get_word_pos_count_for_length(self, word_length: int) -> int: """ Count the number of positions in the data where words of a specific length start. @@ -385,7 +424,6 @@ def get_word_pos_count_for_length(self, word_length: int) -> int: """ return self._data_processor.get_word_pos_count_for_length(word_length) - def get_word_pos_count(self) -> int: """ Get the total count of positions in the data where words start, across all word lengths. 
@@ -395,7 +433,6 @@ def get_word_pos_count(self) -> int: """ return self._data_processor.get_word_pos_count() - @staticmethod def get_pos_tag_id(pos_tag: Union[str, POSTag]) -> Union[int, None]: """ diff --git a/src/cefrpy/CEFRDataProcessor.py b/src/cefrpy/CEFRDataProcessor.py index ce7e529..ab0105f 100644 --- a/src/cefrpy/CEFRDataProcessor.py +++ b/src/cefrpy/CEFRDataProcessor.py @@ -4,9 +4,10 @@ from heapq import heapify, heappush, heappop from .CEFRDataReader import CEFRDataReader +from .CEFRDataValidator import VALID_WORD_CHARACTERS -class HeapqReverseDataWrapper(): +class HeapqReverseDataWrapper: """ Wrapper class to reverse the ordering of data when using heapq. @@ -22,6 +23,7 @@ class HeapqReverseDataWrapper(): Methods: __lt__(self, other): Less-than comparison method used to determine the ordering of the wrapped data. """ + def __init__(self, data) -> None: """ Initialize the HeapqReverseDataWrapper instance. @@ -61,7 +63,6 @@ def __init__(self, data_reader: CEFRDataReader = CEFRDataReader()) -> None: """ self._data_reader = data_reader - def get_max_word_len(self) -> int: """ Get the maximum word length available in the data. @@ -71,7 +72,6 @@ def get_max_word_len(self) -> int: """ return self._data_reader.get_wlp_len() - 1 - def is_word_len_valid(self, word_len: int) -> bool: """ Check if the word length is valid. @@ -84,7 +84,6 @@ def is_word_len_valid(self, word_len: int) -> bool: """ return 0 < word_len < self._data_reader.get_wlp_len() - def _get_first_word_match_pos(self, word_packed: bytes) -> int: """ Get the position of the first occurrence of a word in the data. 
@@ -123,8 +122,9 @@ def _get_first_word_match_pos(self, word_packed: bytes) -> int: return -1 - - def _get_int_word_level_for_pos_id(self, word_packed: bytes, pos_tag_id: int, avg_level_not_found_pos: bool = False) -> Union[int, None]: + def _get_int_word_level_for_pos_id( + self, word_packed: bytes, pos_tag_id: int, avg_level_not_found_pos: bool = False + ) -> Union[int, None]: """ Get the packed level of a word's part of speech. @@ -183,7 +183,6 @@ def _get_int_word_level_for_pos_id(self, word_packed: bytes, pos_tag_id: int, av m = first_match - else: while True: m += data_block_len @@ -224,7 +223,9 @@ def _get_int_word_level_for_pos_id(self, word_packed: bytes, pos_tag_id: int, av i += 1 else: founded_pos += 1 - level_accumulator += self._data_reader.get_data_array_value_at(i + 1) + level_accumulator += self._data_reader.get_data_array_value_at( + i + 1 + ) continue break @@ -232,7 +233,6 @@ def _get_int_word_level_for_pos_id(self, word_packed: bytes, pos_tag_id: int, av if avg_level_not_found_pos: return round(level_accumulator / founded_pos) - def _get_word_data_range(self, word: str) -> Union[range, None]: """ Determines the range of data associated with a given word. @@ -247,6 +247,9 @@ def _get_word_data_range(self, word: str) -> Union[range, None]: if not self.is_word_len_valid(len(word)): return + if not self._is_word_chars_valid(word): + return + word_packed = self.pack_word(word) first_match = self._get_first_word_match_pos(word_packed) if first_match == -1: @@ -300,7 +303,6 @@ def _get_word_data_range(self, word: str) -> Union[range, None]: return range(start_range, end_range, data_block_len) - def get_all_pos_for_word(self, word: str) -> list[int]: """ Retrieves the IDs of all part-of-speech tags associated with a given word. @@ -309,7 +311,7 @@ def get_all_pos_for_word(self, word: str) -> list[int]: word (str): The word to retrieve part-of-speech tags for. 
Returns: - list[int]: A list of IDs representing the part-of-speech tags associated with the word. + list[int]: A list of IDs representing the part-of-speech tags associated with the word. If the word is not found in the data, an empty list is returned. """ data_range = self._get_word_data_range(word) @@ -323,7 +325,6 @@ def get_all_pos_for_word(self, word: str) -> list[int]: return pos_list - def get_pos_level_dict_for_word(self, word: str) -> dict[int, float]: """ Retrieves a dictionary mapping part-of-speech tag IDs to their associated CEFR levels for a given word. @@ -349,8 +350,9 @@ def get_pos_level_dict_for_word(self, word: str) -> dict[int, float]: return result - - def get_word_level_for_pos_id(self, word: str, pos_tag_id: int, avg_level_not_found_pos: bool = False) -> Union[float, None]: + def get_word_level_for_pos_id( + self, word: str, pos_tag_id: int, avg_level_not_found_pos: bool = False + ) -> Union[float, None]: """ Get the level of a word's part of speech. @@ -365,13 +367,17 @@ def get_word_level_for_pos_id(self, word: str, pos_tag_id: int, avg_level_not_fo if not self.is_word_len_valid(len(word)): return + if not self._is_word_chars_valid(word): + return + word_packed = self.pack_word(word) - level = self._get_int_word_level_for_pos_id(word_packed, pos_tag_id, avg_level_not_found_pos) + level = self._get_int_word_level_for_pos_id( + word_packed, pos_tag_id, avg_level_not_found_pos + ) if level is not None: return self.byte_int_level_to_float(level) - def is_word_in_database(self, word: str) -> bool: """ Check if a word is in the database. @@ -385,11 +391,13 @@ def is_word_in_database(self, word: str) -> bool: if not self.is_word_len_valid(len(word)): return False + if not self._is_word_chars_valid(word): + return False + word_packed = self.pack_word(word) return self._get_first_word_match_pos(word_packed) != -1 - def is_word_pos_id_database(self, word: str, pos_tag_id: int) -> bool: """ Check if a word pos is in the database. 
@@ -403,7 +411,6 @@ def is_word_pos_id_database(self, word: str, pos_tag_id: int) -> bool: """ return self.get_word_level_for_pos_id(word, pos_tag_id) is not None - def _unpack_word_in_data_array(self, i: int, word_length: int) -> str: """ Unpack a word in the data array starting from index 'i' with a given length. @@ -425,8 +432,9 @@ def _unpack_word_in_data_array(self, i: int, word_length: int) -> str: return word - - def _get_word_yield_start_block_range(self, word_length: int, reverse_order: bool = False): + def _get_word_yield_start_block_range( + self, word_length: int, reverse_order: bool = False + ): """ Get the range of block indices to start yielding words of a specific length. @@ -444,11 +452,14 @@ def _get_word_yield_start_block_range(self, word_length: int, reverse_order: boo if reverse_order: # This approach should be faster than reversed(range(...)): # https://stackoverflow.com/a/7286465/15070145 - return range(segment_end - data_block_len, segment_start - data_block_len, -data_block_len) + return range( + segment_end - data_block_len, + segment_start - data_block_len, + -data_block_len, + ) return range(segment_start, segment_end, data_block_len) - def yield_words_with_length(self, word_length: int, reverse_order: bool = False): """ Yield words of a specific length from the database. 
@@ -463,7 +474,9 @@ def yield_words_with_length(self, word_length: int, reverse_order: bool = False) if not self.is_word_len_valid(word_length): return - start_block_range = self._get_word_yield_start_block_range(word_length, reverse_order) + start_block_range = self._get_word_yield_start_block_range( + word_length, reverse_order + ) last_word = None for i in start_block_range: @@ -473,8 +486,9 @@ def yield_words_with_length(self, word_length: int, reverse_order: bool = False) yield word last_word = word - - def yield_word_pos_id_with_length(self, word_length: int, reverse_order: bool = False): + def yield_word_pos_id_with_length( + self, word_length: int, reverse_order: bool = False + ): """ Yield words of a specific length with their associated part-of-speech tag IDs from the database. @@ -489,7 +503,9 @@ def yield_word_pos_id_with_length(self, word_length: int, reverse_order: bool = if not self.is_word_len_valid(word_length): return - start_block_range = self._get_word_yield_start_block_range(word_length, reverse_order) + start_block_range = self._get_word_yield_start_block_range( + word_length, reverse_order + ) for i in start_block_range: word = self._unpack_word_in_data_array(i, word_length) @@ -497,8 +513,9 @@ def yield_word_pos_id_with_length(self, word_length: int, reverse_order: bool = yield (word, word_pos) - - def yield_word_pos_level_with_length(self, word_length: int, reverse_order: bool = False): + def yield_word_pos_level_with_length( + self, word_length: int, reverse_order: bool = False + ): """ Yield words of a specific length with their part-of-speech tag IDs and levels from the database. 
@@ -513,7 +530,9 @@ def yield_word_pos_level_with_length(self, word_length: int, reverse_order: bool if not self.is_word_len_valid(word_length): return - start_block_range = self._get_word_yield_start_block_range(word_length, reverse_order) + start_block_range = self._get_word_yield_start_block_range( + word_length, reverse_order + ) for i in start_block_range: word = self._unpack_word_in_data_array(i, word_length) @@ -525,8 +544,12 @@ def yield_word_pos_level_with_length(self, word_length: int, reverse_order: bool yield (word, word_pos, word_level_float) - - def _yield_all_data(self, yield_method_with_word_length: callable, reverse_order: bool, word_lenght_sort: bool): + def _yield_all_data( + self, + yield_method_with_word_length: callable, + reverse_order: bool, + word_lenght_sort: bool, + ): """ Yields data from various generators based on word length. @@ -552,7 +575,10 @@ def _yield_all_data(self, yield_method_with_word_length: callable, reverse_order return - generators = [yield_method_with_word_length(i, reverse_order) for i in range(1, max_word_len + 1)] + generators = [ + yield_method_with_word_length(i, reverse_order) + for i in range(1, max_word_len + 1) + ] words_heap = [] heapify(words_heap) @@ -593,7 +619,6 @@ def _yield_all_data(self, yield_method_with_word_length: callable, reverse_order except StopIteration: pass - def yield_words(self, reverse_order: bool = False, word_lenght_sort: bool = False): """ Yield all words in the database. @@ -605,10 +630,13 @@ def yield_words(self, reverse_order: bool = False, word_lenght_sort: bool = Fals Yields: str: A word from the database. 
""" - return self._yield_all_data(self.yield_words_with_length, reverse_order, word_lenght_sort) - + return self._yield_all_data( + self.yield_words_with_length, reverse_order, word_lenght_sort + ) - def yield_word_pos_id(self, reverse_order: bool = False, word_lenght_sort: bool = False): + def yield_word_pos_id( + self, reverse_order: bool = False, word_lenght_sort: bool = False + ): """ Yield words with their part-of-speech tag IDs from the database. @@ -619,10 +647,13 @@ def yield_word_pos_id(self, reverse_order: bool = False, word_lenght_sort: bool Yields: tuple[str, int]: A tuple containing a word from the database and its associated part-of-speech tag ID. """ - return self._yield_all_data(self.yield_word_pos_id_with_length, reverse_order, word_lenght_sort) + return self._yield_all_data( + self.yield_word_pos_id_with_length, reverse_order, word_lenght_sort + ) - - def yield_word_pos_level(self, reverse_order: bool = False, word_lenght_sort: bool = False): + def yield_word_pos_level( + self, reverse_order: bool = False, word_lenght_sort: bool = False + ): """ Yield words with their part-of-speech tag IDs and levels from the database. @@ -634,8 +665,9 @@ def yield_word_pos_level(self, reverse_order: bool = False, word_lenght_sort: bo tuple[str, int, float]: A tuple containing a word from the database, its associated part-of-speech tag ID, and its level. 
""" - return self._yield_all_data(self.yield_word_pos_level_with_length, reverse_order, word_lenght_sort) - + return self._yield_all_data( + self.yield_word_pos_level_with_length, reverse_order, word_lenght_sort + ) def get_word_count_for_length(self, word_length: int) -> int: """ @@ -664,13 +696,14 @@ def get_word_count_for_length(self, word_length: int) -> int: for k in range(j + 1, word_length): array_pos += 1 - last_word[k] = self._data_reader.get_data_array_value_at(array_pos) + last_word[k] = self._data_reader.get_data_array_value_at( + array_pos + ) break return unique_words_counter - def get_total_words(self) -> int: """ Get the total count of words in the data. @@ -686,7 +719,6 @@ def get_total_words(self) -> int: return counter - def get_word_pos_count_for_length(self, word_length: int) -> int: """ Count the number of positions in the data where words of a specific length start. @@ -706,7 +738,6 @@ def get_word_pos_count_for_length(self, word_length: int) -> int: return (segment_end - segment_start) // data_block_len - def get_word_pos_count(self) -> int: """ Get the total count of positions in the data where words start, across all word lengths. @@ -722,7 +753,6 @@ def get_word_pos_count(self) -> int: return counter - @staticmethod def pack_word(word: str) -> bytes: """ @@ -734,8 +764,24 @@ def pack_word(word: str) -> bytes: Returns: bytes: The packed representation of the word. """ - return struct.pack('B' * len(word), *map(ord, word)) + return struct.pack("B" * len(word), *map(ord, word)) + + @staticmethod + def _is_word_chars_valid(word: str) -> bool: + """ + Check whether every character in the word is a valid lowercase ASCII letter. + + Non-ASCII characters (e.g. 'あ', 'é', '中') would cause struct.pack to raise + an error, so we reject them early and return None/False from callers instead + of crashing. + Args: + word (str): The word to validate. + + Returns: + bool: True if all characters are valid, False otherwise. 
+ """ + return all(c in VALID_WORD_CHARACTERS for c in word) @staticmethod def byte_int_level_to_float(level: int) -> float: diff --git a/src/cefrpy/CEFRDataReader.py b/src/cefrpy/CEFRDataReader.py index 2b24609..6cb0a70 100644 --- a/src/cefrpy/CEFRDataReader.py +++ b/src/cefrpy/CEFRDataReader.py @@ -31,13 +31,16 @@ def __init__(self, data_path: Union[str, None] = None) -> None: Exception: If the CEFR database file content is invalid. """ - self.data_path = os.path.join(os.path.dirname(__file__), 'data.bin') if data_path is None else data_path - self._wlp = array.array('I') + self.data_path = ( + os.path.join(os.path.dirname(__file__), "data.bin") + if data_path is None + else data_path + ) + self._wlp = array.array("I") self._data_array = bytearray() if not self._read_data(): - raise Exception(f'CEFR database file content is invalid: {self.data_path}') - + raise Exception(f"CEFR database file content is invalid: {self.data_path}") def _read_data(self) -> bool: """ @@ -46,18 +49,17 @@ def _read_data(self) -> bool: Returns: bool: True if the data is successfully read and valid, False otherwise. """ - with open(self.data_path, 'rb') as file: - wlp_len = struct.unpack('B', file.read(1))[0] + with open(self.data_path, "rb") as file: + wlp_len = struct.unpack("B", file.read(1))[0] if not is_wlp_length_valid(wlp_len): return False - wlp_data = file.read(wlp_len * struct.calcsize('I')) + wlp_data = file.read(wlp_len * struct.calcsize("I")) self._wlp.frombytes(wlp_data) self._data_array = bytearray(file.read()) return is_data_valid(self._wlp, self._data_array) - def get_wlp_value_at(self, i: int) -> int: """ Get the value at index i in the word length positions array. @@ -76,7 +78,6 @@ def get_wlp_value_at(self, i: int) -> int: raise IndexError("Index out of range for _wlp") - def get_data_array_value_at(self, i: int) -> int: """ Get the value at index i in the data array. 
@@ -95,7 +96,6 @@ def get_data_array_value_at(self, i: int) -> int: raise IndexError("Index out of range for _data_array") - def get_wlp_len(self) -> int: """ Get the length of the word length positions array. @@ -105,7 +105,6 @@ def get_wlp_len(self) -> int: """ return len(self._wlp) - def get_data_array_len(self) -> int: """ Get the length of the data array. diff --git a/src/cefrpy/CEFRDataValidator.py b/src/cefrpy/CEFRDataValidator.py index cc78c14..8b1d531 100644 --- a/src/cefrpy/CEFRDataValidator.py +++ b/src/cefrpy/CEFRDataValidator.py @@ -72,7 +72,7 @@ def validate_data_block(data: bytearray, start_pos: int, block_length: int) -> b """ word_len = block_length - 2 for i in range(start_pos, start_pos + word_len): - if not chr(data[i]) in VALID_WORD_CHARACTERS: + if chr(data[i]) not in VALID_WORD_CHARACTERS: return False if data[i + 1] > MAX_POS_TAG_ID: diff --git a/src/cefrpy/CEFRLevel.py b/src/cefrpy/CEFRLevel.py index 1b6dc0f..cdcce4c 100644 --- a/src/cefrpy/CEFRLevel.py +++ b/src/cefrpy/CEFRLevel.py @@ -1,5 +1,6 @@ from enum import Enum, unique + @unique class CEFRLevel(Enum): """ diff --git a/src/cefrpy/CEFRSpaCyAnalyzer.py b/src/cefrpy/CEFRSpaCyAnalyzer.py index 6f04210..405ec5e 100644 --- a/src/cefrpy/CEFRSpaCyAnalyzer.py +++ b/src/cefrpy/CEFRSpaCyAnalyzer.py @@ -4,7 +4,8 @@ from .CEFRAnalyzer import CEFRAnalyzer -class CEFRSpaCyAnalyzer(): + +class CEFRSpaCyAnalyzer: """ Analyze text for CEFR levels, considering provided entity types to skip and abbreviation mapping. @@ -15,8 +16,12 @@ class CEFRSpaCyAnalyzer(): tokens (list[tuple[str, str, bool, float, int, int]]): List of token tuples containing word, POS tag, skip status, CEFR level, start index, and end index. 
""" - def __init__(self, analyzer: CEFRAnalyzer = CEFRAnalyzer(), entity_types_to_skip: Union[set[str], list[str], None] = None, - abbreviation_mapping: Union[dict[str, str], None] = None) -> None: + def __init__( + self, + analyzer: CEFRAnalyzer = CEFRAnalyzer(), + entity_types_to_skip: Union[set[str], list[str], None] = None, + abbreviation_mapping: Union[dict[str, str], None] = None, + ) -> None: """ Initialize the CEFRSpaCyAnalyzer instance. @@ -26,8 +31,12 @@ def __init__(self, analyzer: CEFRAnalyzer = CEFRAnalyzer(), entity_types_to_skip abbreviation_mapping (Union[dict[str, str], None], optional): A dictionary mapping abbreviations to their full forms. Defaults to None. """ self._analyzer = analyzer - self.entity_types_to_skip = set() if entity_types_to_skip is None else set(entity_types_to_skip) - self.abbreviation_mapping = dict() if abbreviation_mapping is None else abbreviation_mapping + self.entity_types_to_skip = ( + set() if entity_types_to_skip is None else set(entity_types_to_skip) + ) + self.abbreviation_mapping = ( + dict() if abbreviation_mapping is None else abbreviation_mapping + ) def _get_next_entity(self, entities_iter: Iterator): """ @@ -38,7 +47,9 @@ def _get_next_entity(self, entities_iter: Iterator): except StopIteration: return None - def _get_word_pos_tokens_set(self, tokens: list[tuple[str, str, str, bool, int, int]]) -> set[tuple[str, str]]: + def _get_word_pos_tokens_set( + self, tokens: list[tuple[str, str, str, bool, int, int]] + ) -> set[tuple[str, str]]: """ Get unique word and POS tag tuples from tokens. 
@@ -50,7 +61,9 @@ def _get_word_pos_tokens_set(self, tokens: list[tuple[str, str, str, bool, int, """ return {(token[1], token[2]) for token in tokens if not token[3]} - def _fetch_word_pos_level_tokens(self, word_pos_tokens_set: set[tuple[str, str]]) -> dict[tuple[str, str], float]: + def _fetch_word_pos_level_tokens( + self, word_pos_tokens_set: set[tuple[str, str]] + ) -> dict[tuple[str, str], float]: """ Fetch CEFR levels for unique word and POS tag tuples. @@ -62,7 +75,9 @@ def _fetch_word_pos_level_tokens(self, word_pos_tokens_set: set[tuple[str, str]] """ result_dict = dict() for word, pos_tag in word_pos_tokens_set: - level = self._analyzer.get_word_pos_level_float(word, pos_tag, avg_level_not_found_pos=True) + level = self._analyzer.get_word_pos_level_float( + word, pos_tag, avg_level_not_found_pos=True + ) result_dict[(word, pos_tag)] = level if level is not None else 0 return result_dict @@ -95,15 +110,20 @@ def analyze_doc(self, doc) -> list[tuple[str, str, bool, float, int, int]]: while current_entity and token_start > current_entity.end_char: current_entity = self._get_next_entity(entities_iter) - if current_entity and current_entity.label_ in self.entity_types_to_skip \ - and current_entity.start_char <= token_start < current_entity.end_char: + if ( + current_entity + and current_entity.label_ in self.entity_types_to_skip + and current_entity.start_char + <= token_start + < current_entity.end_char + ): to_skip = True word = token.text.strip() word_lower = word.lower() word_pos = token.tag_ - if word_pos == 'POS' and word_lower == "'s": + if word_pos == "POS" and word_lower == "'s": to_skip = True else: abbreviation_form = self.abbreviation_mapping.get(word_lower) @@ -114,18 +134,29 @@ def analyze_doc(self, doc) -> list[tuple[str, str, bool, float, int, int]]: if not to_skip and not word.isalpha(): to_skip = True - nlp_tokens.append((word, word_lower, word_pos, to_skip, token_start, token_end)) + nlp_tokens.append( + (word, word_lower, word_pos, 
to_skip, token_start, token_end) + ) word_pos_set = self._get_word_pos_tokens_set(nlp_tokens) word_pos_unique_level_tokens = self._fetch_word_pos_level_tokens(word_pos_set) self.tokens = [] - for word, word_lower, word_pos, is_skipped, token_start, token_end in nlp_tokens: + for ( + word, + word_lower, + word_pos, + is_skipped, + token_start, + token_end, + ) in nlp_tokens: if is_skipped: level = None else: level = word_pos_unique_level_tokens.get((word_lower, word_pos)) - self.tokens.append((word, word_pos, is_skipped, level, token_start, token_end)) + self.tokens.append( + (word, word_pos, is_skipped, level, token_start, token_end) + ) return self.tokens diff --git a/src/cefrpy/POSTag.py b/src/cefrpy/POSTag.py index 4f6ae9a..8830ed5 100644 --- a/src/cefrpy/POSTag.py +++ b/src/cefrpy/POSTag.py @@ -1,36 +1,37 @@ from enum import Enum, unique POS_TAGS_DESCRIPTIONS = [ - 'Coordinating conjunction', - 'Cardinal number', - 'Determiner', - 'Preposition or subordinating conjunction', - 'Adjective', - 'Adjective, comparative', - 'Adjective, superlative', - 'Modal', - 'Noun, singular or mass', - 'Noun, plural', - 'Proper noun, singular', - 'Proper noun, plural', - 'Personal/Posessive pronoun', - 'Adverb', - 'Adverb, comparative', - 'Adverb, superlative', - 'Particle', - 'To', - 'Interjection', - 'Verb, base form', - 'Verb, past tense', - 'Verb, gerund or present participle', - 'Verb, past participle', - 'Verb, non-3rd person singular present', - 'Verb, 3rd person singular present', - 'Wh-determiner', - 'Wh-pronoun', - 'Wh-adverb' + "Coordinating conjunction", + "Cardinal number", + "Determiner", + "Preposition or subordinating conjunction", + "Adjective", + "Adjective, comparative", + "Adjective, superlative", + "Modal", + "Noun, singular or mass", + "Noun, plural", + "Proper noun, singular", + "Proper noun, plural", + "Personal/Posessive pronoun", + "Adverb", + "Adverb, comparative", + "Adverb, superlative", + "Particle", + "To", + "Interjection", + "Verb, base form", + 
"Verb, past tense", + "Verb, gerund or present participle", + "Verb, past participle", + "Verb, non-3rd person singular present", + "Verb, 3rd person singular present", + "Wh-determiner", + "Wh-pronoun", + "Wh-adverb", ] + @unique class POSTag(Enum): """ @@ -66,21 +67,18 @@ class POSTag(Enum): WP = 26 WRB = 27 - def __str__(self) -> str: """ Returns a string representation of the POS tag. """ return self.name - def __int__(self) -> int: """ Returns an integer representation of the POS tag. """ return self.value - def __eq__(self, other) -> bool: """ Checks if this POS tag is equal to another POS tag. @@ -90,21 +88,18 @@ def __eq__(self, other) -> bool: return NotImplemented - def __hash__(self) -> int: """ Returns the hash value of the POS tag. """ return self.value - def get_description(self) -> str: """ Retrieve the description of a POS tag. """ return POS_TAGS_DESCRIPTIONS[self.value] - @classmethod def from_tag_name(cls, tag_name: str): """ @@ -125,7 +120,6 @@ def from_tag_name(cls, tag_name: str): return tag - @staticmethod def get_id_by_tag_name(tag_name: str) -> int: """ @@ -145,7 +139,6 @@ def get_id_by_tag_name(tag_name: str) -> int: return POSTag[tag_name].value - @staticmethod def get_tag_name_by_id(tag_id: int) -> str: """ @@ -165,7 +158,6 @@ def get_tag_name_by_id(tag_id: int) -> str: raise ValueError(f"Invalid tag id: {tag_id}") - @staticmethod def get_description_by_tag_name(tag_name: str) -> str: """ @@ -184,7 +176,6 @@ def get_description_by_tag_name(tag_name: str) -> str: return POS_TAGS_DESCRIPTIONS[tag_id] - @staticmethod def get_description_by_tag_id(tag_id: int) -> str: """ @@ -204,7 +195,6 @@ def get_description_by_tag_id(tag_id: int) -> str: return POS_TAGS_DESCRIPTIONS[tag_id] - @staticmethod def get_total_tags() -> int: """ @@ -215,7 +205,6 @@ def get_total_tags() -> int: """ return len(POSTag.__members__) - @staticmethod def get_all_tags() -> list[str]: """ diff --git a/src/cefrpy/__init__.py b/src/cefrpy/__init__.py index 
a580585..e92b138 100644 --- a/src/cefrpy/__init__.py +++ b/src/cefrpy/__init__.py @@ -5,5 +5,5 @@ from .CEFRAnalyzer import CEFRAnalyzer from .CEFRSpaCyAnalyzer import CEFRSpaCyAnalyzer -__version__ = "1.0.1" +__version__ = "1.0.2" __all__ = ["POSTag", "CEFRDataReader", "CEFRDataProcessor", "CEFRLevel", "CEFRAnalyzer", "CEFRSpaCyAnalyzer"] diff --git a/tests/test_CEFRAnalyzer.py b/tests/test_CEFRAnalyzer.py index 3c2fbf9..61305c1 100644 --- a/tests/test_CEFRAnalyzer.py +++ b/tests/test_CEFRAnalyzer.py @@ -1,7 +1,7 @@ import unittest from random import randint -from cefrpy import CEFRAnalyzer, CEFRDataReader, POSTag, CEFRLevel +from cefrpy import CEFRAnalyzer, POSTag, CEFRLevel class TestCEFRAnalyzer(unittest.TestCase): @@ -11,7 +11,12 @@ def setUpClass(cls): cls.valid_word_pos = POSTag.NN cls.valid_word_unknown_pos = POSTag.CD cls.not_valid_words_test_pos_tag = POSTag.CC - cls.not_valid_words = ("", "@test@", "notvalidword", "toolongwordtoolongwordtoolongwordtoolongwordtoolongword") + cls.not_valid_words = ( + "", + "@test@", + "notvalidword", + "toolongwordtoolongwordtoolongwordtoolongwordtoolongword", + ) cls.analyzer = CEFRAnalyzer() def test_get_max_word_len(self): @@ -26,30 +31,58 @@ def test_get_pos_tag_id(self): self.assertEqual(CEFRAnalyzer.get_pos_tag_id(tag_str), tag_id) def test_get_word_pos_level_float(self): - valid_word_pos_level = self.analyzer.get_word_pos_level_float(self.valid_word, self.valid_word_pos, False) - valid_avg_word_pos_level = self.analyzer.get_word_pos_level_float(self.valid_word, self.valid_word_unknown_pos, True) - none_level = self.analyzer.get_word_pos_level_float(self.valid_word, self.valid_word_unknown_pos, False) + valid_word_pos_level = self.analyzer.get_word_pos_level_float( + self.valid_word, self.valid_word_pos, False + ) + valid_avg_word_pos_level = self.analyzer.get_word_pos_level_float( + self.valid_word, self.valid_word_unknown_pos, True + ) + none_level = self.analyzer.get_word_pos_level_float( + self.valid_word, 
self.valid_word_unknown_pos, False + ) self.assertIsNotNone(valid_word_pos_level) self.assertIsNotNone(valid_avg_word_pos_level) self.assertIsNone(none_level) for word in self.not_valid_words: - self.assertIsNone(self.analyzer.get_word_pos_level_float(word, self.not_valid_words_test_pos_tag, False)) - self.assertIsNone(self.analyzer.get_word_pos_level_float(word, self.not_valid_words_test_pos_tag, True)) + self.assertIsNone( + self.analyzer.get_word_pos_level_float( + word, self.not_valid_words_test_pos_tag, False + ) + ) + self.assertIsNone( + self.analyzer.get_word_pos_level_float( + word, self.not_valid_words_test_pos_tag, True + ) + ) def test_get_word_pos_level_CEFR(self): - valid_word_pos_level = self.analyzer.get_word_pos_level_CEFR(self.valid_word, self.valid_word_pos, False) - valid_avg_word_pos_level = self.analyzer.get_word_pos_level_CEFR(self.valid_word, self.valid_word_unknown_pos, True) - none_level = self.analyzer.get_word_pos_level_CEFR(self.valid_word, self.valid_word_unknown_pos, False) + valid_word_pos_level = self.analyzer.get_word_pos_level_CEFR( + self.valid_word, self.valid_word_pos, False + ) + valid_avg_word_pos_level = self.analyzer.get_word_pos_level_CEFR( + self.valid_word, self.valid_word_unknown_pos, True + ) + none_level = self.analyzer.get_word_pos_level_CEFR( + self.valid_word, self.valid_word_unknown_pos, False + ) self.assertIsInstance(valid_word_pos_level, CEFRLevel) self.assertIsInstance(valid_avg_word_pos_level, CEFRLevel) self.assertIsNone(none_level) for word in self.not_valid_words: - self.assertIsNone(self.analyzer.get_word_pos_level_float(word, self.not_valid_words_test_pos_tag, False)) - self.assertIsNone(self.analyzer.get_word_pos_level_float(word, self.not_valid_words_test_pos_tag, True)) + self.assertIsNone( + self.analyzer.get_word_pos_level_float( + word, self.not_valid_words_test_pos_tag, False + ) + ) + self.assertIsNone( + self.analyzer.get_word_pos_level_float( + word, self.not_valid_words_test_pos_tag, True + ) 
+ ) def test_get_avg_word_level_float(self): valid_word_level = self.analyzer.get_average_word_level_float(self.valid_word) @@ -72,11 +105,21 @@ def test_is_word_in_database(self): self.assertFalse(self.analyzer.is_word_in_database(word)) def test_is_word_pos_in_database(self): - self.assertTrue(self.analyzer.is_word_pos_id_database(self.valid_word, self.valid_word_pos)) - self.assertFalse(self.analyzer.is_word_pos_id_database(self.valid_word, self.valid_word_unknown_pos)) + self.assertTrue( + self.analyzer.is_word_pos_id_database(self.valid_word, self.valid_word_pos) + ) + self.assertFalse( + self.analyzer.is_word_pos_id_database( + self.valid_word, self.valid_word_unknown_pos + ) + ) for word in self.not_valid_words: - self.assertFalse(self.analyzer.is_word_pos_id_database(word, self.not_valid_words_test_pos_tag)) + self.assertFalse( + self.analyzer.is_word_pos_id_database( + word, self.not_valid_words_test_pos_tag + ) + ) def test_yields(self): valid_word_len = len(self.valid_word) @@ -94,7 +137,9 @@ def test_yields(self): self.assertEqual(len(valid_words), total_words) valid_words_iter = reversed(valid_words) - for word in self.analyzer.yield_words_with_length(valid_word_len, reverse_order=True): + for word in self.analyzer.yield_words_with_length( + valid_word_len, reverse_order=True + ): self.assertEqual(next(valid_words_iter), word) with self.assertRaises(StopIteration): @@ -104,8 +149,14 @@ def test_yields(self): word = next(valid_words_iter) word_pos_counter = 0 - for data1, data2 in zip(self.analyzer.yield_word_pos_with_length(valid_word_len, pos_tag_as_string=False), - self.analyzer.yield_word_pos_level_with_length(valid_word_len, pos_tag_as_string=True)): + for data1, data2 in zip( + self.analyzer.yield_word_pos_with_length( + valid_word_len, pos_tag_as_string=False + ), + self.analyzer.yield_word_pos_level_with_length( + valid_word_len, pos_tag_as_string=True + ), + ): word1, pos1 = data1 word2, pos2, level = data2 @@ -130,8 +181,17 @@ def 
test_yields(self): word = next(valid_words_iter) word_pos_counter = 0 - for data1, data2 in zip(self.analyzer.yield_word_pos_with_length(valid_word_len, pos_tag_as_string=True, reverse_order=True), - self.analyzer.yield_word_pos_level_with_length(valid_word_len, pos_tag_as_string=False, word_level_as_float=True, reverse_order=True)): + for data1, data2 in zip( + self.analyzer.yield_word_pos_with_length( + valid_word_len, pos_tag_as_string=True, reverse_order=True + ), + self.analyzer.yield_word_pos_level_with_length( + valid_word_len, + pos_tag_as_string=False, + word_level_as_float=True, + reverse_order=True, + ), + ): word1, pos1 = data1 word2, pos2, level = data2 @@ -159,7 +219,9 @@ def test_yields_alphabetical(self): word_counter = 0 last_word = "" - for word in self.analyzer.yield_words(reverse_order=False, word_length_sort=False): + for word in self.analyzer.yield_words( + reverse_order=False, word_length_sort=False + ): self.assertGreater(word, last_word) last_word = word word_counter += 1 @@ -167,7 +229,9 @@ def test_yields_alphabetical(self): self.assertEqual(word_counter, total_words) word_counter = 1 - generator = self.analyzer.yield_words(reverse_order=True, word_length_sort=False) + generator = self.analyzer.yield_words( + reverse_order=True, word_length_sort=False + ) last_word = next(generator) for word in generator: @@ -185,7 +249,9 @@ def test_yields_word_length_sort(self): last_len = 0 last_word = "" - for word in self.analyzer.yield_words(reverse_order=False, word_length_sort=True): + for word in self.analyzer.yield_words( + reverse_order=False, word_length_sort=True + ): word_len = len(word) self.assertGreaterEqual(word_len, last_len) @@ -219,5 +285,5 @@ def test_yields_word_length_sort(self): self.assertEqual(word_counter, total_words) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/tests/test_CEFRDataProcessor.py b/tests/test_CEFRDataProcessor.py index 49f43f5..9db1f5a 100644 --- 
a/tests/test_CEFRDataProcessor.py +++ b/tests/test_CEFRDataProcessor.py @@ -3,6 +3,7 @@ from math import inf from cefrpy import CEFRDataProcessor, POSTag + class TestCEFRDataProcessor(unittest.TestCase): @classmethod def setUpClass(cls): @@ -10,7 +11,12 @@ def setUpClass(cls): cls.valid_word_pos_id = int(POSTag.NN) cls.valid_word_unknown_pos_id = int(POSTag.CD) cls.not_valid_words_test_pos_tag = int(POSTag.CC) - cls.not_valid_words = ("", "@test@", "notvalidword", "toolongwordtoolongwordtoolongwordtoolongwordtoolongword") + cls.not_valid_words = ( + "", + "@test@", + "notvalidword", + "toolongwordtoolongwordtoolongwordtoolongwordtoolongword", + ) cls.processor = CEFRDataProcessor() def test_get_wlp_and_max_word_len(self): @@ -33,7 +39,7 @@ def test_word_len_valid(self): self.assertTrue(self.processor.is_word_len_valid(max_valid_word_len)) def test_pack_word(self): - self.assertEqual(CEFRDataProcessor.pack_word("test"), b'test') + self.assertEqual(CEFRDataProcessor.pack_word("test"), b"test") def test_byte_int_level_to_float(self): self.assertAlmostEqual(CEFRDataProcessor.byte_int_level_to_float(0), 1) @@ -65,35 +71,71 @@ def test_is_word_in_database(self): self.assertFalse(self.processor.is_word_in_database(word)) def test_is_word_pos_in_database(self): - self.assertTrue(self.processor.is_word_pos_id_database(self.valid_word, self.valid_word_pos_id)) - self.assertFalse(self.processor.is_word_pos_id_database(self.valid_word, self.valid_word_unknown_pos_id)) + self.assertTrue( + self.processor.is_word_pos_id_database( + self.valid_word, self.valid_word_pos_id + ) + ) + self.assertFalse( + self.processor.is_word_pos_id_database( + self.valid_word, self.valid_word_unknown_pos_id + ) + ) for word in self.not_valid_words: - self.assertFalse(self.processor.is_word_pos_id_database(word, self.not_valid_words_test_pos_tag)) + self.assertFalse( + self.processor.is_word_pos_id_database( + word, self.not_valid_words_test_pos_tag + ) + ) def test_get_word_level_for_pos_id(self): 
- self.assertIsNotNone(self.processor.get_word_level_for_pos_id(self.valid_word, self.valid_word_pos_id, False)) - self.assertIsNone(self.processor.get_word_level_for_pos_id(self.valid_word, inf, False)) - - self.assertIsNone(self.processor.get_word_level_for_pos_id(self.valid_word, inf, False)) - self.assertIsNotNone(self.processor.get_word_level_for_pos_id(self.valid_word, inf, True)) + self.assertIsNotNone( + self.processor.get_word_level_for_pos_id( + self.valid_word, self.valid_word_pos_id, False + ) + ) + self.assertIsNone( + self.processor.get_word_level_for_pos_id(self.valid_word, inf, False) + ) + + self.assertIsNone( + self.processor.get_word_level_for_pos_id(self.valid_word, inf, False) + ) + self.assertIsNotNone( + self.processor.get_word_level_for_pos_id(self.valid_word, inf, True) + ) for word in self.not_valid_words: - self.assertIsNone(self.processor.get_word_level_for_pos_id(word, self.not_valid_words_test_pos_tag, True)) - self.assertIsNone(self.processor.get_word_level_for_pos_id(word, self.not_valid_words_test_pos_tag, False)) + self.assertIsNone( + self.processor.get_word_level_for_pos_id( + word, self.not_valid_words_test_pos_tag, True + ) + ) + self.assertIsNone( + self.processor.get_word_level_for_pos_id( + word, self.not_valid_words_test_pos_tag, False + ) + ) def test_get_word_count_for_length(self): self.assertTrue(0 <= self.processor.get_word_count_for_length(1) <= 26) valid_word_len = len(self.valid_word) - self.assertTrue(1 <= self.processor.get_word_count_for_length(valid_word_len) <= pow(26, valid_word_len)) + self.assertTrue( + 1 + <= self.processor.get_word_count_for_length(valid_word_len) + <= pow(26, valid_word_len) + ) def test_word_pos_count_for_length(self): self.assertGreaterEqual(self.processor.get_word_pos_count_for_length(1), 0) valid_word_len = len(self.valid_word) - self.assertGreater(self.processor.get_word_pos_count_for_length(valid_word_len), 0) + self.assertGreater( + 
self.processor.get_word_pos_count_for_length(valid_word_len), 0 + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/tests/test_CEFRDataValidator.py b/tests/test_CEFRDataValidator.py index f927c57..515d3f5 100644 --- a/tests/test_CEFRDataValidator.py +++ b/tests/test_CEFRDataValidator.py @@ -9,11 +9,7 @@ def setUp(self): self.valid_wlp_lengths = [2, 3, 100, 254, 255] self.invalid_wlp_lengths = [-inf, -1, 0, 1, 256, 500, inf] - self.valid_wlp_arrays = [ - [0, 9], - [0, 6, 10], - [3, 6, 6, 6, 12] - ] + self.valid_wlp_arrays = [[0, 9], [0, 6, 10], [3, 6, 6, 6, 12]] self.invalid_wlp_arrays = [ [], @@ -21,22 +17,22 @@ def setUp(self): [0, -1], [1, 2, 3, 4, 5], [0, 3, 5, 12], - [3, 12, 9, 12, 17] + [3, 12, 9, 12, 17], ] self.valid_data = [ - bytearray(b'a\x00\x00d\x03\x05z\x02\x10'), - bytearray(b'g\x10\x05y\x04\x89kk\x05\x12'), - bytearray(b'---c\x06\x15qwer\x10\x35----') + bytearray(b"a\x00\x00d\x03\x05z\x02\x10"), + bytearray(b"g\x10\x05y\x04\x89kk\x05\x12"), + bytearray(b"---c\x06\x15qwer\x10\x35----"), ] self.invalid_data = [ - bytearray(b'something\x00\x02test\x00\x01'), - bytearray(b'hello'), - bytearray(b'c\x06qwer\x10\x35'), - bytearray(b'a\x99\x99d\x03\x05z\x02\x10'), - bytearray(b'testsomething'), - bytearray(b'#\x00\x00@\x03\x05#\x02\x10') + bytearray(b"something\x00\x02test\x00\x01"), + bytearray(b"hello"), + bytearray(b"c\x06qwer\x10\x35"), + bytearray(b"a\x99\x99d\x03\x05z\x02\x10"), + bytearray(b"testsomething"), + bytearray(b"#\x00\x00@\x03\x05#\x02\x10"), ] def test_wlp_length_valid(self): @@ -67,5 +63,5 @@ def test_cefr_data_invalid(self): self.assertFalse(CEFRDataValidator.is_data_valid(wlp_array, data)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/tests/test_CEFRLevel.py b/tests/test_CEFRLevel.py index 39dd0dd..0bbef60 100644 --- a/tests/test_CEFRLevel.py +++ b/tests/test_CEFRLevel.py @@ -2,9 +2,17 @@ from cefrpy import CEFRLevel + class TestCEFRLevel(unittest.TestCase): def 
setUp(self): - self.levels = [CEFRLevel.A1, CEFRLevel.A2, CEFRLevel.B1, CEFRLevel.B2, CEFRLevel.C1, CEFRLevel.C2] + self.levels = [ + CEFRLevel.A1, + CEFRLevel.A2, + CEFRLevel.B1, + CEFRLevel.B2, + CEFRLevel.C1, + CEFRLevel.C2, + ] def test_equality(self): for level in self.levels: @@ -48,5 +56,5 @@ def test_from_string_method(self): self.assertEqual(level_from_str, level) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/tests/test_POSTag.py b/tests/test_POSTag.py index 31712f5..855ca48 100644 --- a/tests/test_POSTag.py +++ b/tests/test_POSTag.py @@ -3,6 +3,7 @@ from math import inf from cefrpy import POSTag + class TestPOSTag(unittest.TestCase): def setUp(self): self.total_tags = POSTag.get_total_tags() @@ -83,5 +84,5 @@ def test_get_all_tags(self): self.assertIsNotNone(POSTag.__members__.get(pos_tag)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() From 6ae0eb328534d038dce6d6c05748ad2b8bf5fb47 Mon Sep 17 00:00:00 2001 From: Bielikov Maksym Date: Tue, 5 May 2026 09:30:57 +0300 Subject: [PATCH 2/2] refactor: documentation --- README.md | 449 +++++++++++++++++++++++++++++++++--------------- docs/cefrpy.rst | 2 +- docs/docs.md | 319 ++++++++++++++++++---------------- 3 files changed, 482 insertions(+), 288 deletions(-) diff --git a/README.md b/README.md index 029d606..e38ee2a 100644 --- a/README.md +++ b/README.md @@ -10,9 +10,9 @@ The cefrpy python module offers a comprehensive toolkit for analyzing linguistic data based on the Common European Framework of Reference for Languages (CEFR). 
-Documentation: https://maximax67.github.io/cefrpy/ +Documentation: <https://maximax67.github.io/cefrpy/> -HuggingFace demo: https://huggingface.co/spaces/Maximax67/cefrpy-demo +HuggingFace demo: <https://huggingface.co/spaces/Maximax67/cefrpy-demo> ## Installation @@ -48,7 +48,7 @@ else: print(f"CEFR level not found for '{word}' as a {pos_tag}.") ``` -### Getting Average Level of a Word: +### Getting average level of a word ```py from cefrpy import CEFRAnalyzer @@ -73,7 +73,8 @@ else: ### Recommended usage with [spaCy](https://spacy.io) -#### Import spacy and load model: +#### Import spacy and load model + ```py import spacy @@ -148,185 +149,363 @@ for token in tokens: Result (truncated): -``` +```text ------------------------------------------------------- - WORD POS LEVEL CEFR + WORD POS LEVEL CEFR ------------------------------------------------------- - _SP Skip None -In IN 1.00 A1 -the DT 1.00 A1 -heart NN 1.00 A1 -of IN 1.00 A1 -every DT 1.00 A1 -forest NN 2.00 A2 -, , Skip None -a DT 1.00 A1 -hidden JJ 3.00 B1 -world NN 1.00 A1 -thrives VBZ 5.86 C2 -among IN 2.00 A2 -the DT 1.00 A1 -towering VBG 1.00 A1 -trees NNS 1.00 A1 -. . Skip None -Trees NNS 1.00 A1 -, , Skip None - _SP Skip None -those DT 1.00 A1 -silent JJ 3.00 B1 + _SP Skip None +In IN 1.00 A1 +the DT 1.00 A1 +heart NN 1.00 A1 +of IN 1.00 A1 +every DT 1.00 A1 +forest NN 2.00 A2 +, , Skip None +a DT 1.00 A1 +hidden JJ 3.00 B1 +world NN 1.00 A1 +thrives VBZ 5.86 C2 +among IN 2.00 A2 +the DT 1.00 A1 +towering VBG 1.00 A1 +trees NNS 1.00 A1 +. . Skip None +Trees NNS 1.00 A1 +, , Skip None + _SP Skip None +those DT 1.00 A1 +silent JJ 3.00 B1 ``` #### Get more statistical information 1. 
Filter tokens by level: -```py -def filter_for_desired_level(level_tokens: list[tuple[str, str, bool, float, int, int]], - min_level: float | int = 1.0, max_level: float | int = 6.0 - ) -> set[tuple[str, str, bool, float, int, int]]: - filtered_tokens = set() - for token in level_tokens: - level = token[3] + ```py + def filter_for_desired_level(level_tokens: list[tuple[str, str, bool, float, int, int]], + min_level: float | int = 1.0, max_level: float | int = 6.0 + ) -> set[tuple[str, str, bool, float, int, int]]: + filtered_tokens = set() + for token in level_tokens: + level = token[3] + + if level and level >= min_level and level <= max_level: + filtered_tokens.add(token) + + return filtered_tokens + + + # You can also set min/max level as an int or float in range from 1 to 6 + desired_min_level = CEFRLevel.C1 + desired_level_words_set = filter_for_desired_level(tokens, min_level=int(desired_min_level)) + + desired_level_words_list = list(desired_level_words_set) + desired_level_words_list.sort() + + print(f'\tWords with level {desired_min_level} and higher: {len(desired_level_words_list)}') + for word_data in desired_level_words_list: + word, pos, _, level, _, _ = word_data + print(f"{word.ljust(26)} {pos.ljust(6)} {'{:.2f}'.format(level).ljust(6)} {CEFRLevel(round(level))}") + ``` + + ```text + Words with level B2 and higher: 16 + benefactors NNS 6.00 C2 + bristlecone NN 6.00 C2 + evolved VBN 4.00 B2 + fungi NNS 5.20 C1 + living NN 4.00 B2 + longevity NN 5.96 C2 + masters NNS 4.00 B2 + mighty JJ 4.00 B2 + observers NNS 4.00 B2 + pines NNS 4.00 B2 + potential JJ 4.00 B2 + sequoias NNS 6.00 C2 + thrives VBZ 5.86 C2 + underground RB 4.00 B2 + wildfires NNS 6.00 C2 + withstand VB 5.12 C1 + ``` - if level and level >= min_level and level <= max_level: - filtered_tokens.add(token) +2. 
Get CEFR statistic of the text: - return filtered_tokens + ```py + def get_word_level_count_statistic(level_tokens: list[tuple[str, str, bool, float, int, int]]) -> list[int]: + difficulty_levels_count = [0] * 6 + for token in level_tokens: + level = token[3] + if not level: + continue + level_round = round(level) + difficulty_levels_count[level_round - 1] += 1 -# You can also set min/max level as an int or float in range from 1 to 6 -desired_min_level = CEFRLevel.C1 -desired_level_words_set = filter_for_desired_level(tokens, min_level=int(desired_min_level)) + return difficulty_levels_count -desired_level_words_list = list(desired_level_words_set) -desired_level_words_list.sort() + difficulty_levels_count = get_word_level_count_statistic(tokens) + print('CEFR statistic (total words):') + for i in range(1, 7): + print(f'{CEFRLevel(i)}: {difficulty_levels_count[i - 1]}') + ``` -print(f'\tWords with level {desired_min_level} and higher: {len(desired_level_words_list)}') -for word_data in desired_level_words_list: - word, pos, _, level, _, _ = word_data - print(f"{word.ljust(26)} {pos.ljust(6)} {'{:.2f}'.format(level).ljust(6)} {CEFRLevel(round(level))}") -``` + ```text + CEFR statistic (total words): + A1: 136 + A2: 36 + B1: 27 + B2: 11 + C1: 2 + C2: 6 + ``` -``` -Words with level B2 and higher: 16 -benefactors NNS 6.00 C2 -bristlecone NN 6.00 C2 -evolved VBN 4.00 B2 -fungi NNS 5.20 C1 -living NN 4.00 B2 -longevity NN 5.96 C2 -masters NNS 4.00 B2 -mighty JJ 4.00 B2 -observers NNS 4.00 B2 -pines NNS 4.00 B2 -potential JJ 4.00 B2 -sequoias NNS 6.00 C2 -thrives VBZ 5.86 C2 -underground RB 4.00 B2 -wildfires NNS 6.00 C2 -withstand VB 5.12 C1 -``` +3. Get CEFR statistic for unique words in the text: -2. 
Get CEFR statistic of the text: + ```py + def get_word_level_count_statistic_unique(level_tokens: list[tuple[str, str, bool, float, int, int]]) -> list[int]: + processed_word_pos_set = set() + difficulty_levels_count = [0] * 6 + for token in level_tokens: + level = token[3] + if not level: + continue + + to_check_tuple = (token[0], token[1]) + if not to_check_tuple in processed_word_pos_set: + level_round = round(token[3]) + difficulty_levels_count[level_round - 1] += 1 + processed_word_pos_set.add(to_check_tuple) + + return difficulty_levels_count + + + difficulty_levels_count_unique = get_word_level_count_statistic_unique(tokens) + print('CEFR statistic (unique words):') + for i in range(1, 7): + print(f'{CEFRLevel(i)}: {difficulty_levels_count_unique[i - 1]}') + ``` + + ```text + CEFR statistic (unique words): + A1: 77 + A2: 33 + B1: 23 + B2: 11 + C1: 2 + C2: 6 + ``` + +4. Get set of not found CEFR levels for words in text: + + ```py + def get_not_found_words(level_tokens: list[tuple[str, str, bool, float, int, int]]) -> set[str]: + not_found_words = set() + for token in level_tokens: + if token[2]: + continue + + if not token[3]: + not_found_words.add(token[0]) + + return not_found_words + + + not_found_words_set = get_not_found_words(tokens) + not_found_words_list = list(not_found_words_set) + not_found_words_list.sort() + + print('Not found words:', len(not_found_words_list)) + if len(not_found_words_list): + print('\n'.join(not_found_words_list)) + ``` + + ```text + Not found words: 0 + ``` + +## Additional features + +### Get all possible part-of-speech tags for a word ```py -def get_word_level_count_statistic(level_tokens: list[tuple[str, str, bool, float, int, int]]) -> list[int]: - difficulty_levels_count = [0] * 6 - for token in level_tokens: - level = token[3] - if not level: - continue - - level_round = round(level) - difficulty_levels_count[level_round - 1] += 1 - - return difficulty_levels_count - -difficulty_levels_count = 
get_word_level_count_statistic(tokens) -print('CEFR statistic (total words):') -for i in range(1, 7): - print(f'{CEFRLevel(i)}: {difficulty_levels_count[i - 1]}') +from cefrpy import CEFRAnalyzer + +analyzer = CEFRAnalyzer() + +print(analyzer.get_all_pos_for_word("test")) # [<POSTag.JJ: 4>, <POSTag.NN: 8>, <POSTag.VB: 19>] +print(analyzer.get_all_pos_for_word_as_str("test")) # ['JJ', 'NN'] + +# {: , : , : } +print(analyzer.get_pos_level_dict_for_word("test")) + +# {'JJ': 2.5, 'NN': 1.0, 'VB': 4.0} +print(analyzer.get_pos_level_dict_for_word("test", pos_tag_as_string=True, word_level_as_float=True)) ``` +### Checking if a word exists in the database + +```py +from cefrpy import CEFRAnalyzer + +analyzer = CEFRAnalyzer() + +word = "apple" +if analyzer.is_word_in_database(word): + print(f"'{word}' exists in the database.") +else: + print(f"'{word}' does not exist in the database.") ``` -CEFR statistic (total words): -A1: 136 -A2: 36 -B1: 27 -B2: 11 -C1: 2 -C2: 6 + +### Checking if a word with a specific part-of-speech exists in the database + +```py +from cefrpy import CEFRAnalyzer + +analyzer = CEFRAnalyzer() + +word = "run" +pos_tag = "VB" # Verb +if analyzer.is_word_pos_id_database(word, pos_tag): + print(f"'{word}' with part of speech '{pos_tag}' exists in the database.") +else: + print(f"'{word}' with part of speech '{pos_tag}' does not exist in the database.") ``` -3. 
Get CEFR statistic for unique words in the text: +### POSTag usage examples ```py -def get_word_level_count_statistic_unique(level_tokens: list[tuple[str, str, bool, float, int, int]]) -> list[int]: - processed_word_pos_set = set() - difficulty_levels_count = [0] * 6 - for token in level_tokens: - level = token[3] - if not level: - continue - - to_check_tuple = (token[0], token[1]) - if not to_check_tuple in processed_word_pos_set: - level_round = round(token[3]) - difficulty_levels_count[level_round - 1] += 1 - processed_word_pos_set.add(to_check_tuple) +from cefrpy import POSTag - return difficulty_levels_count +# Get list of all part-of-speech tag names +print(POSTag.get_all_tags()) # ['CC', 'CD', 'DT', ...] +# Print total tags +print(POSTag.get_total_tags()) # 28 -difficulty_levels_count_unique = get_word_level_count_statistic_unique(tokens) -print('CEFR statistic (unique words):') -for i in range(1, 7): - print(f'{CEFRLevel(i)}: {difficulty_levels_count_unique[i - 1]}') -``` +# Get description for a tag +print(POSTag.get_description_by_tag_name('NN')) # Noun, singular or mass +tag = POSTag.VB +print(tag) # VB +print(POSTag.get_description(tag)) # Verb, base form +print(int(tag)) # 19 (unique tag id) +print(tag == POSTag.NN) # False ``` -CEFR statistic (unique words): -A1: 77 -A2: 33 -B1: 23 -B2: 11 -C1: 2 -C2: 6 + +### CEFRLevel usage examples + +```py +from cefrpy import CEFRLevel + +level = CEFRLevel.A1 +print(level) # A1 +print(int(level)) # 1 + +level2 = CEFRLevel.C2 +print(level2) # C2 +print(int(level2)) # 6 + +# You can perform any comparisons: +print(level2 > level) # True +print(level2 == level) # False + +print(CEFRLevel.from_str("B1") == CEFRLevel.B1) # True +print(CEFRLevel.from_str("B1") == CEFRLevel(3)) # True ``` -4. 
Get set of not found CEFR levels for words in text: +### Yields CEFRAnalyzer methods + +For every example you should import and initialize `CEFRAnalyzer`: ```py -def get_not_found_words(level_tokens: list[tuple[str, str, bool, float, int, int]]) -> set[str]: - not_found_words = set() - for token in level_tokens: - if token[2]: - continue +from cefrpy import CEFRAnalyzer - if not token[3]: - not_found_words.add(token[0]) +analyzer = CEFRAnalyzer() +``` - return not_found_words +#### Iterating over words with a specific length (alphabetical order) +```py +iteration_limit = 10 +word_list = [] +for word in analyzer.yield_words_with_length(6): + if iteration_limit == 0: + break + word_list.append(word) + iteration_limit -= 1 + +# ['aaberg', 'aachen', 'aahing', 'aargau', 'aarhus', 'abacus', 'abadan', 'abadia', 'abakan', 'abaris'] +print(word_list) +``` -not_found_words_set = get_not_found_words(tokens) -not_found_words_list = list(not_found_words_set) -not_found_words_list.sort() +#### Iterating over words with a specific length (reversed alphabetical order) -print('Not found words:', len(not_found_words_list)) -if len(not_found_words_list): - print('\n'.join(not_found_words_list)) +```py +iteration_limit = 10 +word_list = [] +for word in analyzer.yield_words_with_length(6, reverse_order=True): + if iteration_limit == 0: + break + word_list.append(word) + iteration_limit -= 1 + +# ['zymase', 'zygote', 'zygoma', 'zydeco', 'zwolle', 'zwicky', 'zuzana', 'zusman', 'zurvan', 'zurich'] +print(word_list) ``` +#### Iterating over words in alphabetical order + +```py +iteration_limit = 10 +word_list = [] +for word in analyzer.yield_words(): + if iteration_limit == 0: + break + word_list.append(word) + iteration_limit -= 1 + +# ['a', 'aa', 'aaa', 'aaaa', 'aaas', 'aaberg', 'aachen', 'aae', 'aaee', 'aaf'] +print(word_list) +``` + +#### Iterating over words with their pos in alphabetical order with word length priority ascending + +```py +iteration_limit = 6 +word_pos_list = [] +for 
word, pos_tag in analyzer.yield_word_pos(word_length_sort=True): + if iteration_limit == 0: + break + word_pos_list.append((word, pos_tag)) + iteration_limit -= 1 + +# [('a', ), ('a', ), ('a', ), ('a', ), ('a', ), ('b', )] +print(word_pos_list) ``` -Not found words: 0 + +#### Iterating over words with their pos as str and levels as float in reversed alphabetical order with word length priority descending + +```py +iteration_limit = 3 +word_pos_list = [] +for word, pos_tag, level in analyzer.yield_word_pos_level(word_length_sort=True, reverse_order=True, pos_tag_as_string=True, word_level_as_float=True): + if iteration_limit == 0: + break + word_pos_list.append((word, pos_tag, level)) + iteration_limit -= 1 + +# [('demethylchlortetracycline', 'NN', 6.0), ('electrocardiographically', 'RB', 6.0), ('polytetrafluoroethylene', 'NN', 6.0)] +print(word_pos_list) ``` ## License + This project is licensed under the MIT License - see the LICENSE file for details. ## Acknowledgments + I would like to acknowledge the contributions of the following resources. I used them to create my initial SQLite version [Words-CEFR-Dataset](https://github.com/Maximax67/Words-CEFR-Dataset): + - [Spacy](https://spacy.io/) - [CEFR-J](https://cefr-j.org/) - [LemmInflect](https://github.com/bjascob/LemmInflect) @@ -334,8 +513,8 @@ I would like to acknowledge the contributions of the following resources. 
I used - [List of pos tags form Penn Treebank Project](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html) Also I used these resources to create my [valid English words list](https://github.com/Maximax67/English-Valid-Words): + - [Word list by infochimps (archived)](https://web.archive.org/web/20131118073324/https://www.infochimps.com/datasets/word-list-350000-simple-english-words-excel-readable) - [English words github repo by dwyl](https://github.com/dwyl/english-words) - [NLTK (Natural Language Toolkit)](https://www.nltk.org/) - [WordNet](https://wordnet.princeton.edu/) - diff --git a/docs/cefrpy.rst b/docs/cefrpy.rst index 15388e0..0331a2a 100644 --- a/docs/cefrpy.rst +++ b/docs/cefrpy.rst @@ -67,4 +67,4 @@ Module contents :members: :undoc-members: :show-inheritance: - :no-idex: + :no-index: diff --git a/docs/docs.md b/docs/docs.md index 6695048..c0d2136 100644 --- a/docs/docs.md +++ b/docs/docs.md @@ -1,7 +1,17 @@ # About cefrpy +![PyPI - License](https://img.shields.io/pypi/l/cefrpy) +![PyPI - Version](https://img.shields.io/pypi/v/cefrpy) +![PyPI - Downloads](https://img.shields.io/pypi/dm/cefrpy) +[![Hugging Face Space](https://img.shields.io/badge/Hugging%20Face-Space-4CA6A7?logo=huggingface&style=flat)](https://huggingface.co/spaces/Maximax67/cefrpy-demo) +[![Unit Tests](https://github.com/Maximax67/cefrpy/actions/workflows/unittest.yml/badge.svg)](https://github.com/Maximax67/cefrpy/actions/workflows/unittest.yml) + The cefrpy python module offers a comprehensive toolkit for analyzing linguistic data based on the Common European Framework of Reference for Languages (CEFR). 
+Documentation: <https://maximax67.github.io/cefrpy/> + +HuggingFace demo: <https://huggingface.co/spaces/Maximax67/cefrpy-demo> + # Installation You can install `cefrpy` for Python >= 3.6 via pip: @@ -36,7 +46,7 @@ else: print(f"CEFR level not found for '{word}' as a {pos_tag}.") ``` -## Getting average level of a word: +## Getting average level of a word ```py from cefrpy import CEFRAnalyzer @@ -61,7 +71,8 @@ else: ## Recommended usage with [spaCy](https://spacy.io) -### Import spacy and load model: +### Import spacy and load model + ```py import spacy @@ -136,179 +147,179 @@ for token in tokens: Result (truncated): -``` +```text ------------------------------------------------------- - WORD POS LEVEL CEFR + WORD POS LEVEL CEFR ------------------------------------------------------- - _SP Skip None -In IN 1.00 A1 -the DT 1.00 A1 -heart NN 1.00 A1 -of IN 1.00 A1 -every DT 1.00 A1 -forest NN 2.00 A2 -, , Skip None -a DT 1.00 A1 -hidden JJ 3.00 B1 -world NN 1.00 A1 -thrives VBZ 5.86 C2 -among IN 2.00 A2 -the DT 1.00 A1 -towering VBG 1.00 A1 -trees NNS 1.00 A1 -. . Skip None -Trees NNS 1.00 A1 -, , Skip None - _SP Skip None -those DT 1.00 A1 -silent JJ 3.00 B1 + _SP Skip None +In IN 1.00 A1 +the DT 1.00 A1 +heart NN 1.00 A1 +of IN 1.00 A1 +every DT 1.00 A1 +forest NN 2.00 A2 +, , Skip None +a DT 1.00 A1 +hidden JJ 3.00 B1 +world NN 1.00 A1 +thrives VBZ 5.86 C2 +among IN 2.00 A2 +the DT 1.00 A1 +towering VBG 1.00 A1 +trees NNS 1.00 A1 +. . Skip None +Trees NNS 1.00 A1 +, , Skip None + _SP Skip None +those DT 1.00 A1 +silent JJ 3.00 B1 ``` ### Get more statistical information 1.
Filter tokens by level: -```py -def filter_for_desired_level(level_tokens: list[tuple[str, str, bool, float, int, int]], - min_level: float | int = 1.0, max_level: float | int = 6.0 - ) -> set[tuple[str, str, bool, float, int, int]]: - filtered_tokens = set() - for token in level_tokens: - level = token[3] - - if level and level >= min_level and level <= max_level: - filtered_tokens.add(token) - - return filtered_tokens - - -# You can also set min/max level as an int or float in range from 1 to 6 -desired_min_level = CEFRLevel.C1 -desired_level_words_set = filter_for_desired_level(tokens, min_level=int(desired_min_level)) - -desired_level_words_list = list(desired_level_words_set) -desired_level_words_list.sort() - -print(f'\tWords with level {desired_min_level} and higher: {len(desired_level_words_list)}') -for word_data in desired_level_words_list: - word, pos, _, level, _, _ = word_data - print(f"{word.ljust(26)} {pos.ljust(6)} {'{:.2f}'.format(level).ljust(6)} {CEFRLevel(round(level))}") -``` - -``` -Words with level B2 and higher: 16 -benefactors NNS 6.00 C2 -bristlecone NN 6.00 C2 -evolved VBN 4.00 B2 -fungi NNS 5.20 C1 -living NN 4.00 B2 -longevity NN 5.96 C2 -masters NNS 4.00 B2 -mighty JJ 4.00 B2 -observers NNS 4.00 B2 -pines NNS 4.00 B2 -potential JJ 4.00 B2 -sequoias NNS 6.00 C2 -thrives VBZ 5.86 C2 -underground RB 4.00 B2 -wildfires NNS 6.00 C2 -withstand VB 5.12 C1 -``` + ```py + def filter_for_desired_level(level_tokens: list[tuple[str, str, bool, float, int, int]], + min_level: float | int = 1.0, max_level: float | int = 6.0 + ) -> set[tuple[str, str, bool, float, int, int]]: + filtered_tokens = set() + for token in level_tokens: + level = token[3] + + if level and level >= min_level and level <= max_level: + filtered_tokens.add(token) + + return filtered_tokens + + + # You can also set min/max level as an int or float in range from 1 to 6 + desired_min_level = CEFRLevel.B2 + desired_level_words_set = filter_for_desired_level(tokens,
min_level=int(desired_min_level)) + + desired_level_words_list = list(desired_level_words_set) + desired_level_words_list.sort() + + print(f'\tWords with level {desired_min_level} and higher: {len(desired_level_words_list)}') + for word_data in desired_level_words_list: + word, pos, _, level, _, _ = word_data + print(f"{word.ljust(26)} {pos.ljust(6)} {'{:.2f}'.format(level).ljust(6)} {CEFRLevel(round(level))}") + ``` + + ```text + Words with level B2 and higher: 16 + benefactors NNS 6.00 C2 + bristlecone NN 6.00 C2 + evolved VBN 4.00 B2 + fungi NNS 5.20 C1 + living NN 4.00 B2 + longevity NN 5.96 C2 + masters NNS 4.00 B2 + mighty JJ 4.00 B2 + observers NNS 4.00 B2 + pines NNS 4.00 B2 + potential JJ 4.00 B2 + sequoias NNS 6.00 C2 + thrives VBZ 5.86 C2 + underground RB 4.00 B2 + wildfires NNS 6.00 C2 + withstand VB 5.12 C1 + ``` 2. Get CEFR statistic of the text: -```py -def get_word_level_count_statistic(level_tokens: list[tuple[str, str, bool, float, int, int]]) -> list[int]: - difficulty_levels_count = [0] * 6 - for token in level_tokens: - level = token[3] - if not level: - continue - - level_round = round(level) - difficulty_levels_count[level_round - 1] += 1 - - return difficulty_levels_count - -difficulty_levels_count = get_word_level_count_statistic(tokens) -print('CEFR statistic (total words):') -for i in range(1, 7): - print(f'{CEFRLevel(i)}: {difficulty_levels_count[i - 1]}') -``` + ```py + def get_word_level_count_statistic(level_tokens: list[tuple[str, str, bool, float, int, int]]) -> list[int]: + difficulty_levels_count = [0] * 6 + for token in level_tokens: + level = token[3] + if not level: + continue -``` -CEFR statistic (total words): -A1: 136 -A2: 36 -B1: 27 -B2: 11 -C1: 2 -C2: 6 -``` - -3. 
Get CEFR statistic for unique words in the text: - -```py -def get_word_level_count_statistic_unique(level_tokens: list[tuple[str, str, bool, float, int, int]]) -> list[int]: - processed_word_pos_set = set() - difficulty_levels_count = [0] * 6 - for token in level_tokens: - level = token[3] - if not level: - continue - - to_check_tuple = (token[0], token[1]) - if not to_check_tuple in processed_word_pos_set: - level_round = round(token[3]) + level_round = round(level) difficulty_levels_count[level_round - 1] += 1 - processed_word_pos_set.add(to_check_tuple) - return difficulty_levels_count + return difficulty_levels_count + difficulty_levels_count = get_word_level_count_statistic(tokens) + print('CEFR statistic (total words):') + for i in range(1, 7): + print(f'{CEFRLevel(i)}: {difficulty_levels_count[i - 1]}') + ``` -difficulty_levels_count_unique = get_word_level_count_statistic_unique(tokens) -print('CEFR statistic (unique words):') -for i in range(1, 7): - print(f'{CEFRLevel(i)}: {difficulty_levels_count_unique[i - 1]}') -``` + ```text + CEFR statistic (total words): + A1: 136 + A2: 36 + B1: 27 + B2: 11 + C1: 2 + C2: 6 + ``` -``` -CEFR statistic (unique words): -A1: 77 -A2: 33 -B1: 23 -B2: 11 -C1: 2 -C2: 6 -``` +3. 
Get CEFR statistic for unique words in the text: + + ```py + def get_word_level_count_statistic_unique(level_tokens: list[tuple[str, str, bool, float, int, int]]) -> list[int]: + processed_word_pos_set = set() + difficulty_levels_count = [0] * 6 + for token in level_tokens: + level = token[3] + if not level: + continue + + to_check_tuple = (token[0], token[1]) + if not to_check_tuple in processed_word_pos_set: + level_round = round(token[3]) + difficulty_levels_count[level_round - 1] += 1 + processed_word_pos_set.add(to_check_tuple) + + return difficulty_levels_count + + + difficulty_levels_count_unique = get_word_level_count_statistic_unique(tokens) + print('CEFR statistic (unique words):') + for i in range(1, 7): + print(f'{CEFRLevel(i)}: {difficulty_levels_count_unique[i - 1]}') + ``` + + ```text + CEFR statistic (unique words): + A1: 77 + A2: 33 + B1: 23 + B2: 11 + C1: 2 + C2: 6 + ``` 4. Get set of not found CEFR levels for words in text: -```py -def get_not_found_words(level_tokens: list[tuple[str, str, bool, float, int, int]]) -> set[str]: - not_found_words = set() - for token in level_tokens: - if token[2]: - continue + ```py + def get_not_found_words(level_tokens: list[tuple[str, str, bool, float, int, int]]) -> set[str]: + not_found_words = set() + for token in level_tokens: + if token[2]: + continue - if not token[3]: - not_found_words.add(token[0]) + if not token[3]: + not_found_words.add(token[0]) - return not_found_words + return not_found_words -not_found_words_set = get_not_found_words(tokens) -not_found_words_list = list(not_found_words_set) -not_found_words_list.sort() + not_found_words_set = get_not_found_words(tokens) + not_found_words_list = list(not_found_words_set) + not_found_words_list.sort() -print('Not found words:', len(not_found_words_list)) -if len(not_found_words_list): - print('\n'.join(not_found_words_list)) -``` + print('Not found words:', len(not_found_words_list)) + if len(not_found_words_list): + 
print('\n'.join(not_found_words_list)) + ``` -``` -Not found words: 0 -``` + ```text + Not found words: 0 + ``` # Additional features @@ -329,7 +340,7 @@ print(analyzer.get_pos_level_dict_for_word("test")) print(analyzer.get_pos_level_dict_for_word("test", pos_tag_as_string=True, word_level_as_float=True)) ``` -## Checking if a word exists in the database +### Checking if a word exists in the database ```py from cefrpy import CEFRAnalyzer @@ -411,6 +422,7 @@ analyzer = CEFRAnalyzer() ``` ### Iterating over words with a specific length (alphabetical order) + ```py iteration_limit = 10 word_list = [] @@ -469,7 +481,7 @@ for word, pos_tag in analyzer.yield_word_pos(word_length_sort=True): print(word_pos_list) ``` -### Iterating over words with their pos as str and levels as float in reversed alphabetical order with word length priority descending +### Iterating over words with their pos as str and levels as float in reversed alphabetical order with word length priority descending ```py iteration_limit = 3 @@ -484,12 +496,14 @@ for word, pos_tag, level in analyzer.yield_word_pos_level(word_length_sort=True, print(word_pos_list) ``` - # License + This project is licensed under the MIT License - see the LICENSE file for details. # Acknowledgments + I would like to acknowledge the contributions of the following resources. I used them to create my initial SQLite version [Words-CEFR-Dataset](https://github.com/Maximax67/Words-CEFR-Dataset): + - [Spacy](https://spacy.io/) - [CEFR-J](https://cefr-j.org/) - [LemmInflect](https://github.com/bjascob/LemmInflect) @@ -497,6 +511,7 @@ I would like to acknowledge the contributions of the following resources. 
I used - [List of pos tags form Penn Treebank Project](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html) Also I used these resources to create my [valid English words list](https://github.com/Maximax67/English-Valid-Words): + - [Word list by infochimps (archived)](https://web.archive.org/web/20131118073324/https://www.infochimps.com/datasets/word-list-350000-simple-english-words-excel-readable) - [English words github repo by dwyl](https://github.com/dwyl/english-words) - [NLTK (Natural Language Toolkit)](https://www.nltk.org/)