From 3df25298fad0389209ea9bd912314fa490c1c304 Mon Sep 17 00:00:00 2001
From: Bielikov Maksym <maximax6767@gmail.com>
Date: Tue, 5 May 2026 09:11:25 +0300
Subject: [PATCH 1/2] refactor: lint code, fix docs build warnings, update
 package config

---
 LICENSE                         |   2 +-
 docs/cefrpy.rst                 |   1 +
 docs/conf.py                    |  29 ++++---
 docs/index.rst                  |   2 -
 setup.cfg                       |   9 +-
 src/cefrpy/CEFRAnalyzer.py      | 123 ++++++++++++++++++----------
 src/cefrpy/CEFRDataProcessor.py | 140 +++++++++++++++++++++-----------
 src/cefrpy/CEFRDataReader.py    |  21 +++--
 src/cefrpy/CEFRDataValidator.py |   2 +-
 src/cefrpy/CEFRLevel.py         |   1 +
 src/cefrpy/CEFRSpaCyAnalyzer.py |  59 ++++++++++----
 src/cefrpy/POSTag.py            |  69 +++++++---------
 src/cefrpy/__init__.py          |   2 +-
 tests/test_CEFRAnalyzer.py      | 114 ++++++++++++++++++++------
 tests/test_CEFRDataProcessor.py |  72 ++++++++++++----
 tests/test_CEFRDataValidator.py |  28 +++----
 tests/test_CEFRLevel.py         |  12 ++-
 tests/test_POSTag.py            |   3 +-
 18 files changed, 457 insertions(+), 232 deletions(-)

diff --git a/LICENSE b/LICENSE
index 4fa2b63..12cc758 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2024 Belikov Maxim
+Copyright (c) 2024 Bielikov Maksym
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/docs/cefrpy.rst b/docs/cefrpy.rst
index 59303a9..15388e0 100644
--- a/docs/cefrpy.rst
+++ b/docs/cefrpy.rst
@@ -67,3 +67,4 @@ Module contents
    :members:
    :undoc-members:
    :show-inheritance:
+   :no-index:
diff --git a/docs/conf.py b/docs/conf.py
index 34dd33a..d2cccc1 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -5,29 +5,36 @@
 
 import os
 import sys
-sys.path.insert(0, os.path.abspath('../src/cefrpy/'))
+
+sys.path.insert(0, os.path.abspath("../src/cefrpy/"))
 
 
 # -- Project information -----------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
 
-project = 'cefrpy'
-copyright = '2024, Maxim Belikov'
-author = 'Maxim Belikov'
-release = '1.0'
+project = "cefrpy"
+copyright = "2026, Maksym Bielikov"
+author = "Maksym Bielikov"
+version = "1.0"
+release = "1.0.2"
 
 # -- General configuration ---------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
 
-extensions = ['sphinx.ext.autodoc', 'sphinx_mdinclude', 'sphinx.ext.githubpages']
-
-templates_path = ['_templates']
-exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+extensions = ["sphinx.ext.autodoc", "sphinx_mdinclude", "sphinx.ext.githubpages"]
 
+templates_path = ["_templates"]
+exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
 
 
 # -- Options for HTML output -------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
 
-html_theme = 'sphinx_rtd_theme'
-html_static_path = ['_static']
+html_theme = "sphinx_rtd_theme"
+html_context = {
+    "display_github": True,
+    "github_user": "Maximax67",
+    "github_repo": "cefrpy",
+    "github_version": "main",
+    "conf_py_path": "/docs/",
+}
diff --git a/docs/index.rst b/docs/index.rst
index 8eeeab8..d45f495 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -18,5 +18,3 @@ Indices and tables
 
 * :ref:`genindex`
 * :ref:`modindex`
-
-.. mdinclude:: docs.md
diff --git a/setup.cfg b/setup.cfg
index c587c0e..d8091fd 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,11 +1,14 @@
 [metadata]
 name = cefrpy
-version = 1.0.1
-author = Maxim Belikov
+version = 1.0.2
+author = Maksym Bielikov
 author_email = maximax6767@gmail.com
 description = Python package for analyzing words based on the CEFR level.
 long_description = file: README.md, LICENSE
 long_description_content_type = text/markdown
+project_urls =
+    Source = https://github.com/Maximax67/cefrpy
+    Bug Tracker = https://github.com/Maximax67/cefrpy/issues
 classifiers =
     Programming Language :: Python :: 3
     License :: OSI Approved :: MIT License
@@ -19,4 +22,4 @@ python_requires = >=3.6
 include_package_data = True
 
 [options.packages.find]
-where = src
\ No newline at end of file
+where = src
diff --git a/src/cefrpy/CEFRAnalyzer.py b/src/cefrpy/CEFRAnalyzer.py
index ab9a0dc..59a1362 100644
--- a/src/cefrpy/CEFRAnalyzer.py
+++ b/src/cefrpy/CEFRAnalyzer.py
@@ -26,8 +26,12 @@ def __init__(self, data_processor: CEFRDataProcessor = CEFRDataProcessor()) -> N
         """
         self._data_processor = data_processor
 
-
-    def get_word_pos_level_float(self, word: str, pos_tag: Union[str, POSTag], avg_level_not_found_pos: bool = False) -> Union[float, None]:
+    def get_word_pos_level_float(
+        self,
+        word: str,
+        pos_tag: Union[str, POSTag],
+        avg_level_not_found_pos: bool = False,
+    ) -> Union[float, None]:
         """
         Get the level of a word's part of speech.
 
@@ -46,10 +50,16 @@ def get_word_pos_level_float(self, word: str, pos_tag: Union[str, POSTag], avg_l
 
             pos_tag_id = inf
 
-        return self._data_processor.get_word_level_for_pos_id(word, pos_tag_id, avg_level_not_found_pos)
-
+        return self._data_processor.get_word_level_for_pos_id(
+            word, pos_tag_id, avg_level_not_found_pos
+        )
 
-    def get_word_pos_level_CEFR(self, word: str, pos_tag: Union[str, POSTag], avg_level_not_found_pos: bool = False) -> Union[CEFRLevel, None]:
+    def get_word_pos_level_CEFR(
+        self,
+        word: str,
+        pos_tag: Union[str, POSTag],
+        avg_level_not_found_pos: bool = False,
+    ) -> Union[CEFRLevel, None]:
         """
         Get the CEFR level of a word's part of speech.
 
@@ -61,13 +71,14 @@ def get_word_pos_level_CEFR(self, word: str, pos_tag: Union[str, POSTag], avg_le
         Returns:
             Union[CEFRLevel, None]: The level of the word's part of speech, or None if not found.
         """
-        float_level = self.get_word_pos_level_float(word, pos_tag, avg_level_not_found_pos)
+        float_level = self.get_word_pos_level_float(
+            word, pos_tag, avg_level_not_found_pos
+        )
         if float_level is None:
             return
 
         return CEFRLevel(round(float_level))
 
-
     def get_average_word_level_float(self, word: str) -> Union[float, None]:
         """
         Get the average level of the word.
@@ -80,7 +91,6 @@ def get_average_word_level_float(self, word: str) -> Union[float, None]:
         """
         return self._data_processor.get_word_level_for_pos_id(word, inf, True)
 
-
     def get_average_word_level_CEFR(self, word: str) -> Union[CEFRLevel, None]:
         """
         Get the average CEFR level of the word.
@@ -97,7 +107,6 @@ def get_average_word_level_CEFR(self, word: str) -> Union[CEFRLevel, None]:
 
         return CEFRLevel(round(float_level))
 
-
     def get_all_pos_for_word_as_str(self, word: str) -> list[str]:
         """
         Retrieves the names of all part-of-speech tags associated with a given word.
@@ -106,7 +115,7 @@ def get_all_pos_for_word_as_str(self, word: str) -> list[str]:
             word (str): The word to retrieve part-of-speech tags for.
 
         Returns:
-            list[str]: A list of strings representing the names of the part-of-speech tags associated with the word. 
+            list[str]: A list of strings representing the names of the part-of-speech tags associated with the word.
                 If the word is not found in the data, an empty list is returned.
         """
         pos_tags = self._data_processor.get_all_pos_for_word(word)
@@ -118,7 +127,6 @@ def get_all_pos_for_word_as_str(self, word: str) -> list[str]:
 
         return pos_tags_str_list
 
-
     def get_all_pos_for_word(self, word: str) -> list[POSTag]:
         """
         Retrieves all part-of-speech tags associated with a given word as POSTag enums.
@@ -127,7 +135,7 @@ def get_all_pos_for_word(self, word: str) -> list[POSTag]:
             word (str): The word to retrieve part-of-speech tags for.
 
         Returns:
-            list[POSTag]: A list of POSTag enums representing the part-of-speech tags associated with the word. 
+            list[POSTag]: A list of POSTag enums representing the part-of-speech tags associated with the word.
                 If the word is not found in the data, an empty list is returned.
         """
         pos_tags = self._data_processor.get_all_pos_for_word(word)
@@ -138,9 +146,12 @@ def get_all_pos_for_word(self, word: str) -> list[POSTag]:
 
         return pos_tags_list
 
-
-    def get_pos_level_dict_for_word(self, word: str, pos_tag_as_string: bool = False, 
-                                    word_level_as_float: bool = False) -> dict[Union[str, POSTag], Union[float, CEFRLevel]]:
+    def get_pos_level_dict_for_word(
+        self,
+        word: str,
+        pos_tag_as_string: bool = False,
+        word_level_as_float: bool = False,
+    ) -> dict[Union[str, POSTag], Union[float, CEFRLevel]]:
         """
         Retrieves a dictionary mapping part-of-speech tags to their associated CEFR levels for a given word.
 
@@ -174,7 +185,6 @@ def get_pos_level_dict_for_word(self, word: str, pos_tag_as_string: bool = False
 
         return pos_and_levels_formatted
 
-
     def get_max_word_len(self) -> int:
         """
         Get the maximum word length available in the data.
@@ -184,7 +194,6 @@ def get_max_word_len(self) -> int:
         """
         return self._data_processor.get_max_word_len()
 
-
     def is_word_in_database(self, word: str) -> bool:
         """
         Check if a word is in the DataReader database.
@@ -197,7 +206,6 @@ def is_word_in_database(self, word: str) -> bool:
         """
         return self._data_processor.is_word_in_database(word)
 
-
     def is_word_pos_id_database(self, word: str, pos_tag: Union[str, POSTag]) -> bool:
         """
         Check if a word pos is in the database.
@@ -215,7 +223,6 @@ def is_word_pos_id_database(self, word: str, pos_tag: Union[str, POSTag]) -> boo
 
         return self._data_processor.is_word_pos_id_database(word, pos_tag_id)
 
-
     def yield_words_with_length(self, word_length: int, reverse_order: bool = False):
         """
         Yield words of a specific length from the database.
@@ -229,7 +236,6 @@ def yield_words_with_length(self, word_length: int, reverse_order: bool = False)
         """
         return self._data_processor.yield_words_with_length(word_length, reverse_order)
 
-
     def yield_words(self, reverse_order: bool = False, word_length_sort: bool = False):
         """
         Yield all words in the database.
@@ -243,8 +249,12 @@ def yield_words(self, reverse_order: bool = False, word_length_sort: bool = Fals
         """
         return self._data_processor.yield_words(reverse_order, word_length_sort)
 
-
-    def yield_word_pos_with_length(self, word_length: int, reverse_order: bool = False, pos_tag_as_string: bool = False):
+    def yield_word_pos_with_length(
+        self,
+        word_length: int,
+        reverse_order: bool = False,
+        pos_tag_as_string: bool = False,
+    ):
         """
         Yield words of a specific length with their associated part-of-speech tag IDs from the database.
 
@@ -263,11 +273,17 @@ def yield_word_pos_with_length(self, word_length: int, reverse_order: bool = Fal
         else:
             pos_converter = lambda x: POSTag(x)
 
-        for word, pos_tag_id in self._data_processor.yield_word_pos_id_with_length(word_length, reverse_order):
+        for word, pos_tag_id in self._data_processor.yield_word_pos_id_with_length(
+            word_length, reverse_order
+        ):
             yield (word, pos_converter(pos_tag_id))
 
-
-    def yield_word_pos(self, reverse_order: bool = False, pos_tag_as_string: bool = False, word_length_sort: bool = False):
+    def yield_word_pos(
+        self,
+        reverse_order: bool = False,
+        pos_tag_as_string: bool = False,
+        word_length_sort: bool = False,
+    ):
         """
         Yield all words with their associated part-of-speech tag IDs from the database.
 
@@ -286,12 +302,18 @@ def yield_word_pos(self, reverse_order: bool = False, pos_tag_as_string: bool =
         else:
             pos_converter = lambda x: POSTag(x)
 
-        for word, pos_tag_id in self._data_processor.yield_word_pos_id(reverse_order, word_length_sort):
+        for word, pos_tag_id in self._data_processor.yield_word_pos_id(
+            reverse_order, word_length_sort
+        ):
             yield (word, pos_converter(pos_tag_id))
 
-
-    def yield_word_pos_level_with_length(self, word_length: int, reverse_order: bool = False, 
-                                        pos_tag_as_string: bool = False, word_level_as_float: bool = False):
+    def yield_word_pos_level_with_length(
+        self,
+        word_length: int,
+        reverse_order: bool = False,
+        pos_tag_as_string: bool = False,
+        word_level_as_float: bool = False,
+    ):
         """
         Yield words of a specific length, their part-of-speech tags, and their CEFR levels from the database based on the specified criteria.
 
@@ -302,7 +324,7 @@ def yield_word_pos_level_with_length(self, word_length: int, reverse_order: bool
             word_level_as_float (bool, optional): If True, yield CEFR levels as floats instead of CEFRLevel enums. Defaults to False.
 
         Yields:
-            tuple: A tuple containing the word, its part-of-speech tag, and its CEFR level. If `pos_tag_as_string` is True, the part-of-speech tag is a string, 
+            tuple: A tuple containing the word, its part-of-speech tag, and its CEFR level. If `pos_tag_as_string` is True, the part-of-speech tag is a string,
                 otherwise, it's a POSTag enum. If `word_level_as_float` is True, the level is a float, otherwise, it's a CEFRLevel enum.
         """
         if pos_tag_as_string:
@@ -311,17 +333,33 @@ def yield_word_pos_level_with_length(self, word_length: int, reverse_order: bool
             pos_converter = lambda x: POSTag(x)
 
         if word_level_as_float:
-            for word, pos_tag_id, level in self._data_processor.yield_word_pos_level_with_length(word_length, reverse_order):
+            for (
+                word,
+                pos_tag_id,
+                level,
+            ) in self._data_processor.yield_word_pos_level_with_length(
+                word_length, reverse_order
+            ):
                 yield (word, pos_converter(pos_tag_id), level)
 
             return
 
-        for word, pos_tag_id, level in self._data_processor.yield_word_pos_level_with_length(word_length, reverse_order):
+        for (
+            word,
+            pos_tag_id,
+            level,
+        ) in self._data_processor.yield_word_pos_level_with_length(
+            word_length, reverse_order
+        ):
             yield (word, pos_converter(pos_tag_id), CEFRLevel(round(level)))
 
-
-    def yield_word_pos_level(self, reverse_order: bool = False, pos_tag_as_string: bool = False,
-                            word_level_as_float: bool = False, word_length_sort: bool = False):
+    def yield_word_pos_level(
+        self,
+        reverse_order: bool = False,
+        pos_tag_as_string: bool = False,
+        word_level_as_float: bool = False,
+        word_length_sort: bool = False,
+    ):
         """
         Yield all words, their part-of-speech tags, and their CEFR levels from the database based on the specified criteria.
 
@@ -332,7 +370,7 @@ def yield_word_pos_level(self, reverse_order: bool = False, pos_tag_as_string: b
             word_length_sort (bool): If True, yields data sorted by word length.
 
         Yields:
-            tuple: A tuple containing the word, its part-of-speech tag, and its CEFR level. If `pos_tag_as_string` is True, the part-of-speech tag is a string, 
+            tuple: A tuple containing the word, its part-of-speech tag, and its CEFR level. If `pos_tag_as_string` is True, the part-of-speech tag is a string,
                 otherwise, it's a POSTag enum. If `word_level_as_float` is True, the level is a float, otherwise, it's a CEFRLevel enum.
         """
         if pos_tag_as_string:
@@ -341,15 +379,18 @@ def yield_word_pos_level(self, reverse_order: bool = False, pos_tag_as_string: b
             pos_converter = lambda x: POSTag(x)
 
         if word_level_as_float:
-            for word, pos_tag_id, level in self._data_processor.yield_word_pos_level(reverse_order, word_length_sort):
+            for word, pos_tag_id, level in self._data_processor.yield_word_pos_level(
+                reverse_order, word_length_sort
+            ):
                 yield (word, pos_converter(pos_tag_id), level)
 
             return
 
-        for word, pos_tag_id, level in self._data_processor.yield_word_pos_level(reverse_order, word_length_sort):
+        for word, pos_tag_id, level in self._data_processor.yield_word_pos_level(
+            reverse_order, word_length_sort
+        ):
             yield (word, pos_converter(pos_tag_id), CEFRLevel(round(level)))
 
-
     def get_word_count_for_length(self, word_length: int) -> int:
         """
         Count the number of words of a specific length in the data.
@@ -362,7 +403,6 @@ def get_word_count_for_length(self, word_length: int) -> int:
         """
         return self._data_processor.get_word_count_for_length(word_length)
 
-
     def get_total_words(self) -> int:
         """
         Get the total count of words in the data.
@@ -372,7 +412,6 @@ def get_total_words(self) -> int:
         """
         return self._data_processor.get_total_words()
 
-
     def get_word_pos_count_for_length(self, word_length: int) -> int:
         """
         Count the number of positions in the data where words of a specific length start.
@@ -385,7 +424,6 @@ def get_word_pos_count_for_length(self, word_length: int) -> int:
         """
         return self._data_processor.get_word_pos_count_for_length(word_length)
 
-
     def get_word_pos_count(self) -> int:
         """
         Get the total count of positions in the data where words start, across all word lengths.
@@ -395,7 +433,6 @@ def get_word_pos_count(self) -> int:
         """
         return self._data_processor.get_word_pos_count()
 
-
     @staticmethod
     def get_pos_tag_id(pos_tag: Union[str, POSTag]) -> Union[int, None]:
         """
diff --git a/src/cefrpy/CEFRDataProcessor.py b/src/cefrpy/CEFRDataProcessor.py
index ce7e529..ab0105f 100644
--- a/src/cefrpy/CEFRDataProcessor.py
+++ b/src/cefrpy/CEFRDataProcessor.py
@@ -4,9 +4,10 @@
 from heapq import heapify, heappush, heappop
 
 from .CEFRDataReader import CEFRDataReader
+from .CEFRDataValidator import VALID_WORD_CHARACTERS
 
 
-class HeapqReverseDataWrapper():
+class HeapqReverseDataWrapper:
     """
     Wrapper class to reverse the ordering of data when using heapq.
 
@@ -22,6 +23,7 @@ class HeapqReverseDataWrapper():
     Methods:
         __lt__(self, other): Less-than comparison method used to determine the ordering of the wrapped data.
     """
+
     def __init__(self, data) -> None:
         """
         Initialize the HeapqReverseDataWrapper instance.
@@ -61,7 +63,6 @@ def __init__(self, data_reader: CEFRDataReader = CEFRDataReader()) -> None:
         """
         self._data_reader = data_reader
 
-
     def get_max_word_len(self) -> int:
         """
         Get the maximum word length available in the data.
@@ -71,7 +72,6 @@ def get_max_word_len(self) -> int:
         """
         return self._data_reader.get_wlp_len() - 1
 
-
     def is_word_len_valid(self, word_len: int) -> bool:
         """
         Check if the word length is valid.
@@ -84,7 +84,6 @@ def is_word_len_valid(self, word_len: int) -> bool:
         """
         return 0 < word_len < self._data_reader.get_wlp_len()
 
-
     def _get_first_word_match_pos(self, word_packed: bytes) -> int:
         """
         Get the position of the first occurrence of a word in the data.
@@ -123,8 +122,9 @@ def _get_first_word_match_pos(self, word_packed: bytes) -> int:
 
         return -1
 
-
-    def _get_int_word_level_for_pos_id(self, word_packed: bytes, pos_tag_id: int, avg_level_not_found_pos: bool = False) -> Union[int, None]:
+    def _get_int_word_level_for_pos_id(
+        self, word_packed: bytes, pos_tag_id: int, avg_level_not_found_pos: bool = False
+    ) -> Union[int, None]:
         """
         Get the packed level of a word's part of speech.
 
@@ -183,7 +183,6 @@ def _get_int_word_level_for_pos_id(self, word_packed: bytes, pos_tag_id: int, av
 
             m = first_match
 
-            
         else:
             while True:
                 m += data_block_len
@@ -224,7 +223,9 @@ def _get_int_word_level_for_pos_id(self, word_packed: bytes, pos_tag_id: int, av
                     i += 1
                 else:
                     founded_pos += 1
-                    level_accumulator += self._data_reader.get_data_array_value_at(i + 1)
+                    level_accumulator += self._data_reader.get_data_array_value_at(
+                        i + 1
+                    )
                     continue
 
                 break
@@ -232,7 +233,6 @@ def _get_int_word_level_for_pos_id(self, word_packed: bytes, pos_tag_id: int, av
         if avg_level_not_found_pos:
             return round(level_accumulator / founded_pos)
 
-
     def _get_word_data_range(self, word: str) -> Union[range, None]:
         """
         Determines the range of data associated with a given word.
@@ -247,6 +247,9 @@ def _get_word_data_range(self, word: str) -> Union[range, None]:
         if not self.is_word_len_valid(len(word)):
             return
 
+        if not self._is_word_chars_valid(word):
+            return
+
         word_packed = self.pack_word(word)
         first_match = self._get_first_word_match_pos(word_packed)
         if first_match == -1:
@@ -300,7 +303,6 @@ def _get_word_data_range(self, word: str) -> Union[range, None]:
 
         return range(start_range, end_range, data_block_len)
 
-
     def get_all_pos_for_word(self, word: str) -> list[int]:
         """
         Retrieves the IDs of all part-of-speech tags associated with a given word.
@@ -309,7 +311,7 @@ def get_all_pos_for_word(self, word: str) -> list[int]:
             word (str): The word to retrieve part-of-speech tags for.
 
         Returns:
-            list[int]: A list of IDs representing the part-of-speech tags associated with the word. 
+            list[int]: A list of IDs representing the part-of-speech tags associated with the word.
                 If the word is not found in the data, an empty list is returned.
         """
         data_range = self._get_word_data_range(word)
@@ -323,7 +325,6 @@ def get_all_pos_for_word(self, word: str) -> list[int]:
 
         return pos_list
 
-
     def get_pos_level_dict_for_word(self, word: str) -> dict[int, float]:
         """
         Retrieves a dictionary mapping part-of-speech tag IDs to their associated CEFR levels for a given word.
@@ -349,8 +350,9 @@ def get_pos_level_dict_for_word(self, word: str) -> dict[int, float]:
 
         return result
 
-
-    def get_word_level_for_pos_id(self, word: str, pos_tag_id: int, avg_level_not_found_pos: bool = False) -> Union[float, None]:
+    def get_word_level_for_pos_id(
+        self, word: str, pos_tag_id: int, avg_level_not_found_pos: bool = False
+    ) -> Union[float, None]:
         """
         Get the level of a word's part of speech.
 
@@ -365,13 +367,17 @@ def get_word_level_for_pos_id(self, word: str, pos_tag_id: int, avg_level_not_fo
         if not self.is_word_len_valid(len(word)):
             return
 
+        if not self._is_word_chars_valid(word):
+            return
+
         word_packed = self.pack_word(word)
-        level = self._get_int_word_level_for_pos_id(word_packed, pos_tag_id, avg_level_not_found_pos)
+        level = self._get_int_word_level_for_pos_id(
+            word_packed, pos_tag_id, avg_level_not_found_pos
+        )
 
         if level is not None:
             return self.byte_int_level_to_float(level)
 
-
     def is_word_in_database(self, word: str) -> bool:
         """
         Check if a word is in the database.
@@ -385,11 +391,13 @@ def is_word_in_database(self, word: str) -> bool:
         if not self.is_word_len_valid(len(word)):
             return False
 
+        if not self._is_word_chars_valid(word):
+            return False
+
         word_packed = self.pack_word(word)
 
         return self._get_first_word_match_pos(word_packed) != -1
 
-
     def is_word_pos_id_database(self, word: str, pos_tag_id: int) -> bool:
         """
         Check if a word pos is in the database.
@@ -403,7 +411,6 @@ def is_word_pos_id_database(self, word: str, pos_tag_id: int) -> bool:
         """
         return self.get_word_level_for_pos_id(word, pos_tag_id) is not None
 
-
     def _unpack_word_in_data_array(self, i: int, word_length: int) -> str:
         """
         Unpack a word in the data array starting from index 'i' with a given length.
@@ -425,8 +432,9 @@ def _unpack_word_in_data_array(self, i: int, word_length: int) -> str:
 
         return word
 
-
-    def _get_word_yield_start_block_range(self, word_length: int, reverse_order: bool = False):
+    def _get_word_yield_start_block_range(
+        self, word_length: int, reverse_order: bool = False
+    ):
         """
         Get the range of block indices to start yielding words of a specific length.
 
@@ -444,11 +452,14 @@ def _get_word_yield_start_block_range(self, word_length: int, reverse_order: boo
         if reverse_order:
             # This approach should be faster than reversed(range(...)):
             # https://stackoverflow.com/a/7286465/15070145
-            return range(segment_end - data_block_len, segment_start - data_block_len, -data_block_len)
+            return range(
+                segment_end - data_block_len,
+                segment_start - data_block_len,
+                -data_block_len,
+            )
 
         return range(segment_start, segment_end, data_block_len)
 
-
     def yield_words_with_length(self, word_length: int, reverse_order: bool = False):
         """
         Yield words of a specific length from the database.
@@ -463,7 +474,9 @@ def yield_words_with_length(self, word_length: int, reverse_order: bool = False)
         if not self.is_word_len_valid(word_length):
             return
 
-        start_block_range = self._get_word_yield_start_block_range(word_length, reverse_order)
+        start_block_range = self._get_word_yield_start_block_range(
+            word_length, reverse_order
+        )
 
         last_word = None
         for i in start_block_range:
@@ -473,8 +486,9 @@ def yield_words_with_length(self, word_length: int, reverse_order: bool = False)
                 yield word
                 last_word = word
 
-
-    def yield_word_pos_id_with_length(self, word_length: int, reverse_order: bool = False):
+    def yield_word_pos_id_with_length(
+        self, word_length: int, reverse_order: bool = False
+    ):
         """
         Yield words of a specific length with their associated part-of-speech tag IDs from the database.
 
@@ -489,7 +503,9 @@ def yield_word_pos_id_with_length(self, word_length: int, reverse_order: bool =
         if not self.is_word_len_valid(word_length):
             return
 
-        start_block_range = self._get_word_yield_start_block_range(word_length, reverse_order)
+        start_block_range = self._get_word_yield_start_block_range(
+            word_length, reverse_order
+        )
 
         for i in start_block_range:
             word = self._unpack_word_in_data_array(i, word_length)
@@ -497,8 +513,9 @@ def yield_word_pos_id_with_length(self, word_length: int, reverse_order: bool =
 
             yield (word, word_pos)
 
-
-    def yield_word_pos_level_with_length(self, word_length: int, reverse_order: bool = False):
+    def yield_word_pos_level_with_length(
+        self, word_length: int, reverse_order: bool = False
+    ):
         """
         Yield words of a specific length with their part-of-speech tag IDs and levels from the database.
 
@@ -513,7 +530,9 @@ def yield_word_pos_level_with_length(self, word_length: int, reverse_order: bool
         if not self.is_word_len_valid(word_length):
             return
 
-        start_block_range = self._get_word_yield_start_block_range(word_length, reverse_order)
+        start_block_range = self._get_word_yield_start_block_range(
+            word_length, reverse_order
+        )
 
         for i in start_block_range:
             word = self._unpack_word_in_data_array(i, word_length)
@@ -525,8 +544,12 @@ def yield_word_pos_level_with_length(self, word_length: int, reverse_order: bool
 
             yield (word, word_pos, word_level_float)
 
-
-    def _yield_all_data(self, yield_method_with_word_length: callable, reverse_order: bool, word_lenght_sort: bool):
+    def _yield_all_data(
+        self,
+        yield_method_with_word_length: callable,
+        reverse_order: bool,
+        word_lenght_sort: bool,
+    ):
         """
         Yields data from various generators based on word length.
 
@@ -552,7 +575,10 @@ def _yield_all_data(self, yield_method_with_word_length: callable, reverse_order
 
             return
 
-        generators = [yield_method_with_word_length(i, reverse_order) for i in range(1, max_word_len + 1)]
+        generators = [
+            yield_method_with_word_length(i, reverse_order)
+            for i in range(1, max_word_len + 1)
+        ]
         words_heap = []
         heapify(words_heap)
 
@@ -593,7 +619,6 @@ def _yield_all_data(self, yield_method_with_word_length: callable, reverse_order
             except StopIteration:
                 pass
 
-
     def yield_words(self, reverse_order: bool = False, word_lenght_sort: bool = False):
         """
         Yield all words in the database.
@@ -605,10 +630,13 @@ def yield_words(self, reverse_order: bool = False, word_lenght_sort: bool = Fals
         Yields:
             str: A word from the database.
         """
-        return self._yield_all_data(self.yield_words_with_length, reverse_order, word_lenght_sort)
-
+        return self._yield_all_data(
+            self.yield_words_with_length, reverse_order, word_lenght_sort
+        )
 
-    def yield_word_pos_id(self, reverse_order: bool = False, word_lenght_sort: bool = False):
+    def yield_word_pos_id(
+        self, reverse_order: bool = False, word_lenght_sort: bool = False
+    ):
         """
         Yield words with their part-of-speech tag IDs from the database.
 
@@ -619,10 +647,13 @@ def yield_word_pos_id(self, reverse_order: bool = False, word_lenght_sort: bool
         Yields:
             tuple[str, int]: A tuple containing a word from the database and its associated part-of-speech tag ID.
         """
-        return self._yield_all_data(self.yield_word_pos_id_with_length, reverse_order, word_lenght_sort)
+        return self._yield_all_data(
+            self.yield_word_pos_id_with_length, reverse_order, word_lenght_sort
+        )
 
-
-    def yield_word_pos_level(self, reverse_order: bool = False, word_lenght_sort: bool = False):
+    def yield_word_pos_level(
+        self, reverse_order: bool = False, word_lenght_sort: bool = False
+    ):
         """
         Yield words with their part-of-speech tag IDs and levels from the database.
 
@@ -634,8 +665,9 @@ def yield_word_pos_level(self, reverse_order: bool = False, word_lenght_sort: bo
             tuple[str, int, float]: A tuple containing a word from the database, its associated part-of-speech tag ID,
                 and its level.
         """
-        return self._yield_all_data(self.yield_word_pos_level_with_length, reverse_order, word_lenght_sort)
-
+        return self._yield_all_data(
+            self.yield_word_pos_level_with_length, reverse_order, word_lenght_sort
+        )
 
     def get_word_count_for_length(self, word_length: int) -> int:
         """
@@ -664,13 +696,14 @@ def get_word_count_for_length(self, word_length: int) -> int:
 
                     for k in range(j + 1, word_length):
                         array_pos += 1
-                        last_word[k] = self._data_reader.get_data_array_value_at(array_pos)
+                        last_word[k] = self._data_reader.get_data_array_value_at(
+                            array_pos
+                        )
 
                     break
 
         return unique_words_counter
 
-
     def get_total_words(self) -> int:
         """
         Get the total count of words in the data.
@@ -686,7 +719,6 @@ def get_total_words(self) -> int:
 
         return counter
 
-
     def get_word_pos_count_for_length(self, word_length: int) -> int:
         """
         Count the number of positions in the data where words of a specific length start.
@@ -706,7 +738,6 @@ def get_word_pos_count_for_length(self, word_length: int) -> int:
 
         return (segment_end - segment_start) // data_block_len
 
-
     def get_word_pos_count(self) -> int:
         """
         Get the total count of positions in the data where words start, across all word lengths.
@@ -722,7 +753,6 @@ def get_word_pos_count(self) -> int:
 
         return counter
 
-
     @staticmethod
     def pack_word(word: str) -> bytes:
         """
@@ -734,8 +764,24 @@ def pack_word(word: str) -> bytes:
         Returns:
             bytes: The packed representation of the word.
         """
-        return struct.pack('B' * len(word), *map(ord, word))
+        return struct.pack("B" * len(word), *map(ord, word))
+
+    @staticmethod
+    def _is_word_chars_valid(word: str) -> bool:
+        """
+        Check whether every character in the word is a valid lowercase ASCII letter.
+
+        Characters with code points above 255 (e.g. 'あ', '中') make struct.pack
+        raise struct.error, and other invalid characters (e.g. 'é') can never match a
+        stored word, so callers check first and return None/False instead of crashing.
 
+        Args:
+            word (str): The word to validate.
+
+        Returns:
+            bool: True if all characters are valid, False otherwise.
+        """
+        return all(c in VALID_WORD_CHARACTERS for c in word)
 
     @staticmethod
     def byte_int_level_to_float(level: int) -> float:
diff --git a/src/cefrpy/CEFRDataReader.py b/src/cefrpy/CEFRDataReader.py
index 2b24609..6cb0a70 100644
--- a/src/cefrpy/CEFRDataReader.py
+++ b/src/cefrpy/CEFRDataReader.py
@@ -31,13 +31,16 @@ def __init__(self, data_path: Union[str, None] = None) -> None:
             Exception: If the CEFR database file content is invalid.
         """
 
-        self.data_path = os.path.join(os.path.dirname(__file__), 'data.bin') if data_path is None else data_path
-        self._wlp = array.array('I')
+        self.data_path = (
+            os.path.join(os.path.dirname(__file__), "data.bin")
+            if data_path is None
+            else data_path
+        )
+        self._wlp = array.array("I")
         self._data_array = bytearray()
 
         if not self._read_data():
-            raise Exception(f'CEFR database file content is invalid: {self.data_path}')
-
+            raise Exception(f"CEFR database file content is invalid: {self.data_path}")
 
     def _read_data(self) -> bool:
         """
@@ -46,18 +49,17 @@ def _read_data(self) -> bool:
         Returns:
             bool: True if the data is successfully read and valid, False otherwise.
         """
-        with open(self.data_path, 'rb') as file:
-            wlp_len = struct.unpack('B', file.read(1))[0]
+        with open(self.data_path, "rb") as file:
+            wlp_len = struct.unpack("B", file.read(1))[0]
             if not is_wlp_length_valid(wlp_len):
                 return False
 
-            wlp_data = file.read(wlp_len * struct.calcsize('I'))
+            wlp_data = file.read(wlp_len * struct.calcsize("I"))
             self._wlp.frombytes(wlp_data)
             self._data_array = bytearray(file.read())
 
         return is_data_valid(self._wlp, self._data_array)
 
-
     def get_wlp_value_at(self, i: int) -> int:
         """
         Get the value at index i in the word length positions array.
@@ -76,7 +78,6 @@ def get_wlp_value_at(self, i: int) -> int:
 
         raise IndexError("Index out of range for _wlp")
 
-
     def get_data_array_value_at(self, i: int) -> int:
         """
         Get the value at index i in the data array.
@@ -95,7 +96,6 @@ def get_data_array_value_at(self, i: int) -> int:
 
         raise IndexError("Index out of range for _data_array")
 
-
     def get_wlp_len(self) -> int:
         """
         Get the length of the word length positions array.
@@ -105,7 +105,6 @@ def get_wlp_len(self) -> int:
         """
         return len(self._wlp)
 
-
     def get_data_array_len(self) -> int:
         """
         Get the length of the data array.
diff --git a/src/cefrpy/CEFRDataValidator.py b/src/cefrpy/CEFRDataValidator.py
index cc78c14..8b1d531 100644
--- a/src/cefrpy/CEFRDataValidator.py
+++ b/src/cefrpy/CEFRDataValidator.py
@@ -72,7 +72,7 @@ def validate_data_block(data: bytearray, start_pos: int, block_length: int) -> b
     """
     word_len = block_length - 2
     for i in range(start_pos, start_pos + word_len):
-        if not chr(data[i]) in VALID_WORD_CHARACTERS:
+        if chr(data[i]) not in VALID_WORD_CHARACTERS:
             return False
 
     if data[i + 1] > MAX_POS_TAG_ID:
diff --git a/src/cefrpy/CEFRLevel.py b/src/cefrpy/CEFRLevel.py
index 1b6dc0f..cdcce4c 100644
--- a/src/cefrpy/CEFRLevel.py
+++ b/src/cefrpy/CEFRLevel.py
@@ -1,5 +1,6 @@
 from enum import Enum, unique
 
+
 @unique
 class CEFRLevel(Enum):
     """
diff --git a/src/cefrpy/CEFRSpaCyAnalyzer.py b/src/cefrpy/CEFRSpaCyAnalyzer.py
index 6f04210..405ec5e 100644
--- a/src/cefrpy/CEFRSpaCyAnalyzer.py
+++ b/src/cefrpy/CEFRSpaCyAnalyzer.py
@@ -4,7 +4,8 @@
 
 from .CEFRAnalyzer import CEFRAnalyzer
 
-class CEFRSpaCyAnalyzer():
+
+class CEFRSpaCyAnalyzer:
     """
     Analyze text for CEFR levels, considering provided entity types to skip and abbreviation mapping.
 
@@ -15,8 +16,12 @@ class CEFRSpaCyAnalyzer():
         tokens (list[tuple[str, str, bool, float, int, int]]): List of token tuples containing word, POS tag, skip status, CEFR level, start index, and end index.
     """
 
-    def __init__(self, analyzer: CEFRAnalyzer = CEFRAnalyzer(), entity_types_to_skip: Union[set[str], list[str], None] = None,
-                abbreviation_mapping: Union[dict[str, str], None] = None) -> None:
+    def __init__(
+        self,
+        analyzer: CEFRAnalyzer = CEFRAnalyzer(),
+        entity_types_to_skip: Union[set[str], list[str], None] = None,
+        abbreviation_mapping: Union[dict[str, str], None] = None,
+    ) -> None:
         """
         Initialize the CEFRSpaCyAnalyzer instance.
 
@@ -26,8 +31,12 @@ def __init__(self, analyzer: CEFRAnalyzer = CEFRAnalyzer(), entity_types_to_skip
             abbreviation_mapping (Union[dict[str, str], None], optional): A dictionary mapping abbreviations to their full forms. Defaults to None.
         """
         self._analyzer = analyzer
-        self.entity_types_to_skip = set() if entity_types_to_skip is None else set(entity_types_to_skip)
-        self.abbreviation_mapping = dict() if abbreviation_mapping is None else abbreviation_mapping
+        self.entity_types_to_skip = (
+            set() if entity_types_to_skip is None else set(entity_types_to_skip)
+        )
+        self.abbreviation_mapping = (
+            dict() if abbreviation_mapping is None else abbreviation_mapping
+        )
 
     def _get_next_entity(self, entities_iter: Iterator):
         """
@@ -38,7 +47,9 @@ def _get_next_entity(self, entities_iter: Iterator):
         except StopIteration:
             return None
 
-    def _get_word_pos_tokens_set(self, tokens: list[tuple[str, str, str, bool, int, int]]) -> set[tuple[str, str]]:
+    def _get_word_pos_tokens_set(
+        self, tokens: list[tuple[str, str, str, bool, int, int]]
+    ) -> set[tuple[str, str]]:
         """
         Get unique word and POS tag tuples from tokens.
 
@@ -50,7 +61,9 @@ def _get_word_pos_tokens_set(self, tokens: list[tuple[str, str, str, bool, int,
         """
         return {(token[1], token[2]) for token in tokens if not token[3]}
 
-    def _fetch_word_pos_level_tokens(self, word_pos_tokens_set: set[tuple[str, str]]) -> dict[tuple[str, str], float]:
+    def _fetch_word_pos_level_tokens(
+        self, word_pos_tokens_set: set[tuple[str, str]]
+    ) -> dict[tuple[str, str], float]:
         """
         Fetch CEFR levels for unique word and POS tag tuples.
 
@@ -62,7 +75,9 @@ def _fetch_word_pos_level_tokens(self, word_pos_tokens_set: set[tuple[str, str]]
         """
         result_dict = dict()
         for word, pos_tag in word_pos_tokens_set:
-            level = self._analyzer.get_word_pos_level_float(word, pos_tag, avg_level_not_found_pos=True)
+            level = self._analyzer.get_word_pos_level_float(
+                word, pos_tag, avg_level_not_found_pos=True
+            )
             result_dict[(word, pos_tag)] = level if level is not None else 0
 
         return result_dict
@@ -95,15 +110,20 @@ def analyze_doc(self, doc) -> list[tuple[str, str, bool, float, int, int]]:
                 while current_entity and token_start > current_entity.end_char:
                     current_entity = self._get_next_entity(entities_iter)
 
-                if current_entity and current_entity.label_ in self.entity_types_to_skip \
-                    and current_entity.start_char <= token_start < current_entity.end_char:
+                if (
+                    current_entity
+                    and current_entity.label_ in self.entity_types_to_skip
+                    and current_entity.start_char
+                    <= token_start
+                    < current_entity.end_char
+                ):
                     to_skip = True
 
             word = token.text.strip()
             word_lower = word.lower()
             word_pos = token.tag_
 
-            if word_pos == 'POS' and word_lower == "'s":
+            if word_pos == "POS" and word_lower == "'s":
                 to_skip = True
             else:
                 abbreviation_form = self.abbreviation_mapping.get(word_lower)
@@ -114,18 +134,29 @@ def analyze_doc(self, doc) -> list[tuple[str, str, bool, float, int, int]]:
                 if not to_skip and not word.isalpha():
                     to_skip = True
 
-            nlp_tokens.append((word, word_lower, word_pos, to_skip, token_start, token_end))
+            nlp_tokens.append(
+                (word, word_lower, word_pos, to_skip, token_start, token_end)
+            )
 
         word_pos_set = self._get_word_pos_tokens_set(nlp_tokens)
         word_pos_unique_level_tokens = self._fetch_word_pos_level_tokens(word_pos_set)
 
         self.tokens = []
-        for word, word_lower, word_pos, is_skipped, token_start, token_end in nlp_tokens:
+        for (
+            word,
+            word_lower,
+            word_pos,
+            is_skipped,
+            token_start,
+            token_end,
+        ) in nlp_tokens:
             if is_skipped:
                 level = None
             else:
                 level = word_pos_unique_level_tokens.get((word_lower, word_pos))
 
-            self.tokens.append((word, word_pos, is_skipped, level, token_start, token_end))
+            self.tokens.append(
+                (word, word_pos, is_skipped, level, token_start, token_end)
+            )
 
         return self.tokens
diff --git a/src/cefrpy/POSTag.py b/src/cefrpy/POSTag.py
index 4f6ae9a..8830ed5 100644
--- a/src/cefrpy/POSTag.py
+++ b/src/cefrpy/POSTag.py
@@ -1,36 +1,37 @@
 from enum import Enum, unique
 
 POS_TAGS_DESCRIPTIONS = [
-    'Coordinating conjunction',
-    'Cardinal number',
-    'Determiner',
-    'Preposition or subordinating conjunction',
-    'Adjective',
-    'Adjective, comparative',
-    'Adjective, superlative',
-    'Modal',
-    'Noun, singular or mass',
-    'Noun, plural',
-    'Proper noun, singular',
-    'Proper noun, plural',
-    'Personal/Posessive pronoun',
-    'Adverb',
-    'Adverb, comparative',
-    'Adverb, superlative',
-    'Particle',
-    'To',
-    'Interjection',
-    'Verb, base form',
-    'Verb, past tense',
-    'Verb, gerund or present participle',
-    'Verb, past participle',
-    'Verb, non-3rd person singular present',
-    'Verb, 3rd person singular present',
-    'Wh-determiner',
-    'Wh-pronoun',
-    'Wh-adverb'
+    "Coordinating conjunction",
+    "Cardinal number",
+    "Determiner",
+    "Preposition or subordinating conjunction",
+    "Adjective",
+    "Adjective, comparative",
+    "Adjective, superlative",
+    "Modal",
+    "Noun, singular or mass",
+    "Noun, plural",
+    "Proper noun, singular",
+    "Proper noun, plural",
+    "Personal/Posessive pronoun",
+    "Adverb",
+    "Adverb, comparative",
+    "Adverb, superlative",
+    "Particle",
+    "To",
+    "Interjection",
+    "Verb, base form",
+    "Verb, past tense",
+    "Verb, gerund or present participle",
+    "Verb, past participle",
+    "Verb, non-3rd person singular present",
+    "Verb, 3rd person singular present",
+    "Wh-determiner",
+    "Wh-pronoun",
+    "Wh-adverb",
 ]
 
+
 @unique
 class POSTag(Enum):
     """
@@ -66,21 +67,18 @@ class POSTag(Enum):
     WP = 26
     WRB = 27
 
-
     def __str__(self) -> str:
         """
         Returns a string representation of the POS tag.
         """
         return self.name
 
-
     def __int__(self) -> int:
         """
         Returns an integer representation of the POS tag.
         """
         return self.value
 
-
     def __eq__(self, other) -> bool:
         """
         Checks if this POS tag is equal to another POS tag.
@@ -90,21 +88,18 @@ def __eq__(self, other) -> bool:
 
         return NotImplemented
 
-
     def __hash__(self) -> int:
         """
         Returns the hash value of the POS tag.
         """
         return self.value
 
-
     def get_description(self) -> str:
         """
         Retrieve the description of a POS tag.
         """
         return POS_TAGS_DESCRIPTIONS[self.value]
 
-
     @classmethod
     def from_tag_name(cls, tag_name: str):
         """
@@ -125,7 +120,6 @@ def from_tag_name(cls, tag_name: str):
 
         return tag
 
-
     @staticmethod
     def get_id_by_tag_name(tag_name: str) -> int:
         """
@@ -145,7 +139,6 @@ def get_id_by_tag_name(tag_name: str) -> int:
 
         return POSTag[tag_name].value
 
-
     @staticmethod
     def get_tag_name_by_id(tag_id: int) -> str:
         """
@@ -165,7 +158,6 @@ def get_tag_name_by_id(tag_id: int) -> str:
 
         raise ValueError(f"Invalid tag id: {tag_id}")
 
-
     @staticmethod
     def get_description_by_tag_name(tag_name: str) -> str:
         """
@@ -184,7 +176,6 @@ def get_description_by_tag_name(tag_name: str) -> str:
 
         return POS_TAGS_DESCRIPTIONS[tag_id]
 
-
     @staticmethod
     def get_description_by_tag_id(tag_id: int) -> str:
         """
@@ -204,7 +195,6 @@ def get_description_by_tag_id(tag_id: int) -> str:
 
         return POS_TAGS_DESCRIPTIONS[tag_id]
 
-
     @staticmethod
     def get_total_tags() -> int:
         """
@@ -215,7 +205,6 @@ def get_total_tags() -> int:
         """
         return len(POSTag.__members__)
 
-
     @staticmethod
     def get_all_tags() -> list[str]:
         """
diff --git a/src/cefrpy/__init__.py b/src/cefrpy/__init__.py
index a580585..e92b138 100644
--- a/src/cefrpy/__init__.py
+++ b/src/cefrpy/__init__.py
@@ -5,5 +5,5 @@
 from .CEFRAnalyzer import CEFRAnalyzer
 from .CEFRSpaCyAnalyzer import CEFRSpaCyAnalyzer
 
-__version__ = "1.0.1"
+__version__ = "1.0.2"
 __all__ = ["POSTag", "CEFRDataReader", "CEFRDataProcessor", "CEFRLevel", "CEFRAnalyzer", "CEFRSpaCyAnalyzer"]
diff --git a/tests/test_CEFRAnalyzer.py b/tests/test_CEFRAnalyzer.py
index 3c2fbf9..61305c1 100644
--- a/tests/test_CEFRAnalyzer.py
+++ b/tests/test_CEFRAnalyzer.py
@@ -1,7 +1,7 @@
 import unittest
 
 from random import randint
-from cefrpy import CEFRAnalyzer, CEFRDataReader, POSTag, CEFRLevel
+from cefrpy import CEFRAnalyzer, POSTag, CEFRLevel
 
 
 class TestCEFRAnalyzer(unittest.TestCase):
@@ -11,7 +11,12 @@ def setUpClass(cls):
         cls.valid_word_pos = POSTag.NN
         cls.valid_word_unknown_pos = POSTag.CD
         cls.not_valid_words_test_pos_tag = POSTag.CC
-        cls.not_valid_words = ("", "@test@", "notvalidword", "toolongwordtoolongwordtoolongwordtoolongwordtoolongword")
+        cls.not_valid_words = (
+            "",
+            "@test@",
+            "notvalidword",
+            "toolongwordtoolongwordtoolongwordtoolongwordtoolongword",
+        )
         cls.analyzer = CEFRAnalyzer()
 
     def test_get_max_word_len(self):
@@ -26,30 +31,58 @@ def test_get_pos_tag_id(self):
         self.assertEqual(CEFRAnalyzer.get_pos_tag_id(tag_str), tag_id)
 
     def test_get_word_pos_level_float(self):
-        valid_word_pos_level = self.analyzer.get_word_pos_level_float(self.valid_word, self.valid_word_pos, False)
-        valid_avg_word_pos_level = self.analyzer.get_word_pos_level_float(self.valid_word, self.valid_word_unknown_pos, True)
-        none_level = self.analyzer.get_word_pos_level_float(self.valid_word, self.valid_word_unknown_pos, False)
+        valid_word_pos_level = self.analyzer.get_word_pos_level_float(
+            self.valid_word, self.valid_word_pos, False
+        )
+        valid_avg_word_pos_level = self.analyzer.get_word_pos_level_float(
+            self.valid_word, self.valid_word_unknown_pos, True
+        )
+        none_level = self.analyzer.get_word_pos_level_float(
+            self.valid_word, self.valid_word_unknown_pos, False
+        )
 
         self.assertIsNotNone(valid_word_pos_level)
         self.assertIsNotNone(valid_avg_word_pos_level)
         self.assertIsNone(none_level)
 
         for word in self.not_valid_words:
-            self.assertIsNone(self.analyzer.get_word_pos_level_float(word, self.not_valid_words_test_pos_tag, False))
-            self.assertIsNone(self.analyzer.get_word_pos_level_float(word, self.not_valid_words_test_pos_tag, True))
+            self.assertIsNone(
+                self.analyzer.get_word_pos_level_float(
+                    word, self.not_valid_words_test_pos_tag, False
+                )
+            )
+            self.assertIsNone(
+                self.analyzer.get_word_pos_level_float(
+                    word, self.not_valid_words_test_pos_tag, True
+                )
+            )
 
     def test_get_word_pos_level_CEFR(self):
-        valid_word_pos_level = self.analyzer.get_word_pos_level_CEFR(self.valid_word, self.valid_word_pos, False)
-        valid_avg_word_pos_level = self.analyzer.get_word_pos_level_CEFR(self.valid_word, self.valid_word_unknown_pos, True)
-        none_level = self.analyzer.get_word_pos_level_CEFR(self.valid_word, self.valid_word_unknown_pos, False)
+        valid_word_pos_level = self.analyzer.get_word_pos_level_CEFR(
+            self.valid_word, self.valid_word_pos, False
+        )
+        valid_avg_word_pos_level = self.analyzer.get_word_pos_level_CEFR(
+            self.valid_word, self.valid_word_unknown_pos, True
+        )
+        none_level = self.analyzer.get_word_pos_level_CEFR(
+            self.valid_word, self.valid_word_unknown_pos, False
+        )
 
         self.assertIsInstance(valid_word_pos_level, CEFRLevel)
         self.assertIsInstance(valid_avg_word_pos_level, CEFRLevel)
         self.assertIsNone(none_level)
 
         for word in self.not_valid_words:
-            self.assertIsNone(self.analyzer.get_word_pos_level_float(word, self.not_valid_words_test_pos_tag, False))
-            self.assertIsNone(self.analyzer.get_word_pos_level_float(word, self.not_valid_words_test_pos_tag, True))
+            self.assertIsNone(
+                self.analyzer.get_word_pos_level_float(
+                    word, self.not_valid_words_test_pos_tag, False
+                )
+            )
+            self.assertIsNone(
+                self.analyzer.get_word_pos_level_float(
+                    word, self.not_valid_words_test_pos_tag, True
+                )
+            )
 
     def test_get_avg_word_level_float(self):
         valid_word_level = self.analyzer.get_average_word_level_float(self.valid_word)
@@ -72,11 +105,21 @@ def test_is_word_in_database(self):
             self.assertFalse(self.analyzer.is_word_in_database(word))
 
     def test_is_word_pos_in_database(self):
-        self.assertTrue(self.analyzer.is_word_pos_id_database(self.valid_word, self.valid_word_pos))
-        self.assertFalse(self.analyzer.is_word_pos_id_database(self.valid_word, self.valid_word_unknown_pos))
+        self.assertTrue(
+            self.analyzer.is_word_pos_id_database(self.valid_word, self.valid_word_pos)
+        )
+        self.assertFalse(
+            self.analyzer.is_word_pos_id_database(
+                self.valid_word, self.valid_word_unknown_pos
+            )
+        )
 
         for word in self.not_valid_words:
-            self.assertFalse(self.analyzer.is_word_pos_id_database(word, self.not_valid_words_test_pos_tag))
+            self.assertFalse(
+                self.analyzer.is_word_pos_id_database(
+                    word, self.not_valid_words_test_pos_tag
+                )
+            )
 
     def test_yields(self):
         valid_word_len = len(self.valid_word)
@@ -94,7 +137,9 @@ def test_yields(self):
         self.assertEqual(len(valid_words), total_words)
 
         valid_words_iter = reversed(valid_words)
-        for word in self.analyzer.yield_words_with_length(valid_word_len, reverse_order=True):
+        for word in self.analyzer.yield_words_with_length(
+            valid_word_len, reverse_order=True
+        ):
             self.assertEqual(next(valid_words_iter), word)
 
         with self.assertRaises(StopIteration):
@@ -104,8 +149,14 @@ def test_yields(self):
         word = next(valid_words_iter)
         word_pos_counter = 0
 
-        for data1, data2 in zip(self.analyzer.yield_word_pos_with_length(valid_word_len, pos_tag_as_string=False),
-                                self.analyzer.yield_word_pos_level_with_length(valid_word_len, pos_tag_as_string=True)):
+        for data1, data2 in zip(
+            self.analyzer.yield_word_pos_with_length(
+                valid_word_len, pos_tag_as_string=False
+            ),
+            self.analyzer.yield_word_pos_level_with_length(
+                valid_word_len, pos_tag_as_string=True
+            ),
+        ):
             word1, pos1 = data1
             word2, pos2, level = data2
 
@@ -130,8 +181,17 @@ def test_yields(self):
         word = next(valid_words_iter)
         word_pos_counter = 0
 
-        for data1, data2 in zip(self.analyzer.yield_word_pos_with_length(valid_word_len, pos_tag_as_string=True, reverse_order=True),
-                                self.analyzer.yield_word_pos_level_with_length(valid_word_len, pos_tag_as_string=False, word_level_as_float=True, reverse_order=True)):
+        for data1, data2 in zip(
+            self.analyzer.yield_word_pos_with_length(
+                valid_word_len, pos_tag_as_string=True, reverse_order=True
+            ),
+            self.analyzer.yield_word_pos_level_with_length(
+                valid_word_len,
+                pos_tag_as_string=False,
+                word_level_as_float=True,
+                reverse_order=True,
+            ),
+        ):
             word1, pos1 = data1
             word2, pos2, level = data2
 
@@ -159,7 +219,9 @@ def test_yields_alphabetical(self):
         word_counter = 0
         last_word = ""
 
-        for word in self.analyzer.yield_words(reverse_order=False, word_length_sort=False):
+        for word in self.analyzer.yield_words(
+            reverse_order=False, word_length_sort=False
+        ):
             self.assertGreater(word, last_word)
             last_word = word
             word_counter += 1
@@ -167,7 +229,9 @@ def test_yields_alphabetical(self):
         self.assertEqual(word_counter, total_words)
 
         word_counter = 1
-        generator = self.analyzer.yield_words(reverse_order=True, word_length_sort=False)
+        generator = self.analyzer.yield_words(
+            reverse_order=True, word_length_sort=False
+        )
         last_word = next(generator)
 
         for word in generator:
@@ -185,7 +249,9 @@ def test_yields_word_length_sort(self):
         last_len = 0
         last_word = ""
 
-        for word in self.analyzer.yield_words(reverse_order=False, word_length_sort=True):
+        for word in self.analyzer.yield_words(
+            reverse_order=False, word_length_sort=True
+        ):
             word_len = len(word)
             self.assertGreaterEqual(word_len, last_len)
 
@@ -219,5 +285,5 @@ def test_yields_word_length_sort(self):
         self.assertEqual(word_counter, total_words)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/tests/test_CEFRDataProcessor.py b/tests/test_CEFRDataProcessor.py
index 49f43f5..9db1f5a 100644
--- a/tests/test_CEFRDataProcessor.py
+++ b/tests/test_CEFRDataProcessor.py
@@ -3,6 +3,7 @@
 from math import inf
 from cefrpy import CEFRDataProcessor, POSTag
 
+
 class TestCEFRDataProcessor(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
@@ -10,7 +11,12 @@ def setUpClass(cls):
         cls.valid_word_pos_id = int(POSTag.NN)
         cls.valid_word_unknown_pos_id = int(POSTag.CD)
         cls.not_valid_words_test_pos_tag = int(POSTag.CC)
-        cls.not_valid_words = ("", "@test@", "notvalidword", "toolongwordtoolongwordtoolongwordtoolongwordtoolongword")
+        cls.not_valid_words = (
+            "",
+            "@test@",
+            "notvalidword",
+            "toolongwordtoolongwordtoolongwordtoolongwordtoolongword",
+        )
         cls.processor = CEFRDataProcessor()
 
     def test_get_wlp_and_max_word_len(self):
@@ -33,7 +39,7 @@ def test_word_len_valid(self):
         self.assertTrue(self.processor.is_word_len_valid(max_valid_word_len))
 
     def test_pack_word(self):
-        self.assertEqual(CEFRDataProcessor.pack_word("test"), b'test')
+        self.assertEqual(CEFRDataProcessor.pack_word("test"), b"test")
 
     def test_byte_int_level_to_float(self):
         self.assertAlmostEqual(CEFRDataProcessor.byte_int_level_to_float(0), 1)
@@ -65,35 +71,71 @@ def test_is_word_in_database(self):
             self.assertFalse(self.processor.is_word_in_database(word))
 
     def test_is_word_pos_in_database(self):
-        self.assertTrue(self.processor.is_word_pos_id_database(self.valid_word, self.valid_word_pos_id))
-        self.assertFalse(self.processor.is_word_pos_id_database(self.valid_word, self.valid_word_unknown_pos_id))
+        self.assertTrue(
+            self.processor.is_word_pos_id_database(
+                self.valid_word, self.valid_word_pos_id
+            )
+        )
+        self.assertFalse(
+            self.processor.is_word_pos_id_database(
+                self.valid_word, self.valid_word_unknown_pos_id
+            )
+        )
 
         for word in self.not_valid_words:
-            self.assertFalse(self.processor.is_word_pos_id_database(word, self.not_valid_words_test_pos_tag))
+            self.assertFalse(
+                self.processor.is_word_pos_id_database(
+                    word, self.not_valid_words_test_pos_tag
+                )
+            )
 
     def test_get_word_level_for_pos_id(self):
-        self.assertIsNotNone(self.processor.get_word_level_for_pos_id(self.valid_word, self.valid_word_pos_id, False))
-        self.assertIsNone(self.processor.get_word_level_for_pos_id(self.valid_word, inf, False))
-
-        self.assertIsNone(self.processor.get_word_level_for_pos_id(self.valid_word, inf, False))
-        self.assertIsNotNone(self.processor.get_word_level_for_pos_id(self.valid_word, inf, True))
+        self.assertIsNotNone(
+            self.processor.get_word_level_for_pos_id(
+                self.valid_word, self.valid_word_pos_id, False
+            )
+        )
+        self.assertIsNone(
+            self.processor.get_word_level_for_pos_id(self.valid_word, inf, False)
+        )
+
+        self.assertIsNone(
+            self.processor.get_word_level_for_pos_id(self.valid_word, inf, False)
+        )
+        self.assertIsNotNone(
+            self.processor.get_word_level_for_pos_id(self.valid_word, inf, True)
+        )
 
         for word in self.not_valid_words:
-            self.assertIsNone(self.processor.get_word_level_for_pos_id(word, self.not_valid_words_test_pos_tag, True))
-            self.assertIsNone(self.processor.get_word_level_for_pos_id(word, self.not_valid_words_test_pos_tag, False))
+            self.assertIsNone(
+                self.processor.get_word_level_for_pos_id(
+                    word, self.not_valid_words_test_pos_tag, True
+                )
+            )
+            self.assertIsNone(
+                self.processor.get_word_level_for_pos_id(
+                    word, self.not_valid_words_test_pos_tag, False
+                )
+            )
 
     def test_get_word_count_for_length(self):
         self.assertTrue(0 <= self.processor.get_word_count_for_length(1) <= 26)
 
         valid_word_len = len(self.valid_word)
-        self.assertTrue(1 <= self.processor.get_word_count_for_length(valid_word_len) <= pow(26, valid_word_len))
+        self.assertTrue(
+            1
+            <= self.processor.get_word_count_for_length(valid_word_len)
+            <= pow(26, valid_word_len)
+        )
 
     def test_word_pos_count_for_length(self):
         self.assertGreaterEqual(self.processor.get_word_pos_count_for_length(1), 0)
 
         valid_word_len = len(self.valid_word)
-        self.assertGreater(self.processor.get_word_pos_count_for_length(valid_word_len), 0)
+        self.assertGreater(
+            self.processor.get_word_pos_count_for_length(valid_word_len), 0
+        )
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/tests/test_CEFRDataValidator.py b/tests/test_CEFRDataValidator.py
index f927c57..515d3f5 100644
--- a/tests/test_CEFRDataValidator.py
+++ b/tests/test_CEFRDataValidator.py
@@ -9,11 +9,7 @@ def setUp(self):
         self.valid_wlp_lengths = [2, 3, 100, 254, 255]
         self.invalid_wlp_lengths = [-inf, -1, 0, 1, 256, 500, inf]
 
-        self.valid_wlp_arrays = [
-            [0, 9],
-            [0, 6, 10],
-            [3, 6, 6, 6, 12]
-        ]
+        self.valid_wlp_arrays = [[0, 9], [0, 6, 10], [3, 6, 6, 6, 12]]
 
         self.invalid_wlp_arrays = [
             [],
@@ -21,22 +17,22 @@ def setUp(self):
             [0, -1],
             [1, 2, 3, 4, 5],
             [0, 3, 5, 12],
-            [3, 12, 9, 12, 17]
+            [3, 12, 9, 12, 17],
         ]
 
         self.valid_data = [
-            bytearray(b'a\x00\x00d\x03\x05z\x02\x10'),
-            bytearray(b'g\x10\x05y\x04\x89kk\x05\x12'),
-            bytearray(b'---c\x06\x15qwer\x10\x35----')
+            bytearray(b"a\x00\x00d\x03\x05z\x02\x10"),
+            bytearray(b"g\x10\x05y\x04\x89kk\x05\x12"),
+            bytearray(b"---c\x06\x15qwer\x10\x35----"),
         ]
 
         self.invalid_data = [
-            bytearray(b'something\x00\x02test\x00\x01'),
-            bytearray(b'hello'),
-            bytearray(b'c\x06qwer\x10\x35'),
-            bytearray(b'a\x99\x99d\x03\x05z\x02\x10'),
-            bytearray(b'testsomething'),
-            bytearray(b'#\x00\x00@\x03\x05#\x02\x10')
+            bytearray(b"something\x00\x02test\x00\x01"),
+            bytearray(b"hello"),
+            bytearray(b"c\x06qwer\x10\x35"),
+            bytearray(b"a\x99\x99d\x03\x05z\x02\x10"),
+            bytearray(b"testsomething"),
+            bytearray(b"#\x00\x00@\x03\x05#\x02\x10"),
         ]
 
     def test_wlp_length_valid(self):
@@ -67,5 +63,5 @@ def test_cefr_data_invalid(self):
             self.assertFalse(CEFRDataValidator.is_data_valid(wlp_array, data))
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/tests/test_CEFRLevel.py b/tests/test_CEFRLevel.py
index 39dd0dd..0bbef60 100644
--- a/tests/test_CEFRLevel.py
+++ b/tests/test_CEFRLevel.py
@@ -2,9 +2,17 @@
 
 from cefrpy import CEFRLevel
 
+
 class TestCEFRLevel(unittest.TestCase):
     def setUp(self):
-        self.levels = [CEFRLevel.A1, CEFRLevel.A2, CEFRLevel.B1, CEFRLevel.B2, CEFRLevel.C1, CEFRLevel.C2]
+        self.levels = [
+            CEFRLevel.A1,
+            CEFRLevel.A2,
+            CEFRLevel.B1,
+            CEFRLevel.B2,
+            CEFRLevel.C1,
+            CEFRLevel.C2,
+        ]
 
     def test_equality(self):
         for level in self.levels:
@@ -48,5 +56,5 @@ def test_from_string_method(self):
             self.assertEqual(level_from_str, level)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/tests/test_POSTag.py b/tests/test_POSTag.py
index 31712f5..855ca48 100644
--- a/tests/test_POSTag.py
+++ b/tests/test_POSTag.py
@@ -3,6 +3,7 @@
 from math import inf
 from cefrpy import POSTag
 
+
 class TestPOSTag(unittest.TestCase):
     def setUp(self):
         self.total_tags = POSTag.get_total_tags()
@@ -83,5 +84,5 @@ def test_get_all_tags(self):
             self.assertIsNotNone(POSTag.__members__.get(pos_tag))
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()

From 6ae0eb328534d038dce6d6c05748ad2b8bf5fb47 Mon Sep 17 00:00:00 2001
From: Bielikov Maksym <maximax6767@gmail.com>
Date: Tue, 5 May 2026 09:30:57 +0300
Subject: [PATCH 2/2] refactor: documentation

---
 README.md       | 449 +++++++++++++++++++++++++++++++++---------------
 docs/cefrpy.rst |   2 +-
 docs/docs.md    | 319 ++++++++++++++++++----------------
 3 files changed, 482 insertions(+), 288 deletions(-)

diff --git a/README.md b/README.md
index 029d606..e38ee2a 100644
--- a/README.md
+++ b/README.md
@@ -10,9 +10,9 @@
 
 The cefrpy python module offers a comprehensive toolkit for analyzing linguistic data based on the Common European Framework of Reference for Languages (CEFR).
 
-Documentation: https://maximax67.github.io/cefrpy/
+Documentation: <https://maximax67.github.io/cefrpy/>
 
-HuggingFace demo: https://huggingface.co/spaces/Maximax67/cefrpy-demo
+HuggingFace demo: <https://huggingface.co/spaces/Maximax67/cefrpy-demo>
 
 ## Installation
 
@@ -48,7 +48,7 @@ else:
     print(f"CEFR level not found for '{word}' as a {pos_tag}.")
 ```
 
-### Getting Average Level of a Word:
+### Getting average level of a word
 
 ```py
 from cefrpy import CEFRAnalyzer
@@ -73,7 +73,8 @@ else:
 
 ### Recommended usage with [spaCy](https://spacy.io)
 
-#### Import spacy and load model:
+#### Import spacy and load model
+
 ```py
 import spacy
 
@@ -148,185 +149,363 @@ for token in tokens:
 
 Result (truncated):
 
-```
+```text
 -------------------------------------------------------
- WORD                      	POS	LEVEL	CEFR
+ WORD                       POS LEVEL CEFR
 -------------------------------------------------------
-                          	_SP	Skip	None
-In                        	IN	1.00	A1
-the                       	DT	1.00	A1
-heart                     	NN	1.00	A1
-of                        	IN	1.00	A1
-every                     	DT	1.00	A1
-forest                    	NN	2.00	A2
-,                         	,	Skip	None
-a                         	DT	1.00	A1
-hidden                    	JJ	3.00	B1
-world                     	NN	1.00	A1
-thrives                   	VBZ	5.86	C2
-among                     	IN	2.00	A2
-the                       	DT	1.00	A1
-towering                  	VBG	1.00	A1
-trees                     	NNS	1.00	A1
-.                         	.	Skip	None
-Trees                     	NNS	1.00	A1
-,                         	,	Skip	None
-                          	_SP	Skip	None
-those                     	DT	1.00	A1
-silent                    	JJ	3.00	B1
+                           _SP Skip None
+In                         IN 1.00 A1
+the                        DT 1.00 A1
+heart                      NN 1.00 A1
+of                         IN 1.00 A1
+every                      DT 1.00 A1
+forest                     NN 2.00 A2
+,                          , Skip None
+a                          DT 1.00 A1
+hidden                     JJ 3.00 B1
+world                      NN 1.00 A1
+thrives                    VBZ 5.86 C2
+among                      IN 2.00 A2
+the                        DT 1.00 A1
+towering                   VBG 1.00 A1
+trees                      NNS 1.00 A1
+.                          . Skip None
+Trees                      NNS 1.00 A1
+,                          , Skip None
+                           _SP Skip None
+those                      DT 1.00 A1
+silent                     JJ 3.00 B1
 ```
 
 #### Get more statistical information
 
 1. Filter tokens by level:
 
-```py
-def filter_for_desired_level(level_tokens: list[tuple[str, str, bool, float, int, int]],
-                            min_level: float | int = 1.0, max_level: float | int = 6.0
-                            ) -> set[tuple[str, str, bool, float, int, int]]:
-    filtered_tokens = set()
-    for token in level_tokens:
-        level = token[3]
+    ```py
+    def filter_for_desired_level(level_tokens: list[tuple[str, str, bool, float, int, int]],
+                                min_level: float | int = 1.0, max_level: float | int = 6.0
+                                ) -> set[tuple[str, str, bool, float, int, int]]:
+        filtered_tokens = set()
+        for token in level_tokens:
+            level = token[3]
+
+            if level and level >= min_level and level <= max_level:
+                filtered_tokens.add(token)
+
+        return filtered_tokens
+
+
+    # You can also set min/max level as an int or float in range from 1 to 6
+    desired_min_level = CEFRLevel.B2
+    desired_level_words_set = filter_for_desired_level(tokens, min_level=int(desired_min_level))
+
+    desired_level_words_list = list(desired_level_words_set)
+    desired_level_words_list.sort()
+
+    print(f'\tWords with level {desired_min_level} and higher: {len(desired_level_words_list)}')
+    for word_data in desired_level_words_list:
+        word, pos, _, level, _, _ = word_data
+        print(f"{word.ljust(26)} {pos.ljust(6)} {'{:.2f}'.format(level).ljust(6)} {CEFRLevel(round(level))}")
+    ```
+
+    ```text
+    Words with level B2 and higher: 16
+    benefactors                NNS    6.00   C2
+    bristlecone                NN     6.00   C2
+    evolved                    VBN    4.00   B2
+    fungi                      NNS    5.20   C1
+    living                     NN     4.00   B2
+    longevity                  NN     5.96   C2
+    masters                    NNS    4.00   B2
+    mighty                     JJ     4.00   B2
+    observers                  NNS    4.00   B2
+    pines                      NNS    4.00   B2
+    potential                  JJ     4.00   B2
+    sequoias                   NNS    6.00   C2
+    thrives                    VBZ    5.86   C2
+    underground                RB     4.00   B2
+    wildfires                  NNS    6.00   C2
+    withstand                  VB     5.12   C1
+    ```
 
-        if level and level >= min_level and level <= max_level:
-            filtered_tokens.add(token)
+2. Get CEFR statistic of the text:
 
-    return filtered_tokens
+    ```py
+    def get_word_level_count_statistic(level_tokens: list[tuple[str, str, bool, float, int, int]]) -> list[int]:
+        difficulty_levels_count = [0] * 6
+        for token in level_tokens:
+            level = token[3]
+            if not level:
+                continue
 
+            level_round = round(level)
+            difficulty_levels_count[level_round - 1] += 1
 
-# You can also set min/max level as an int or float in range from 1 to 6
-desired_min_level = CEFRLevel.C1
-desired_level_words_set = filter_for_desired_level(tokens, min_level=int(desired_min_level))
+        return difficulty_levels_count
 
-desired_level_words_list = list(desired_level_words_set)
-desired_level_words_list.sort()
+    difficulty_levels_count = get_word_level_count_statistic(tokens)
+    print('CEFR statistic (total words):')
+    for i in range(1, 7):
+        print(f'{CEFRLevel(i)}: {difficulty_levels_count[i - 1]}')
+    ```
 
-print(f'\tWords with level {desired_min_level} and higher: {len(desired_level_words_list)}')
-for word_data in desired_level_words_list:
-    word, pos, _, level, _, _ = word_data
-    print(f"{word.ljust(26)} {pos.ljust(6)} {'{:.2f}'.format(level).ljust(6)} {CEFRLevel(round(level))}")
-```
+    ```text
+    CEFR statistic (total words):
+    A1: 136
+    A2: 36
+    B1: 27
+    B2: 11
+    C1: 2
+    C2: 6
+    ```
 
-```
-Words with level B2 and higher: 16
-benefactors                NNS    6.00   C2
-bristlecone                NN     6.00   C2
-evolved                    VBN    4.00   B2
-fungi                      NNS    5.20   C1
-living                     NN     4.00   B2
-longevity                  NN     5.96   C2
-masters                    NNS    4.00   B2
-mighty                     JJ     4.00   B2
-observers                  NNS    4.00   B2
-pines                      NNS    4.00   B2
-potential                  JJ     4.00   B2
-sequoias                   NNS    6.00   C2
-thrives                    VBZ    5.86   C2
-underground                RB     4.00   B2
-wildfires                  NNS    6.00   C2
-withstand                  VB     5.12   C1
-```
+3. Get CEFR statistic for unique words in the text:
 
-2. Get CEFR statistic of the text:
+    ```py
+    def get_word_level_count_statistic_unique(level_tokens: list[tuple[str, str, bool, float, int, int]]) -> list[int]:
+        processed_word_pos_set = set()
+        difficulty_levels_count = [0] * 6
+        for token in level_tokens:
+            level = token[3]
+            if not level:
+                continue
+
+            to_check_tuple = (token[0], token[1])
+            if not to_check_tuple in processed_word_pos_set:
+                level_round = round(token[3])
+                difficulty_levels_count[level_round - 1] += 1
+                processed_word_pos_set.add(to_check_tuple)
+
+        return difficulty_levels_count
+
+
+    difficulty_levels_count_unique = get_word_level_count_statistic_unique(tokens)
+    print('CEFR statistic (unique words):')
+    for i in range(1, 7):
+        print(f'{CEFRLevel(i)}: {difficulty_levels_count_unique[i - 1]}')
+    ```
+
+    ```text
+    CEFR statistic (unique words):
+    A1: 77
+    A2: 33
+    B1: 23
+    B2: 11
+    C1: 2
+    C2: 6
+    ```
+
+4. Get the set of words in the text whose CEFR level was not found:
+
+    ```py
+    def get_not_found_words(level_tokens: list[tuple[str, str, bool, float, int, int]]) -> set[str]:
+        not_found_words = set()
+        for token in level_tokens:
+            if token[2]:
+                continue
+
+            if not token[3]:
+                not_found_words.add(token[0])
+
+        return not_found_words
+
+
+    not_found_words_set = get_not_found_words(tokens)
+    not_found_words_list = list(not_found_words_set)
+    not_found_words_list.sort()
+
+    print('Not found words:', len(not_found_words_list))
+    if len(not_found_words_list):
+        print('\n'.join(not_found_words_list))
+    ```
+
+    ```text
+    Not found words: 0
+    ```
+
+## Additional features
+
+### Get all possible part-of-speech tags for a word
 
 ```py
-def get_word_level_count_statistic(level_tokens: list[tuple[str, str, bool, float, int, int]]) -> list[int]:
-    difficulty_levels_count = [0] * 6
-    for token in level_tokens:
-        level = token[3]
-        if not level:
-            continue
-
-        level_round = round(level)
-        difficulty_levels_count[level_round - 1] += 1
-
-    return difficulty_levels_count
-
-difficulty_levels_count = get_word_level_count_statistic(tokens)
-print('CEFR statistic (total words):')
-for i in range(1, 7):
-    print(f'{CEFRLevel(i)}: {difficulty_levels_count[i - 1]}')
+from cefrpy import CEFRAnalyzer
+
+analyzer = CEFRAnalyzer()
+
+print(analyzer.get_all_pos_for_word("test")) # [<POSTag.JJ: 4>, <POSTag.NN: 8>, <POSTag.VB: 19>]
+print(analyzer.get_all_pos_for_word_as_str("test")) # ['JJ', 'NN', 'VB']
+
+# {<POSTag.JJ: 4>: <CEFRLevel.A2: 2>, <POSTag.NN: 8>: <CEFRLevel.A1: 1>, <POSTag.VB: 19>: <CEFRLevel.B2: 4>}
+print(analyzer.get_pos_level_dict_for_word("test"))
+
+# {'JJ': 2.5, 'NN': 1.0, 'VB': 4.0}
+print(analyzer.get_pos_level_dict_for_word("test", pos_tag_as_string=True, word_level_as_float=True))
 ```
 
+### Checking if a word exists in the database
+
+```py
+from cefrpy import CEFRAnalyzer
+
+analyzer = CEFRAnalyzer()
+
+word = "apple"
+if analyzer.is_word_in_database(word):
+    print(f"'{word}' exists in the database.")
+else:
+    print(f"'{word}' does not exist in the database.")
 ```
-CEFR statistic (total words):
-A1: 136
-A2: 36
-B1: 27
-B2: 11
-C1: 2
-C2: 6
+
+### Checking if a word with a specific part-of-speech exists in the database
+
+```py
+from cefrpy import CEFRAnalyzer
+
+analyzer = CEFRAnalyzer()
+
+word = "run"
+pos_tag = "VB"  # Verb
+if analyzer.is_word_pos_id_database(word, pos_tag):
+    print(f"'{word}' with part of speech '{pos_tag}' exists in the database.")
+else:
+    print(f"'{word}' with part of speech '{pos_tag}' does not exist in the database.")
 ```
 
-3. Get CEFR statistic for unique words in the text:
+### POSTag usage examples
 
 ```py
-def get_word_level_count_statistic_unique(level_tokens: list[tuple[str, str, bool, float, int, int]]) -> list[int]:
-    processed_word_pos_set = set()
-    difficulty_levels_count = [0] * 6
-    for token in level_tokens:
-        level = token[3]
-        if not level:
-            continue
-
-        to_check_tuple = (token[0], token[1])
-        if not to_check_tuple in processed_word_pos_set:
-            level_round = round(token[3])
-            difficulty_levels_count[level_round - 1] += 1
-            processed_word_pos_set.add(to_check_tuple)
+from cefrpy import POSTag
 
-    return difficulty_levels_count
+# Get list of all part-of-speech tag names
+print(POSTag.get_all_tags()) # ['CC', 'CD', 'DT', ...]
 
+# Print total tags
+print(POSTag.get_total_tags()) # 28
 
-difficulty_levels_count_unique = get_word_level_count_statistic_unique(tokens)
-print('CEFR statistic (unique words):')
-for i in range(1, 7):
-    print(f'{CEFRLevel(i)}: {difficulty_levels_count_unique[i - 1]}')
-```
+# Get description for a tag
+print(POSTag.get_description_by_tag_name('NN')) # Noun, singular or mass
 
+tag = POSTag.VB
+print(tag)                          # VB
+print(POSTag.get_description(tag))  # Verb, base form
+print(int(tag))                     # 19 (unique tag id)
+print(tag == POSTag.NN)             # False
 ```
-CEFR statistic (unique words):
-A1: 77
-A2: 33
-B1: 23
-B2: 11
-C1: 2
-C2: 6
+
+### CEFRLevel usage examples
+
+```py
+from cefrpy import CEFRLevel
+
+level = CEFRLevel.A1
+print(level)            # A1
+print(int(level))       # 1
+
+level2 = CEFRLevel.C2
+print(level2)           # C2
+print(int(level2))      # 6
+
+# You can perform any comparisons:
+print(level2 > level)   # True
+print(level2 == level)  # False
+
+print(CEFRLevel.from_str("B1") == CEFRLevel.B1) # True
+print(CEFRLevel.from_str("B1") == CEFRLevel(3)) # True
 ```
 
-4. Get set of not found CEFR levels for words in text:
+### CEFRAnalyzer generator (yield) methods
+
+For every example you should import and initialize `CEFRAnalyzer`:
 
 ```py
-def get_not_found_words(level_tokens: list[tuple[str, str, bool, float, int, int]]) -> set[str]:
-    not_found_words = set()
-    for token in level_tokens:
-        if token[2]:
-            continue
+from cefrpy import CEFRAnalyzer
 
-        if not token[3]:
-            not_found_words.add(token[0])
+analyzer = CEFRAnalyzer()
+```
 
-    return not_found_words
+#### Iterating over words with a specific length (alphabetical order)
 
+```py
+iteration_limit = 10
+word_list = []
+for word in analyzer.yield_words_with_length(6):
+    if iteration_limit == 0:
+        break
+    word_list.append(word)
+    iteration_limit -= 1
+
+# ['aaberg', 'aachen', 'aahing', 'aargau', 'aarhus', 'abacus', 'abadan', 'abadia', 'abakan', 'abaris']
+print(word_list)
+```
 
-not_found_words_set = get_not_found_words(tokens)
-not_found_words_list = list(not_found_words_set)
-not_found_words_list.sort()
+#### Iterating over words with a specific length (reversed alphabetical order)
 
-print('Not found words:', len(not_found_words_list))
-if len(not_found_words_list):
-    print('\n'.join(not_found_words_list))
+```py
+iteration_limit = 10
+word_list = []
+for word in analyzer.yield_words_with_length(6, reverse_order=True):
+    if iteration_limit == 0:
+        break
+    word_list.append(word)
+    iteration_limit -= 1
+
+# ['zymase', 'zygote', 'zygoma', 'zydeco', 'zwolle', 'zwicky', 'zuzana', 'zusman', 'zurvan', 'zurich']
+print(word_list)
 ```
 
+#### Iterating over words in alphabetical order
+
+```py
+iteration_limit = 10
+word_list = []
+for word in analyzer.yield_words():
+    if iteration_limit == 0:
+        break
+    word_list.append(word)
+    iteration_limit -= 1
+
+# ['a', 'aa', 'aaa', 'aaaa', 'aaas', 'aaberg', 'aachen', 'aae', 'aaee', 'aaf']
+print(word_list)
+```
+
+#### Iterating over words with their pos in alphabetical order with word length priority ascending
+
+```py
+iteration_limit = 6
+word_pos_list = []
+for word, pos_tag in analyzer.yield_word_pos(word_length_sort=True):
+    if iteration_limit == 0:
+        break
+    word_pos_list.append((word, pos_tag))
+    iteration_limit -= 1
+
+# [('a', <POSTag.DT: 2>), ('a', <POSTag.IN: 3>), ('a', <POSTag.JJ: 4>), ('a', <POSTag.NN: 8>), ('a', <POSTag.VB: 19>), ('b', <POSTag.JJ: 4>)]
+print(word_pos_list)
 ```
-Not found words: 0
+
+#### Iterating over words with their pos as str and levels as float in reversed alphabetical order with word length priority descending
+
+```py
+iteration_limit = 3
+word_pos_list = []
+for word, pos_tag, level in analyzer.yield_word_pos_level(word_length_sort=True, reverse_order=True, pos_tag_as_string=True, word_level_as_float=True):
+    if iteration_limit == 0:
+        break
+    word_pos_list.append((word, pos_tag, level))
+    iteration_limit -= 1
+
+# [('demethylchlortetracycline', 'NN', 6.0), ('electrocardiographically', 'RB', 6.0), ('polytetrafluoroethylene', 'NN', 6.0)]
+print(word_pos_list)
 ```
 
 ## License
+
 This project is licensed under the MIT License - see the LICENSE file for details.
 
 ## Acknowledgments
+
 I would like to acknowledge the contributions of the following resources. I used them to create my initial SQLite version [Words-CEFR-Dataset](https://github.com/Maximax67/Words-CEFR-Dataset):
+
 - [Spacy](https://spacy.io/)
 - [CEFR-J](https://cefr-j.org/)
 - [LemmInflect](https://github.com/bjascob/LemmInflect)
@@ -334,8 +513,8 @@ I would like to acknowledge the contributions of the following resources. I used
 - [List of pos tags form Penn Treebank Project](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html)
 
 Also I used these resources to create my [valid English words list](https://github.com/Maximax67/English-Valid-Words):
+
 - [Word list by infochimps (archived)](https://web.archive.org/web/20131118073324/https://www.infochimps.com/datasets/word-list-350000-simple-english-words-excel-readable)
 - [English words github repo by dwyl](https://github.com/dwyl/english-words)
 - [NLTK (Natural Language Toolkit)](https://www.nltk.org/)
 - [WordNet](https://wordnet.princeton.edu/)
-
diff --git a/docs/cefrpy.rst b/docs/cefrpy.rst
index 15388e0..0331a2a 100644
--- a/docs/cefrpy.rst
+++ b/docs/cefrpy.rst
@@ -67,4 +67,4 @@ Module contents
    :members:
    :undoc-members:
    :show-inheritance:
-   :no-idex:
+   :no-index:
diff --git a/docs/docs.md b/docs/docs.md
index 6695048..c0d2136 100644
--- a/docs/docs.md
+++ b/docs/docs.md
@@ -1,7 +1,17 @@
 # About cefrpy
 
+![PyPI - License](https://img.shields.io/pypi/l/cefrpy)
+![PyPI - Version](https://img.shields.io/pypi/v/cefrpy)
+![PyPI - Downloads](https://img.shields.io/pypi/dm/cefrpy)
+[![Hugging Face Space](https://img.shields.io/badge/Hugging%20Face-Space-4CA6A7?logo=huggingface&style=flat)](https://huggingface.co/spaces/Maximax67/cefrpy-demo)
+[![Unit Tests](https://github.com/Maximax67/cefrpy/actions/workflows/unittest.yml/badge.svg)](https://github.com/Maximax67/cefrpy/actions/workflows/unittest.yml)
+
 The cefrpy python module offers a comprehensive toolkit for analyzing linguistic data based on the Common European Framework of Reference for Languages (CEFR).
 
+Documentation: <https://maximax67.github.io/cefrpy/>
+
+HuggingFace demo: <https://huggingface.co/spaces/Maximax67/cefrpy-demo>
+
 # Installation
 
 You can install `cefrpy` for Python >= 3.6 via pip:
@@ -36,7 +46,7 @@ else:
     print(f"CEFR level not found for '{word}' as a {pos_tag}.")
 ```
 
-## Getting average level of a word:
+## Getting average level of a word
 
 ```py
 from cefrpy import CEFRAnalyzer
@@ -61,7 +71,8 @@ else:
 
 ## Recommended usage with [spaCy](https://spacy.io)
 
-### Import spacy and load model:
+### Import spacy and load model
+
 ```py
 import spacy
 
@@ -136,179 +147,179 @@ for token in tokens:
 
 Result (truncated):
 
-```
+```text
 -------------------------------------------------------
- WORD                      	POS	LEVEL	CEFR
+ WORD                       POS LEVEL CEFR
 -------------------------------------------------------
-                          	_SP	Skip	None
-In                        	IN	1.00	A1
-the                       	DT	1.00	A1
-heart                     	NN	1.00	A1
-of                        	IN	1.00	A1
-every                     	DT	1.00	A1
-forest                    	NN	2.00	A2
-,                         	,	Skip	None
-a                         	DT	1.00	A1
-hidden                    	JJ	3.00	B1
-world                     	NN	1.00	A1
-thrives                   	VBZ	5.86	C2
-among                     	IN	2.00	A2
-the                       	DT	1.00	A1
-towering                  	VBG	1.00	A1
-trees                     	NNS	1.00	A1
-.                         	.	Skip	None
-Trees                     	NNS	1.00	A1
-,                         	,	Skip	None
-                          	_SP	Skip	None
-those                     	DT	1.00	A1
-silent                    	JJ	3.00	B1
+                           _SP Skip None
+In                         IN 1.00 A1
+the                        DT 1.00 A1
+heart                      NN 1.00 A1
+of                         IN 1.00 A1
+every                      DT 1.00 A1
+forest                     NN 2.00 A2
+,                          , Skip None
+a                          DT 1.00 A1
+hidden                     JJ 3.00 B1
+world                      NN 1.00 A1
+thrives                    VBZ 5.86 C2
+among                      IN 2.00 A2
+the                        DT 1.00 A1
+towering                   VBG 1.00 A1
+trees                      NNS 1.00 A1
+.                          . Skip None
+Trees                      NNS 1.00 A1
+,                          , Skip None
+                           _SP Skip None
+those                      DT 1.00 A1
+silent                     JJ 3.00 B1
 ```
 
 ### Get more statistical information
 
 1. Filter tokens by level:
 
-```py
-def filter_for_desired_level(level_tokens: list[tuple[str, str, bool, float, int, int]],
-                            min_level: float | int = 1.0, max_level: float | int = 6.0
-                            ) -> set[tuple[str, str, bool, float, int, int]]:
-    filtered_tokens = set()
-    for token in level_tokens:
-        level = token[3]
-
-        if level and level >= min_level and level <= max_level:
-            filtered_tokens.add(token)
-
-    return filtered_tokens
-
-
-# You can also set min/max level as an int or float in range from 1 to 6
-desired_min_level = CEFRLevel.C1
-desired_level_words_set = filter_for_desired_level(tokens, min_level=int(desired_min_level))
-
-desired_level_words_list = list(desired_level_words_set)
-desired_level_words_list.sort()
-
-print(f'\tWords with level {desired_min_level} and higher: {len(desired_level_words_list)}')
-for word_data in desired_level_words_list:
-    word, pos, _, level, _, _ = word_data
-    print(f"{word.ljust(26)} {pos.ljust(6)} {'{:.2f}'.format(level).ljust(6)} {CEFRLevel(round(level))}")
-```
-
-```
-Words with level B2 and higher: 16
-benefactors                NNS    6.00   C2
-bristlecone                NN     6.00   C2
-evolved                    VBN    4.00   B2
-fungi                      NNS    5.20   C1
-living                     NN     4.00   B2
-longevity                  NN     5.96   C2
-masters                    NNS    4.00   B2
-mighty                     JJ     4.00   B2
-observers                  NNS    4.00   B2
-pines                      NNS    4.00   B2
-potential                  JJ     4.00   B2
-sequoias                   NNS    6.00   C2
-thrives                    VBZ    5.86   C2
-underground                RB     4.00   B2
-wildfires                  NNS    6.00   C2
-withstand                  VB     5.12   C1
-```
+    ```py
+    def filter_for_desired_level(level_tokens: list[tuple[str, str, bool, float, int, int]],
+                                min_level: float | int = 1.0, max_level: float | int = 6.0
+                                ) -> set[tuple[str, str, bool, float, int, int]]:
+        filtered_tokens = set()
+        for token in level_tokens:
+            level = token[3]
+
+            if level and level >= min_level and level <= max_level:
+                filtered_tokens.add(token)
+
+        return filtered_tokens
+
+
+    # You can also set min/max level as an int or float in range from 1 to 6
+    desired_min_level = CEFRLevel.B2
+    desired_level_words_set = filter_for_desired_level(tokens, min_level=int(desired_min_level))
+
+    desired_level_words_list = list(desired_level_words_set)
+    desired_level_words_list.sort()
+
+    print(f'\tWords with level {desired_min_level} and higher: {len(desired_level_words_list)}')
+    for word_data in desired_level_words_list:
+        word, pos, _, level, _, _ = word_data
+        print(f"{word.ljust(26)} {pos.ljust(6)} {'{:.2f}'.format(level).ljust(6)} {CEFRLevel(round(level))}")
+    ```
+
+    ```text
+    Words with level B2 and higher: 16
+    benefactors                NNS    6.00   C2
+    bristlecone                NN     6.00   C2
+    evolved                    VBN    4.00   B2
+    fungi                      NNS    5.20   C1
+    living                     NN     4.00   B2
+    longevity                  NN     5.96   C2
+    masters                    NNS    4.00   B2
+    mighty                     JJ     4.00   B2
+    observers                  NNS    4.00   B2
+    pines                      NNS    4.00   B2
+    potential                  JJ     4.00   B2
+    sequoias                   NNS    6.00   C2
+    thrives                    VBZ    5.86   C2
+    underground                RB     4.00   B2
+    wildfires                  NNS    6.00   C2
+    withstand                  VB     5.12   C1
+    ```
 
 2. Get CEFR statistic of the text:
 
-```py
-def get_word_level_count_statistic(level_tokens: list[tuple[str, str, bool, float, int, int]]) -> list[int]:
-    difficulty_levels_count = [0] * 6
-    for token in level_tokens:
-        level = token[3]
-        if not level:
-            continue
-
-        level_round = round(level)
-        difficulty_levels_count[level_round - 1] += 1
-
-    return difficulty_levels_count
-
-difficulty_levels_count = get_word_level_count_statistic(tokens)
-print('CEFR statistic (total words):')
-for i in range(1, 7):
-    print(f'{CEFRLevel(i)}: {difficulty_levels_count[i - 1]}')
-```
+    ```py
+    def get_word_level_count_statistic(level_tokens: list[tuple[str, str, bool, float, int, int]]) -> list[int]:
+        difficulty_levels_count = [0] * 6
+        for token in level_tokens:
+            level = token[3]
+            if not level:
+                continue
 
-```
-CEFR statistic (total words):
-A1: 136
-A2: 36
-B1: 27
-B2: 11
-C1: 2
-C2: 6
-```
-
-3. Get CEFR statistic for unique words in the text:
-
-```py
-def get_word_level_count_statistic_unique(level_tokens: list[tuple[str, str, bool, float, int, int]]) -> list[int]:
-    processed_word_pos_set = set()
-    difficulty_levels_count = [0] * 6
-    for token in level_tokens:
-        level = token[3]
-        if not level:
-            continue
-
-        to_check_tuple = (token[0], token[1])
-        if not to_check_tuple in processed_word_pos_set:
-            level_round = round(token[3])
+            level_round = round(level)
             difficulty_levels_count[level_round - 1] += 1
-            processed_word_pos_set.add(to_check_tuple)
 
-    return difficulty_levels_count
+        return difficulty_levels_count
 
+    difficulty_levels_count = get_word_level_count_statistic(tokens)
+    print('CEFR statistic (total words):')
+    for i in range(1, 7):
+        print(f'{CEFRLevel(i)}: {difficulty_levels_count[i - 1]}')
+    ```
 
-difficulty_levels_count_unique = get_word_level_count_statistic_unique(tokens)
-print('CEFR statistic (unique words):')
-for i in range(1, 7):
-    print(f'{CEFRLevel(i)}: {difficulty_levels_count_unique[i - 1]}')
-```
+    ```text
+    CEFR statistic (total words):
+    A1: 136
+    A2: 36
+    B1: 27
+    B2: 11
+    C1: 2
+    C2: 6
+    ```
 
-```
-CEFR statistic (unique words):
-A1: 77
-A2: 33
-B1: 23
-B2: 11
-C1: 2
-C2: 6
-```
+3. Get CEFR statistic for unique words in the text:
+
+    ```py
+    def get_word_level_count_statistic_unique(level_tokens: list[tuple[str, str, bool, float, int, int]]) -> list[int]:
+        processed_word_pos_set = set()
+        difficulty_levels_count = [0] * 6
+        for token in level_tokens:
+            level = token[3]
+            if not level:
+                continue
+
+            to_check_tuple = (token[0], token[1])
+            if not to_check_tuple in processed_word_pos_set:
+                level_round = round(token[3])
+                difficulty_levels_count[level_round - 1] += 1
+                processed_word_pos_set.add(to_check_tuple)
+
+        return difficulty_levels_count
+
+
+    difficulty_levels_count_unique = get_word_level_count_statistic_unique(tokens)
+    print('CEFR statistic (unique words):')
+    for i in range(1, 7):
+        print(f'{CEFRLevel(i)}: {difficulty_levels_count_unique[i - 1]}')
+    ```
+
+    ```text
+    CEFR statistic (unique words):
+    A1: 77
+    A2: 33
+    B1: 23
+    B2: 11
+    C1: 2
+    C2: 6
+    ```
 
 4. Get set of not found CEFR levels for words in text:
 
-```py
-def get_not_found_words(level_tokens: list[tuple[str, str, bool, float, int, int]]) -> set[str]:
-    not_found_words = set()
-    for token in level_tokens:
-        if token[2]:
-            continue
+    ```py
+    def get_not_found_words(level_tokens: list[tuple[str, str, bool, float, int, int]]) -> set[str]:
+        not_found_words = set()
+        for token in level_tokens:
+            if token[2]:
+                continue
 
-        if not token[3]:
-            not_found_words.add(token[0])
+            if not token[3]:
+                not_found_words.add(token[0])
 
-    return not_found_words
+        return not_found_words
 
 
-not_found_words_set = get_not_found_words(tokens)
-not_found_words_list = list(not_found_words_set)
-not_found_words_list.sort()
+    not_found_words_set = get_not_found_words(tokens)
+    not_found_words_list = list(not_found_words_set)
+    not_found_words_list.sort()
 
-print('Not found words:', len(not_found_words_list))
-if len(not_found_words_list):
-    print('\n'.join(not_found_words_list))
-```
+    print('Not found words:', len(not_found_words_list))
+    if len(not_found_words_list):
+        print('\n'.join(not_found_words_list))
+    ```
 
-```
-Not found words: 0
-```
+    ```text
+    Not found words: 0
+    ```
 
 # Additional features
 
@@ -329,7 +340,7 @@ print(analyzer.get_pos_level_dict_for_word("test"))
 print(analyzer.get_pos_level_dict_for_word("test", pos_tag_as_string=True, word_level_as_float=True))
 ```
 
-## Checking if a word exists in the database
+### Checking if a word exists in the database
 
 ```py
 from cefrpy import CEFRAnalyzer
@@ -411,6 +422,7 @@ analyzer = CEFRAnalyzer()
 ```
 
 ### Iterating over words with a specific length (alphabetical order)
+
 ```py
 iteration_limit = 10
 word_list = []
@@ -469,7 +481,7 @@ for word, pos_tag in analyzer.yield_word_pos(word_length_sort=True):
 print(word_pos_list)
 ```
 
-### Iterating over words with their pos as str and levels as float in reversed alphabetical order with word length priority descending 
+### Iterating over words with their pos as str and levels as float in reversed alphabetical order with word length priority descending
 
 ```py
 iteration_limit = 3
@@ -484,12 +496,14 @@ for word, pos_tag, level in analyzer.yield_word_pos_level(word_length_sort=True,
 print(word_pos_list)
 ```
 
-
 # License
+
 This project is licensed under the MIT License - see the LICENSE file for details.
 
 # Acknowledgments
+
 I would like to acknowledge the contributions of the following resources. I used them to create my initial SQLite version [Words-CEFR-Dataset](https://github.com/Maximax67/Words-CEFR-Dataset):
+
 - [Spacy](https://spacy.io/)
 - [CEFR-J](https://cefr-j.org/)
 - [LemmInflect](https://github.com/bjascob/LemmInflect)
@@ -497,6 +511,7 @@ I would like to acknowledge the contributions of the following resources. I used
 - [List of pos tags form Penn Treebank Project](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html)
 
 Also I used these resources to create my [valid English words list](https://github.com/Maximax67/English-Valid-Words):
+
 - [Word list by infochimps (archived)](https://web.archive.org/web/20131118073324/https://www.infochimps.com/datasets/word-list-350000-simple-english-words-excel-readable)
 - [English words github repo by dwyl](https://github.com/dwyl/english-words)
 - [NLTK (Natural Language Toolkit)](https://www.nltk.org/)