diff --git a/.gitignore b/.gitignore index dd2275c..82eb24f 100644 --- a/.gitignore +++ b/.gitignore @@ -53,4 +53,4 @@ build/ docs/out/ # Extensions -.VSCodeCounter \ No newline at end of file +.VSCodeCounter diff --git a/Makefile b/Makefile index 0eda490..1af5644 100644 --- a/Makefile +++ b/Makefile @@ -75,7 +75,7 @@ verify-package-all: python3.13 -m unittest tests.test_package_installation -v dev-dependencies: - pip install -e ".[dev]" + pip install -e ".[dev,corpus-spanish,corpus-all]" hooks: pre-commit install diff --git a/README.md b/README.md index 57dc57b..a44e170 100644 --- a/README.md +++ b/README.md @@ -768,4 +768,4 @@ For questions, issues, or feature requests: ## Production Usage -LLMShield is used in production environments by [brainful.ai](https://brainful.ai) to protect user data confidentiality. \ No newline at end of file +LLMShield is used in production environments by [brainful.ai](https://brainful.ai) to protect user data confidentiality. diff --git a/llmshield/cache/entity_cache.py b/llmshield/cache/entity_cache.py index 581c438..451e367 100644 --- a/llmshield/cache/entity_cache.py +++ b/llmshield/cache/entity_cache.py @@ -19,6 +19,8 @@ import threading +from llmshield.cache.loader import loader + from ..error_handling import safe_resource_load @@ -54,6 +56,7 @@ def __init__(self) -> None: self._countries: frozenset[str] | None = None self._organisations: frozenset[str] | None = None self._english_corpus: frozenset[str] | None = None + self._composite_corpus: frozenset[str] | None = None self._initialized = True @property @@ -92,6 +95,17 @@ def english_corpus(self) -> frozenset[str]: self._english_corpus = self._load_english_corpus() return self._english_corpus + @property + def composite_corpus(self) -> frozenset[str]: + """Get the corpus from the other languages.""" + if self._composite_corpus is None: + with self._lock: + if self._composite_corpus is None: + self._composite_corpus = self._load_all_corpuses() + return ( + self._composite_corpus if self._composite_corpus else frozenset() + ) + def get_all_places(self) -> frozenset[str]: """Get combined cities and countries set.""" return self.cities | self.countries @@ -108,6 +122,10 @@ def is_english_word(self, text_lower: str) -> bool: """O(1) lookup for English words.""" return text_lower in self.english_corpus + def is_foreign_word(self, text_lower: str) -> bool: + """O(1) lookup for foreign words in the composite corpus.""" + return text_lower in self.composite_corpus + def preload_all(self) -> None: """Preload all dictionaries for optimal performance.""" with self._lock: @@ -127,6 +145,8 @@ def get_memory_stats(self) -> dict[str, int]: stats["organisations"] = len(self._organisations) if self._english_corpus is not None: stats["english_corpus"] = len(self._english_corpus) + if self._composite_corpus is not None: + stats["composite_corpus"] = len(self._composite_corpus) return stats def _load_cities(self) -> frozenset[str]: @@ -145,6 +165,27 @@ def _load_english_corpus(self) -> frozenset[str]: """Load English corpus from resource file.""" return self._load_dict_file("corpus/english.txt") + @staticmethod + def _load_all_corpuses() -> frozenset[str] | None: + """Load all corpuses containing all of the words from the additional non-English language packs.""" # noqa: E501 + corpus_frozenset: frozenset[str] = frozenset() + + for lang in loader.get_available_languages(): + try: + imported_frozenset = frozenset( + entry.lower() + for entry in safe_resource_load( + "llmshield_" + lang + "_corpus", + f"data/{lang}.txt", + f"Loading {lang} corpus", + ) + ) + corpus_frozenset = corpus_frozenset | imported_frozenset + except ImportError: + continue + + return corpus_frozenset if corpus_frozenset else None + # skipcq: PYL-R0201 def _load_dict_file(self, filename: str) -> frozenset[str]: """Load and process dictionary files. diff --git a/llmshield/cache/loader/__init__.py b/llmshield/cache/loader/__init__.py new file mode 100644 index 0000000..acada72 --- /dev/null +++ b/llmshield/cache/loader/__init__.py @@ -0,0 +1 @@ +""" "Corpus loader that works with separate language packages.""" # noqa: D210 diff --git a/llmshield/cache/loader/loader.py b/llmshield/cache/loader/loader.py new file mode 100644 index 0000000..7cc7344 --- /dev/null +++ b/llmshield/cache/loader/loader.py @@ -0,0 +1,23 @@ +"""Corpus loader that works with separate language packages.""" + +import importlib + + +def get_available_languages() -> list[str]: + """Get list of available language corpus packages. + + Returns: + List of available language codes + + """ + available = [] + common_languages = ["spanish"] + + for lang in common_languages: + try: + importlib.import_module(f"llmshield_{lang}_corpus") + available.append(lang) + except ImportError: + continue + + return available diff --git a/llmshield/entity_detector.py b/llmshield/entity_detector.py index 92afe6f..7b4dc30 100644 --- a/llmshield/entity_detector.py +++ b/llmshield/entity_detector.py @@ -526,6 +526,7 @@ def _is_person(self, p_noun: str) -> bool: if ( not clean_word[0].isupper() or self.cache.is_english_word(clean_word.lower()) + or self.cache.is_foreign_word(clean_word.lower()) or any(c.isdigit() for c in clean_word) ): return False diff --git a/llmshield/matchers/dicts/corpus/english.txt b/llmshield/matchers/dicts/corpus/english.txt index 7d8b854..7b2f54d 100644 --- a/llmshield/matchers/dicts/corpus/english.txt +++ b/llmshield/matchers/dicts/corpus/english.txt @@ -19997,4 +19997,4 @@ cpan plotting yan succeeding -bizjournalshire \ No newline at end of file +bizjournalshire diff --git a/pyproject.toml b/pyproject.toml index bfb829b..49d9817 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,7 +52,6 @@ packages = [ ".mypy_cache/*", "build/*", "dist/*", - "*.egg-info", "Makefile", ] @@ -108,3 +107,5 @@ dev = [ "openai>=1.83.0", # For testing with OpenAI-compatible providers "twine>=6.1.0", # Package uploading to PyPI ] +corpus-spanish = ["llmshield-spanish-corpus"] +corpus-all = ["llmshield-spanish-corpus"] # Should be updated when we expand to more languages. diff --git a/scripts/test_pypi_upload_with_token.sh b/scripts/test_pypi_upload_with_token.sh index 1edec10..17e152c 100755 --- a/scripts/test_pypi_upload_with_token.sh +++ b/scripts/test_pypi_upload_with_token.sh @@ -57,4 +57,4 @@ if [ $? -eq 0 ]; then else echo echo "❌ Upload failed. Please check your token and try again." -fi \ No newline at end of file +fi diff --git a/tests/test_core.py b/tests/test_core.py index 7ea9944..0d209d6 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -28,7 +28,7 @@ from unittest.mock import patch # Third party Imports -from parameterized import parameterized +from parameterized import parameterized # type: ignore # Local Imports from llmshield import LLMShield diff --git a/tests/test_multiple_languages.py b/tests/test_multiple_languages.py new file mode 100644 index 0000000..44f903d --- /dev/null +++ b/tests/test_multiple_languages.py @@ -0,0 +1,39 @@ +"""Test cases for language corpus loading. + +Makes sure the necessary language +packages are installed and loaded. +""" + +import importlib +import unittest + +from llmshield.cache.loader.loader import get_available_languages + +SPANISH_LANGUAGE_DEPENDENCY_PACKAGE_NAME = "llmshield_spanish_corpus" + + +class TestLanguageCorpus(unittest.TestCase): + """Test cases for language corpus loading. + + Args: + unittest (module): The unittest module. + + """ + + def test_spanish_corpus_installed(self): + """Test if the Spanish corpus dependency is installed.""" + try: + importlib.import_module(SPANISH_LANGUAGE_DEPENDENCY_PACKAGE_NAME) + except ImportError: + self.fail("Spanish corpus dependency is not installed.") + + def test_spanish_corpus_loaded(self): + """Test if the Spanish corpus is available in the loader.""" + available_languages = get_available_languages() + self.assertIn( + "spanish", available_languages, "Spanish corpus is not loaded." + ) + + +if __name__ == "__main__": + unittest.main()