Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -53,4 +53,4 @@ build/
docs/out/

# Extensions
.VSCodeCounter
.VSCodeCounter
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ verify-package-all:
python3.13 -m unittest tests.test_package_installation -v

dev-dependencies:
pip install -e ".[dev]"
pip install -e ".[dev,corpus-spanish,corpus-all]"

hooks:
pre-commit install
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -768,4 +768,4 @@ For questions, issues, or feature requests:

## Production Usage

LLMShield is used in production environments by [brainful.ai](https://brainful.ai) to protect user data confidentiality.
LLMShield is used in production environments by [brainful.ai](https://brainful.ai) to protect user data confidentiality.
41 changes: 41 additions & 0 deletions llmshield/cache/entity_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@

import threading

from llmshield.cache.loader import loader

from ..error_handling import safe_resource_load


Expand Down Expand Up @@ -54,6 +56,7 @@ def __init__(self) -> None:
self._countries: frozenset[str] | None = None
self._organisations: frozenset[str] | None = None
self._english_corpus: frozenset[str] | None = None
self._composite_corpus: frozenset[str] | None = None
self._initialized = True

@property
Expand Down Expand Up @@ -92,6 +95,17 @@ def english_corpus(self) -> frozenset[str]:
self._english_corpus = self._load_english_corpus()
return self._english_corpus

@property
def composite_corpus(self) -> frozenset[str]:
    """Get the combined word set of the additional language corpora.

    The set is loaded on first access via ``_load_all_corpuses()`` and
    then kept on the instance. The loader returns ``None`` when it found
    no words (for example, no language pack is installed); that result is
    stored as an empty frozenset so that later accesses do not take the
    lock and repeat the load attempt every time.

    Returns:
        The cached word set; empty when nothing could be loaded.

    """
    if self._composite_corpus is None:
        with self._lock:
            # Re-check under the lock: another thread may have finished
            # loading while this one was waiting.
            if self._composite_corpus is None:
                loaded = self._load_all_corpuses()
                # Never leave the cache slot as None after a load attempt,
                # otherwise the "is None" test above stays true forever.
                self._composite_corpus = (
                    loaded if loaded is not None else frozenset()
                )
    return self._composite_corpus

def get_all_places(self) -> frozenset[str]:
    """Return the union of the cached city and country sets."""
    cities = self.cities
    countries = self.countries
    return cities | countries
Expand All @@ -108,6 +122,10 @@ def is_english_word(self, text_lower: str) -> bool:
"""O(1) lookup for English words."""
return text_lower in self.english_corpus

def is_foreign_word(self, text_lower: str) -> bool:
    """Check whether a word is present in the composite corpus.

    Args:
        text_lower: The word to look up. It is compared as given; the
            parameter name suggests callers pass it already lower-cased.

    Returns:
        True if ``text_lower`` is a member of ``self.composite_corpus``.

    """
    foreign_words = self.composite_corpus
    return text_lower in foreign_words

def preload_all(self) -> None:
"""Preload all dictionaries for optimal performance."""
with self._lock:
Expand All @@ -127,6 +145,8 @@ def get_memory_stats(self) -> dict[str, int]:
stats["organisations"] = len(self._organisations)
if self._english_corpus is not None:
stats["english_corpus"] = len(self._english_corpus)
if self._composite_corpus is not None:
stats["composite_corpus"] = len(self._composite_corpus)
return stats

def _load_cities(self) -> frozenset[str]:
Expand All @@ -145,6 +165,27 @@ def _load_english_corpus(self) -> frozenset[str]:
"""Load English corpus from resource file."""
return self._load_dict_file("corpus/english.txt")

@staticmethod
def _load_all_corpuses() -> frozenset[str] | None:
    """Load the words of every available non-English language pack.

    For each language reported by ``loader.get_available_languages()``
    the file ``data/<lang>.txt`` is read from the package
    ``llmshield_<lang>_corpus`` through ``safe_resource_load``; every
    entry is lower-cased and merged into one set. A language whose load
    raises ``ImportError`` is skipped and contributes nothing.

    Returns:
        The merged word set, or ``None`` if no words were loaded.

    """
    merged: set[str] = set()

    for lang in loader.get_available_languages():
        package_name = f"llmshield_{lang}_corpus"
        try:
            # Build the language's set completely before merging, so a
            # failure part-way through leaves ``merged`` untouched.
            words = {
                entry.lower()
                for entry in safe_resource_load(
                    package_name,
                    f"data/{lang}.txt",
                    f"Loading {lang} corpus",
                )
            }
        except ImportError:
            continue
        merged |= words

    if not merged:
        return None
    return frozenset(merged)

# skipcq: PYL-R0201
def _load_dict_file(self, filename: str) -> frozenset[str]:
"""Load and process dictionary files.
Expand Down
1 change: 1 addition & 0 deletions llmshield/cache/loader/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
""" "Corpus loader that works with separate language packages.""" # noqa: D210
23 changes: 23 additions & 0 deletions llmshield/cache/loader/loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
"""Corpus loader that works with separate language packages."""

import importlib


def get_available_languages() -> list[str]:
"""Get list of available language corpus packages.

Returns:
List of available language codes

"""
available = []
common_languages = ["spanish"]

for lang in common_languages:
try:
importlib.import_module(f"llmshield_{lang}_corpus")
available.append(lang)
except ImportError:
continue

return available
1 change: 1 addition & 0 deletions llmshield/entity_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -526,6 +526,7 @@ def _is_person(self, p_noun: str) -> bool:
if (
not clean_word[0].isupper()
or self.cache.is_english_word(clean_word.lower())
or self.cache.is_foreign_word(clean_word.lower())
or any(c.isdigit() for c in clean_word)
):
return False
Expand Down
2 changes: 1 addition & 1 deletion llmshield/matchers/dicts/corpus/english.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19997,4 +19997,4 @@ cpan
plotting
yan
succeeding
bizjournalshire
bizjournalshire
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@ packages = [
".mypy_cache/*",
"build/*",
"dist/*",
"*.egg-info",
"Makefile",
]

Expand Down Expand Up @@ -108,3 +107,5 @@ dev = [
"openai>=1.83.0", # For testing with OpenAI-compatible providers
"twine>=6.1.0", # Package uploading to PyPI
]
corpus-spanish = ["llmshield-spanish-corpus"]
corpus-all = ["llmshield-spanish-corpus"] # Should be updated when we expand to more languages.
2 changes: 1 addition & 1 deletion scripts/test_pypi_upload_with_token.sh
Original file line number Diff line number Diff line change
Expand Up @@ -57,4 +57,4 @@ if [ $? -eq 0 ]; then
else
echo
echo "❌ Upload failed. Please check your token and try again."
fi
fi
2 changes: 1 addition & 1 deletion tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
from unittest.mock import patch

# Third party Imports
from parameterized import parameterized
from parameterized import parameterized # type: ignore

# Local Imports
from llmshield import LLMShield
Expand Down
39 changes: 39 additions & 0 deletions tests/test_multiple_languages.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
"""Test cases for language corpus loading.

Makes sure the necessary language
packages are installed and loaded.
"""

import importlib
import unittest

from llmshield.cache.loader.loader import get_available_languages

SPANISH_LANGUAGE_DEPENDENCY_PACKAGE_NAME = "llmshield_spanish_corpus"


class TestLanguageCorpus(unittest.TestCase):
    """Checks that the Spanish corpus package is installed and detected."""

    def test_spanish_corpus_installed(self):
        """The Spanish corpus package must be importable."""
        package = SPANISH_LANGUAGE_DEPENDENCY_PACKAGE_NAME
        try:
            importlib.import_module(package)
        except ImportError:
            self.fail("Spanish corpus dependency is not installed.")

    def test_spanish_corpus_loaded(self):
        """The loader must report Spanish as an available language."""
        self.assertIn(
            "spanish",
            get_available_languages(),
            "Spanish corpus is not loaded.",
        )


if __name__ == "__main__":
unittest.main()
Loading