Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 44 additions & 67 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ xx_sent_ud_sm = {url = "https://github.com/explosion/spacy-models/releases/downl
lingua-language-detector = "^2.1.1"
psycopg2-binary = "^2.9.10"
brotli = "^1.2.0"
scikit-learn = "~=1.6.1"
scikit-learn = "~=1.7.0"
optimum = {extras = ["onnxruntime"], version = "^2.1.0"}
azure-storage-blob = "^12.28.0"
welearn-database = "^1.3.0"
Expand Down
14 changes: 8 additions & 6 deletions tests/document_classifier/test_sdgs_classifiers.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def test_should_classify_slices_n_with_force_sdg(self, mock_load):
self.assertEqual(result.sdg_number, 4)

@patch("joblib.load")
def test_should_not_classify_slices_n_with_force_sdg(self, mock_load):
def test_should_classify_slices_n(self, mock_load):
mock_load.return_value.predict_proba.return_value = [
numpy.array(
[0.3, 0.2, 0.99, 0.562, 0.2, 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Expand All @@ -54,17 +54,18 @@ def test_should_not_classify_slices_n_with_force_sdg(self, mock_load):
result = n_classify_slice(
slice,
"model_name",
forced_sdg=[1, 11, 12],
n_classifier_id=uuid.uuid4(),
bi_classifier_id=uuid.uuid4(),
)
self.assertEqual(result, None)
self.assertEqual(result.sdg_number, 3)
self.assertIsNotNone(result.bi_classifier_model_id)
self.assertIsNone(result.n_classifier_model_id)

@patch("joblib.load")
def test_should_classify_slices_n(self, mock_load):
def test_should_classify_slices_n_with_forced_corpus(self, mock_load):
mock_load.return_value.predict_proba.return_value = [
numpy.array(
[0.3, 0.2, 0.99, 0.562, 0.2, 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0.3, 0.2, 0.3, 0.462, 0.2, 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
)
]
slice = DocumentSlice(id=1, embedding=b"\x00" * 128)
Expand All @@ -73,8 +74,9 @@ def test_should_classify_slices_n(self, mock_load):
"model_name",
n_classifier_id=uuid.uuid4(),
bi_classifier_id=uuid.uuid4(),
is_forced_corpus=True,
)
self.assertEqual(result.sdg_number, 3)
self.assertEqual(result.sdg_number, 4)
self.assertIsNotNone(result.bi_classifier_model_id)
self.assertIsNone(result.n_classifier_model_id)

Expand Down
2 changes: 2 additions & 0 deletions welearn_datastack/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,3 +355,5 @@
]

QDRANT_MULTI_LINGUAL_CODE = "mul"

FORCED_CORPUS_CLASSIFIED = ["uved"]
38 changes: 38 additions & 0 deletions welearn_datastack/modules/retrieve_data_from_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
URLRetrievalType,
WeighedScope,
)
from welearn_datastack.exceptions import NoModelFoundError
from welearn_datastack.types import QuerySizeLimitDocument, QuerySizeLimitSlice

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -416,3 +417,40 @@ def retrieve_slices_sdgs(
)

return {s[0]: s[1] for s in slices_sdgs}


def get_model_classification_model_by_id(
    db_session, model_id: UUID
) -> BiClassifierModel | NClassifierModel | EmbeddingModel:
    """
    Retrieve a model from the database by its ID, and return it as the correct type (BiClassifierModel, NClassifierModel or EmbeddingModel)

    The three model tables are probed in a fixed priority order
    (BiClassifierModel, then NClassifierModel, then EmbeddingModel); the
    first table containing the ID wins.

    :param db_session: Database session
    :param model_id: Model ID to retrieve
    :return: The model retrieved from the database, as the correct type
    :raises NoModelFoundError: If no model with this ID exists in any of the three tables
    """
    # Probe each table with the same query shape instead of repeating the
    # query/check stanza three times.
    for model_class in (BiClassifierModel, NClassifierModel, EmbeddingModel):
        model = (
            db_session.query(model_class)
            .filter(model_class.id == model_id)
            .first()
        )
        if model is not None:
            return model

    raise NoModelFoundError(
        f"Model not found in the database according this id : {model_id}"
    )
58 changes: 42 additions & 16 deletions welearn_datastack/modules/sdgs_classifiers.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,36 @@ def n_classify_slice(
bi_classifier_id: uuid.UUID,
n_classifier_id: uuid.UUID,
forced_sdg: None | list = None,
is_forced_corpus: bool = False,
) -> Sdg | None:
"""
n classifier for welearn sliced containers to classify them into one of the 17 SDGs
    :param classifier_model_name: The name of the classifier model, which is also the name of the file
:param _slice: Input of welearn sliced container
:param bi_classifier_id: The id of the bi-classifier model used to classify the slice as SDG or not, to keep track of the models used for classification
:param n_classifier_id: The id of the n-classifier model used to classify the slice into one of the 17 SDGs, to keep track of the models used for classification
:param forced_sdg: If not None, list of SDG numbers to force the classification on, if None, all SDGs are possible
    :param is_forced_corpus: If True, the classification is forced even if the bi-classifier does not classify the slice as SDG, to keep track of the corpora that are force-classified

:return: Sdg object if classified as one of the SDGs, None otherwise
:raises ValueError: If the embedding of the slice is not of type bytes
"""
# By default every SDGs are equally possible
is_forced_sdg_classif = bool(forced_sdg)
if not forced_sdg:
forced_sdg = [sdg_n + 1 for sdg_n in range(0, 17)]

# If there is only one forced sdg return it
if len(forced_sdg) == 1:
[sdg_number] = forced_sdg
return Sdg(
slice_id=_slice.id,
sdg_number=sdg_number,
id=uuid.uuid4(),
bi_classifier_model_id=bi_classifier_id,
n_classifier_model_id=n_classifier_id if not forced_sdg else None,
)
Comment on lines +68 to +82

logger.debug("Loading multiclass classifier model %s", classifier_model_name)
classifier_path = generate_ml_models_path(
model_type=MLModelsType.N_CLASSIFIER, model_name=classifier_model_name
Expand All @@ -78,19 +104,19 @@ def n_classify_slice(
]
proba_lst.sort(key=lambda x: x[1], reverse=True)

# If the score is superior to 0.5
sdg_number = proba_lst[0][0] if proba_lst[0][1] > 0.5 else None
if sdg_number:
logger.debug(
f"Slice {_slice.id} is labelized with SDG {proba_lst[0][0]} with {proba_lst[0][1]} score"
)
# Create Sdg object, associating it with the slice and classifiers except if forced_sdg is provided because
# in this case we assume classification was done outside the pipeline
return Sdg(
slice_id=_slice.id,
sdg_number=sdg_number,
id=uuid.uuid4(),
bi_classifier_model_id=bi_classifier_id,
n_classifier_model_id=n_classifier_id if not forced_sdg else None,
)
return None
best_sdg, best_score = proba_lst[0]

# If there is no forced SDG and no SDGs with more than 0.5 threshold
if not (is_forced_corpus or is_forced_sdg_classif) and best_score <= 0.5:
return None

logger.debug(
f"Slice {_slice.id} is labelized with SDG {best_sdg} with {best_score} score"
)
return Sdg(
slice_id=_slice.id,
sdg_number=best_sdg,
id=uuid.uuid4(),
bi_classifier_model_id=bi_classifier_id,
n_classifier_model_id=n_classifier_id if not forced_sdg else None,
)
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from welearn_database.data.enumeration import Step
from welearn_database.data.models import DocumentSlice, ProcessState, Sdg

from welearn_datastack.constants import FORCED_CORPUS_CLASSIFIED
from welearn_datastack.data.enumerations import MLModelsType
from welearn_datastack.modules.retrieve_data_from_database import retrieve_models
from welearn_datastack.modules.retrieve_data_from_files import retrieve_ids_from_csv
Expand Down Expand Up @@ -84,7 +85,11 @@ def main() -> None:
slices_per_docs, lambda x: x.document_id
):
doc_slices: List[DocumentSlice] = list(group_doc_slices) # type: ignore

corpus_name = doc_slices[0].document.corpus.source_name
is_forced_corpus = corpus_name in FORCED_CORPUS_CLASSIFIED
logger.info(
f"Classifying document {key_doc_id} from corpus {corpus_name} with {len(doc_slices)} slices"
)
bi_model_name = bi_model_by_docid.get(key_doc_id, dict()).get("model_name")
bi_model_id: UUID = bi_model_by_docid.get(key_doc_id, dict()).get("model_id")
if not bi_model_name and not isinstance(bi_model_name, str):
Expand Down Expand Up @@ -118,7 +123,17 @@ def main() -> None:
key_external_sdg in s.document.details
and s.document.details[key_external_sdg]
)
if bi_classify_slice(slice_=s, classifier_model_name=bi_model_name):
if externaly_classified_flag:
logger.info(f"Document {key_doc_id} is externally classified ")

if (
externaly_classified_flag
or is_forced_corpus
or bi_classify_slice(slice_=s, classifier_model_name=bi_model_name)
):
logger.info(
f"Document {key_doc_id} is classified as SDG by bi-classifier"
Comment on lines +129 to +135
)
specific_sdg = n_classify_slice(
_slice=s,
classifier_model_name=n_model_name,
Expand All @@ -129,9 +144,13 @@ def main() -> None:
),
bi_classifier_id=bi_model_id,
n_classifier_id=n_model_id,
is_forced_corpus=is_forced_corpus,
)
if not specific_sdg:
continue
logger.info(
f"Document {key_doc_id} is classified as SDG {specific_sdg.sdg_number} by n-classifier"
)
specific_sdgs.append(specific_sdg)
sdg_docs_ids.add(key_doc_id)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ def main() -> None:
parallelism_max: int = int(os.getenv("PARALLELISM_URL_MAX", 15))
batch_urls_directory: str = os.getenv("BATCH_URLS_DIRECTORY", "batch_urls")
qty_max_str: str | None = os.getenv("PICK_QTY_MAX", None)
corpus_name: str = os.getenv("PICK_CORPUS_NAME", "*")
size_limit_str: str | None = os.getenv("SIZE_TOTAL_LIMIT", None)

qty_max: int | None = None
Expand Down Expand Up @@ -69,6 +70,7 @@ def main() -> None:
process_titles=[Step.DOCUMENT_SCRAPED],
size_total_max=size_limit,
weighed_scope=WeighedScope.DOCUMENT,
corpus_name=corpus_name,
)
)
logger.info("'%s' Docsids were retrieved", len(ids_to_batch))
Expand Down
Loading