Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 44 additions & 67 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ xx_sent_ud_sm = {url = "https://github.com/explosion/spacy-models/releases/downl
lingua-language-detector = "^2.1.1"
psycopg2-binary = "^2.9.10"
brotli = "^1.2.0"
scikit-learn = "~=1.6.1"
scikit-learn = "~=1.7.0"
optimum = {extras = ["onnxruntime"], version = "^2.1.0"}
azure-storage-blob = "^12.28.0"
welearn-database = "^1.3.0"
Expand Down
14 changes: 8 additions & 6 deletions tests/document_classifier/test_sdgs_classifiers.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def test_should_classify_slices_n_with_force_sdg(self, mock_load):
self.assertEqual(result.sdg_number, 4)

@patch("joblib.load")
def test_should_not_classify_slices_n_with_force_sdg(self, mock_load):
def test_should_classify_slices_n(self, mock_load):
mock_load.return_value.predict_proba.return_value = [
numpy.array(
[0.3, 0.2, 0.99, 0.562, 0.2, 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Expand All @@ -54,17 +54,18 @@ def test_should_not_classify_slices_n_with_force_sdg(self, mock_load):
result = n_classify_slice(
slice,
"model_name",
forced_sdg=[1, 11, 12],
n_classifier_id=uuid.uuid4(),
bi_classifier_id=uuid.uuid4(),
)
self.assertEqual(result, None)
self.assertEqual(result.sdg_number, 3)
self.assertIsNotNone(result.bi_classifier_model_id)
self.assertIsNone(result.n_classifier_model_id)

@patch("joblib.load")
def test_should_classify_slices_n(self, mock_load):
def test_should_classify_slices_n_with_forced_corpus(self, mock_load):
mock_load.return_value.predict_proba.return_value = [
numpy.array(
[0.3, 0.2, 0.99, 0.562, 0.2, 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0.3, 0.2, 0.3, 0.462, 0.2, 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
)
]
slice = DocumentSlice(id=1, embedding=b"\x00" * 128)
Expand All @@ -73,8 +74,9 @@ def test_should_classify_slices_n(self, mock_load):
"model_name",
n_classifier_id=uuid.uuid4(),
bi_classifier_id=uuid.uuid4(),
is_forced_corpus=True,
)
self.assertEqual(result.sdg_number, 3)
self.assertEqual(result.sdg_number, 4)
self.assertIsNotNone(result.bi_classifier_model_id)
self.assertIsNone(result.n_classifier_model_id)

Expand Down
2 changes: 2 additions & 0 deletions welearn_datastack/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,3 +355,5 @@
]

QDRANT_MULTI_LINGUAL_CODE = "mul"

FORCED_CORPUS_CLASSIFIED = ["uved"]
38 changes: 38 additions & 0 deletions welearn_datastack/modules/retrieve_data_from_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
URLRetrievalType,
WeighedScope,
)
from welearn_datastack.exceptions import NoModelFoundError
from welearn_datastack.types import QuerySizeLimitDocument, QuerySizeLimitSlice

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -416,3 +417,40 @@ def retrieve_slices_sdgs(
)

return {s[0]: s[1] for s in slices_sdgs}


def get_model_classification_model_by_id(
    db_session, model_id: UUID
) -> BiClassifierModel | NClassifierModel | EmbeddingModel:
    """
    Retrieve a model from the database by its ID, and return it as the correct type (BiClassifierModel, NClassifierModel or EmbeddingModel)

    The three model tables are probed in a fixed priority order
    (BiClassifierModel, then NClassifierModel, then EmbeddingModel); the
    first table containing the ID wins.

    :param db_session: Database session
    :param model_id: Model ID to retrieve
    :return: The model retrieved from the database, as the correct type
    :raises NoModelFoundError: If no model with this ID exists in any of the three tables
    """
    # Probe each table with the same query shape instead of repeating the
    # query/check stanza three times.
    for model_class in (BiClassifierModel, NClassifierModel, EmbeddingModel):
        model = (
            db_session.query(model_class)
            .filter(model_class.id == model_id)
            .first()
        )
        if model is not None:
            return model

    raise NoModelFoundError(
        f"Model not found in the database according this id : {model_id}"
    )
58 changes: 42 additions & 16 deletions welearn_datastack/modules/sdgs_classifiers.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,36 @@ def n_classify_slice(
bi_classifier_id: uuid.UUID,
n_classifier_id: uuid.UUID,
forced_sdg: None | list = None,
is_forced_corpus: bool = False,
) -> Sdg | None:
"""
n classifier for welearn sliced containers to classify them into one of the 17 SDGs
    :param classifier_model_name: The name of the classifier model, which is also the name of the file
:param _slice: Input of welearn sliced container
:param bi_classifier_id: The id of the bi-classifier model used to classify the slice as SDG or not, to keep track of the models used for classification
:param n_classifier_id: The id of the n-classifier model used to classify the slice into one of the 17 SDGs, to keep track of the models used for classification
:param forced_sdg: If not None, list of SDG numbers to force the classification on, if None, all SDGs are possible
    :param is_forced_corpus: If True, the classification is forced even if the bi-classifier does not classify the slice as SDG, to keep track of the corpora that are force-classified

:return: Sdg object if classified as one of the SDGs, None otherwise
:raises ValueError: If the embedding of the slice is not of type bytes
"""
# By default every SDGs are equally possible
is_forced_sdg_classif = bool(forced_sdg)
if not forced_sdg:
forced_sdg = [sdg_n + 1 for sdg_n in range(0, 17)]

# If there is only one forced sdg return it
if len(forced_sdg) == 1:
[sdg_number] = forced_sdg
return Sdg(
slice_id=_slice.id,
sdg_number=sdg_number,
id=uuid.uuid4(),
bi_classifier_model_id=bi_classifier_id,
n_classifier_model_id=n_classifier_id if not forced_sdg else None,
)
Comment on lines +68 to +82

logger.debug("Loading multiclass classifier model %s", classifier_model_name)
classifier_path = generate_ml_models_path(
model_type=MLModelsType.N_CLASSIFIER, model_name=classifier_model_name
Expand All @@ -78,19 +104,19 @@ def n_classify_slice(
]
proba_lst.sort(key=lambda x: x[1], reverse=True)

# If the score is superior to 0.5
sdg_number = proba_lst[0][0] if proba_lst[0][1] > 0.5 else None
if sdg_number:
logger.debug(
f"Slice {_slice.id} is labelized with SDG {proba_lst[0][0]} with {proba_lst[0][1]} score"
)
# Create Sdg object, associating it with the slice and classifiers except if forced_sdg is provided because
# in this case we assume classification was done outside the pipeline
return Sdg(
slice_id=_slice.id,
sdg_number=sdg_number,
id=uuid.uuid4(),
bi_classifier_model_id=bi_classifier_id,
n_classifier_model_id=n_classifier_id if not forced_sdg else None,
)
return None
best_sdg, best_score = proba_lst[0]

# If there is no forced SDG and no SDGs with more than 0.5 threshold
if not (is_forced_corpus or is_forced_sdg_classif) and best_score <= 0.5:
return None

logger.debug(
f"Slice {_slice.id} is labelized with SDG {best_sdg} with {best_score} score"
)
return Sdg(
slice_id=_slice.id,
sdg_number=best_sdg,
id=uuid.uuid4(),
bi_classifier_model_id=bi_classifier_id,
n_classifier_model_id=n_classifier_id if not forced_sdg else None,
)
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from welearn_database.data.enumeration import Step
from welearn_database.data.models import DocumentSlice, ProcessState, Sdg

from welearn_datastack.constants import FORCED_CORPUS_CLASSIFIED
from welearn_datastack.data.enumerations import MLModelsType
from welearn_datastack.modules.retrieve_data_from_database import retrieve_models
from welearn_datastack.modules.retrieve_data_from_files import retrieve_ids_from_csv
Expand Down Expand Up @@ -84,7 +85,11 @@ def main() -> None:
slices_per_docs, lambda x: x.document_id
):
doc_slices: List[DocumentSlice] = list(group_doc_slices) # type: ignore

corpus_name = doc_slices[0].document.corpus.source_name
is_forced_corpus = corpus_name in FORCED_CORPUS_CLASSIFIED
logger.info(
f"Classifying document {key_doc_id} from corpus {corpus_name} with {len(doc_slices)} slices"
)
bi_model_name = bi_model_by_docid.get(key_doc_id, dict()).get("model_name")
bi_model_id: UUID = bi_model_by_docid.get(key_doc_id, dict()).get("model_id")
if not bi_model_name and not isinstance(bi_model_name, str):
Expand Down Expand Up @@ -118,7 +123,17 @@ def main() -> None:
key_external_sdg in s.document.details
and s.document.details[key_external_sdg]
)
if bi_classify_slice(slice_=s, classifier_model_name=bi_model_name):
if externaly_classified_flag:
logger.info(f"Document {key_doc_id} is externally classified ")

if (
externaly_classified_flag
or is_forced_corpus
or bi_classify_slice(slice_=s, classifier_model_name=bi_model_name)
):
logger.info(
f"Document {key_doc_id} is classified as SDG by bi-classifier"
Comment on lines +129 to +135
)
specific_sdg = n_classify_slice(
_slice=s,
classifier_model_name=n_model_name,
Expand All @@ -129,9 +144,13 @@ def main() -> None:
),
bi_classifier_id=bi_model_id,
n_classifier_id=n_model_id,
is_forced_corpus=is_forced_corpus,
)
if not specific_sdg:
continue
logger.info(
f"Document {key_doc_id} is classified as SDG {specific_sdg.sdg_number} by n-classifier"
)
specific_sdgs.append(specific_sdg)
sdg_docs_ids.add(key_doc_id)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ def main() -> None:
parallelism_max: int = int(os.getenv("PARALLELISM_URL_MAX", 15))
batch_urls_directory: str = os.getenv("BATCH_URLS_DIRECTORY", "batch_urls")
qty_max_str: str | None = os.getenv("PICK_QTY_MAX", None)
corpus_name: str = os.getenv("PICK_CORPUS_NAME", "*")
size_limit_str: str | None = os.getenv("SIZE_TOTAL_LIMIT", None)

qty_max: int | None = None
Expand Down Expand Up @@ -69,6 +70,7 @@ def main() -> None:
process_titles=[Step.DOCUMENT_SCRAPED],
size_total_max=size_limit,
weighed_scope=WeighedScope.DOCUMENT,
corpus_name=corpus_name,
)
)
logger.info("'%s' Docsids were retrieved", len(ids_to_batch))
Expand Down
Loading