From d2d6db27fe4132327fec11429caec89b1df55755 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Z=C3=BCbeyde=20Civelek?= <zubeydeecivelek@gmail.com>
Date: Tue, 5 May 2026 14:54:30 +0200
Subject: [PATCH] add(eco): new model and transformation rules

---
 .../rdm/records/transform/models/eco.py       |  95 ++++++
 .../rdm/records/transform/models/it.py        |   2 +
 .../rdm/records/transform/transform.py        |   2 +-
 .../transform/xml_processing/rules/base.py    |   8 +-
 .../transform/xml_processing/rules/eco.py     | 295 ++++++++++++++++++
 cds_migrator_kit/rdm/streams.yaml             |  11 +
 setup.cfg                                     |   5 +
 7 files changed, 415 insertions(+), 3 deletions(-)
 create mode 100644 cds_migrator_kit/rdm/records/transform/models/eco.py
 create mode 100644 cds_migrator_kit/rdm/records/transform/xml_processing/rules/eco.py

diff --git a/cds_migrator_kit/rdm/records/transform/models/eco.py b/cds_migrator_kit/rdm/records/transform/models/eco.py
new file mode 100644
index 00000000..883329da
--- /dev/null
+++ b/cds_migrator_kit/rdm/records/transform/models/eco.py
@@ -0,0 +1,95 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2026 CERN.
+#
+# CDS-RDM is free software; you can redistribute it and/or modify it under
+# the terms of the MIT License; see LICENSE file for more details.
+
+"""CDS-RDM ECO model."""
+
+from cds_migrator_kit.rdm.records.transform.models.base_record import (
+    rdm_base_record_model,
+)
+from cds_migrator_kit.transform.overdo import CdsOverdo
+
+
+class ECOModel(CdsOverdo):
+    """Translation model for ECO records (posters, brochures, 710__.5:IR notes)."""
+
+    __query__ = """
+        (
+            980__:POSTER
+            OR (980__:BROCHURE AND 690C_:CERNOFFICIALPRESSBROCHURE)
+            OR (
+                (980__:BROCHURE AND 690C_:CERNEXPERIMENTBROCHURE)
+                OR (
+                    980__:CMSOUTREACH
+                    AND (
+                        6531_.a:Brochure
+                        OR 6531_.a:brochure
+                        OR 6531_a:Brochure
+                        OR 6531_a:brochure
+                    )
+                )
+            )
+            OR (980__:NOTE AND 710__.5:IR)
+        )
+        AND -595__a:Press
+        AND -980__:LHCb_Misc
+        AND -690C_a:PRIVATLAS
+    """
+
+    __ignore_keys__ = {  # NOTE(review): presumably keys skipped by CdsOverdo - confirm
+        "0247_9",  # source of pid, only value: OSTI, 2948638, 2853279
+        "0248_a",
+        "0248_p",
+        "0248_q",
+        "035__d",  # oai harvest tag
+        "035__h",  # oai harvest tag
+        "035__m",  # oai harvest tag
+        "100__m",  # email of contributor
+        "245__9",  # source of title, only value: submitter
+        "270__m",  # email of contact person - TODO: is it okay to ignore? example: 2908973
+        "270__p",  # contact person name - TODO: is it okay to ignore?
+        "300__a",  # number of pages
+        "340__a",  # physical medium
+        "520__9",  # abstract provenance
+        "541__e",  # Original source poster https://cds.cern.ch/record/2695195/export/hm
+        "594__a",  # PUB: 2749806, 2749822
+        "6531_9",  # scheme of keywords
+        "700__m",  # email of contributor
+        "773__p",  # display name of the related link - TODO: is it okay to ignore?
+        "773__y",  # year - TODO: is it okay to ignore? https://cds.cern.ch/record/1452204/export/xm
+        "773__v",  # TODO: is it okay to ignore? https://cds.cern.ch/record/1452204/export/xm
+        "852__c",
+        "852__h",
+        "8560_f",  # contact email
+        "8564_8",  # file id
+        "8564_s",  # bibdoc id
+        "8564_x",  # icon thumbnails sizes
+        "8564_y",  # file description - handled by files dump
+        "8564_z",  # DM metadata
+        "937__c",  # last modified by
+        "937__s",  # last modification date
+        "960__a",  # base number
+        "961__a",  # CDS modification tag
+        "961__b",  # CDS modification tag
+        "961__c",  # CDS modification tag
+        "961__h",  # CDS modification tag
+        "961__l",  # CDS modification tag
+        "961__x",  # CDS modification tag
+        "981__a",  # duplicate record id
+    }
+
+    _default_fields = {  # NOTE(review): presumably defaults for absent fields - confirm
+        "custom_fields": {},
+        "languages": [],
+        "related_identifiers": [],
+        "creators": [{"person_or_org": {"type": "organizational", "name": "CERN"}}],
+    }
+
+
+eco_model = ECOModel(
+    bases=(rdm_base_record_model,),  # imported from models.base_record
+    entry_point_group="cds_migrator_kit.migrator.rules.eco",  # declared in setup.cfg
+)
diff --git a/cds_migrator_kit/rdm/records/transform/models/it.py b/cds_migrator_kit/rdm/records/transform/models/it.py
index 3dec8cd0..a57b1b01 100644
--- a/cds_migrator_kit/rdm/records/transform/models/it.py
+++ b/cds_migrator_kit/rdm/records/transform/models/it.py
@@ -32,6 +32,8 @@ class ITModel(CdsOverdo):
                     -980__:BOOK
                     -690C_:YELLOWREPORT
                     -690C_:"YELLOW REPORT"
+                    -690C_:CERNOFFICIALPRESSBROCHURE
+                    -690C_:CERNEXPERIMENTBROCHURE
                     -980__:THESIS
                     -980__:INTNOTECMSPUBL
                     """
diff --git a/cds_migrator_kit/rdm/records/transform/transform.py b/cds_migrator_kit/rdm/records/transform/transform.py
index f2fb96bc..c222a7ba 100644
--- a/cds_migrator_kit/rdm/records/transform/transform.py
+++ b/cds_migrator_kit/rdm/records/transform/transform.py
@@ -472,7 +472,7 @@ def field_experiments(record_json, custom_fields_dict):
                 "cern:experiments", []
             )
             for experiment in experiments:
-                if experiment.lower().strip() == "not applicable":
+                if experiment.lower().strip() in ["not applicable", "select:"]:
                     continue
                 result = search_vocabulary(experiment, "experiments")
 
diff --git a/cds_migrator_kit/rdm/records/transform/xml_processing/rules/base.py b/cds_migrator_kit/rdm/records/transform/xml_processing/rules/base.py
index 9a743263..4c39bf0a 100644
--- a/cds_migrator_kit/rdm/records/transform/xml_processing/rules/base.py
+++ b/cds_migrator_kit/rdm/records/transform/xml_processing/rules/base.py
@@ -73,8 +73,8 @@ def created(self, key, value):
         source = clean_val("s", value, str)
         # h = human catalogued
         # n = script catalogued or via submission
-        if source not in ["n", "h", "m", "r"]:
-            raise UnexpectedValue(subfield="s", field=key, value=value)
+        if source not in ["n", "h", "m", "r", "d"]:
+            raise UnexpectedValue(subfield="s", field=key, value=source)
     date_values = value.get("w")
     if not date_values or not date_values[0]:
         return datetime.date.today().isoformat()
@@ -797,6 +797,10 @@ def related_identifiers_787(self, key, value):
             "relation_type": {"id": "references"},
             "resource_type": {"id": "publication-conferencepaper"},
         },
+        "paper": {
+            "relation_type": {"id": "references"},
+            "resource_type": {"id": "publication-article"},
+        },
     }
 
     if recid:
diff --git a/cds_migrator_kit/rdm/records/transform/xml_processing/rules/eco.py b/cds_migrator_kit/rdm/records/transform/xml_processing/rules/eco.py
new file mode 100644
index 00000000..bddc88e9
--- /dev/null
+++ b/cds_migrator_kit/rdm/records/transform/xml_processing/rules/eco.py
@@ -0,0 +1,295 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2026 CERN.
+#
+# CDS-RDM is free software; you can redistribute it and/or modify it under
+# the terms of the MIT License; see LICENSE file for more details.
+
+"""CDS-RDM ECO rules."""
+
+import re
+
+import pycountry
+from dojson.utils import IgnoreKey, for_each_value, force_list
+
+from cds_migrator_kit.errors import UnexpectedValue
+from cds_migrator_kit.rdm.records.transform.xml_processing.rules.it import (
+    corporate_author,
+)
+from cds_migrator_kit.transform.xml_processing.quality.decorators import (
+    require,
+)
+from cds_migrator_kit.transform.xml_processing.quality.parsers import (
+    StringValue,
+)
+from cds_migrator_kit.transform.xml_processing.rules.base import (
+    languages as base_languages,
+)
+from cds_migrator_kit.transform.xml_processing.rules.base import (
+    process_contributors,
+)
+
+from ...models.eco import eco_model as model
+from .base import identifiers
+from .base import note as base_note
+from .base import report_number, urls
+from .bulletin_issue import (
+    additional_descriptions,
+    additional_titles_bulletin,
+    rel_identifiers,
+    translated_description,
+)
+from .it import corporate_author
+from .publications import internal_notes, journal, organisation, related_identifiers
+
+model.over("additional_titles", "(^246_[1_])", override=True)(
+    additional_titles_bulletin  # rule imported from .bulletin_issue
+)
+model.over("additional_descriptions", "(^500__)")(additional_descriptions)
+model.over("additional_descriptions", "(^590__)")(translated_description)
+model.over("internal_notes", "^562__")(internal_notes)  # imported, not the 595 def
+model.over("contributors", "^901__")(organisation)  # rule from .publications
+model.over("creators", "(^110__)")(corporate_author)  # rule from .it
+
+
+@model.over("internal_notes", "^595__")
+@for_each_value
+def internal_notes(self, key, value):
+    """Translate 595 notes; non-empty ``s`` subfields also become subjects."""
+    # force_list wraps a "" default into a one-item container, so drop blanks
+    subject_notes = [note for note in force_list(value.get("s", "")) if note]
+    if subject_notes:
+        subjects = self.get("subjects", [])
+        subjects.extend({"subject": note} for note in subject_notes)
+        self["subjects"] = subjects
+    # the note itself is handled by the base 595 rule
+    base_note(self, key, value)
+    raise IgnoreKey("internal_notes")
+
+
+@model.over("eco_report_number", "(^037__)|(^088__)", override=True)
+@for_each_value
+def eco_report_number(self, key, value):
+    """Translate 037/088 report numbers into ``identifiers`` without duplicates."""
+    identifier = value.get("a", "")
+    # Some 088 values hold an e-mail address instead of a report number;
+    # they are skipped for now. TODO: how to handle?
+    if not (key == "088__" and "@" in identifier):
+        identifiers = self.get("identifiers", [])
+        # report_number is expected to return a list of entries (TODO confirm),
+        # so compare entry by entry rather than the whole list against entries
+        for new_id in report_number(self, key, value) or []:
+            if new_id and new_id not in identifiers:
+                identifiers.append(new_id)
+        self["identifiers"] = identifiers
+    raise IgnoreKey("eco_report_number")
+
+
+@model.over("eco_related_identifiers", "(^962__)", override=True)
+@for_each_value
+def eco_related_identifiers(self, key, value):
+    """Translate 962 fields into related identifiers."""
+    if value.get("l", ""):
+        # subfield ``l`` is set: delegate to the bulletin_issue rule
+        rel_identifiers(self, key, value)
+    else:
+        # otherwise use the publications rule and keep its first entry
+        found = related_identifiers(self, key, value)
+        if found:
+            existing = self.get("related_identifiers", [])
+            if found[0] not in existing:
+                existing.append(found[0])
+                self["related_identifiers"] = existing
+    raise IgnoreKey("eco_related_identifiers")
+
+
+@model.over("eco_identifiers", "^035__", override=True)
+@for_each_value
+def eco_identifiers(self, key, value):
+    """Translate 035 identifiers via the base rule; PHOPHO ids are skipped."""
+    original_scheme = StringValue(value.get("9", "")).parse()
+    scheme = original_scheme.lower()
+    # NOTE(review): ``new_id`` below is built but never stored, so PHOPHO
+    # identifiers are currently dropped. TODO: handle photo identifier
+    if scheme == "phopho":
+        id_value = StringValue(value.get("a", "")).parse()
+        new_id = {"scheme": "photo", "identifier": id_value}
+        raise IgnoreKey("eco_identifiers")
+    identifiers(self, key, value)
+    raise IgnoreKey("eco_identifiers")
+
+
+@model.over("eco_urls", "^8564[1_]", override=True)
+@for_each_value
+def urls_eco(self, key, value):
+    """Translate 8564 URLs into related identifiers (``x`` must be icon/pdfa/1)."""
+    note = value.get("x", "").strip().lower()
+    if note and "icon" not in note and note not in ["pdfa", "1"]:
+        raise UnexpectedValue(field=key, subfield="x", value=note)
+    # read the link from subfield ``q`` when present, else the rule's default
+    if value.get("q", ""):
+        new_ids = urls(self, key, value, subfield="q")
+    else:
+        new_ids = urls(self, key, value)
+    rel_ids = self.get("related_identifiers", [])
+    # guard like the 773 rule: an empty result must not raise IndexError
+    if new_ids and new_ids[0] not in rel_ids:
+        rel_ids.append(new_ids[0])
+    self["related_identifiers"] = rel_ids
+    raise IgnoreKey("eco_urls")
+
+
+@model.over("resource_type", "^980__", override=True)
+def resource_type(self, key, value):
+    """Map the 980 collection tag (``a``, else ``b``) to a resource type id."""
+    # TODO: are they correct?
+    known_types = {
+        "poster": {"id": "poster"},
+        "brochure": {"id": "publication-brochure"},
+        "note": {"id": "publication-technicalnote"},
+        "conferencepaper": {"id": "publication-conferencepaper"},
+    }
+
+    tag = value.get("a") if "a" in value else value.get("b")
+    # normalise only truthy tags; None/"" fall through to the error below
+    tag = tag.strip().lower() if tag else tag
+
+    # any tag outside the table (including a missing one) is rejected
+    if tag not in known_types:
+        raise UnexpectedValue("Unknown resource type (ECO)", field=key, value=tag)
+    return known_types[tag]
+
+
+@model.over("collection", "^690C_", override=True)
+@for_each_value
+def collection(self, key, value):
+    """Translate 690C collection tags into ``collection:<tag>`` subjects.
+
+    ``CERN`` is ignored; any tag other than POSTER/PREPRINT raises
+    UnexpectedValue carrying the offending tag.
+    """
+    collection = value.get("a", "")
+    tag = collection.strip().upper()
+    if tag == "CERN":
+        raise IgnoreKey("collection")
+    # NOTE(review): the model query also selects CERNOFFICIALPRESSBROCHURE and
+    # CERNEXPERIMENTBROCHURE on 690C_ - confirm whether they belong here too
+    if tag not in ["POSTER", "PREPRINT"]:
+        raise UnexpectedValue(subfield="a", field=key, value=collection)
+    subjects = self.get("subjects", [])
+    subjects.append({"subject": f"collection:{collection}"})
+    self["subjects"] = subjects
+    raise IgnoreKey("collection")
+
+
+@model.over("related_ids", "^773__")
+@for_each_value
+def related_ids(self, key, value):
+    """Translate 773 fields: linked entries become related identifiers.
+
+    Entries without a ``u`` link are passed to the ``journal`` rule instead.
+    """
+    # TODO: how to transform? https://cds.cern.ch/record/1452204/export/xm
+    if value.get("u", ""):
+        # Transform like the base `urls` rule
+        found = urls(self, key, value)
+        if found:
+            first = found[0]
+            existing = self.get("related_identifiers", [])
+            if first not in existing:
+                existing.append(first)
+            self["related_identifiers"] = existing
+    else:
+        journal(self, key, value)
+
+    raise IgnoreKey("related_ids")
+
+
+@model.over("submitter_info", "^923__")
+@for_each_value
+def submitter_info(self, key, value):
+    """Translate the 923 submitter into a ``contactperson`` contributor.
+
+    Subfield ``r`` is read as ``Family, Given`` (text after the first comma is
+    the given name); blank values are skipped instead of creating an empty name.
+    """
+    family_name, _, given_name = value.get("r", "").strip().partition(",")
+    names = {"family_name": family_name.strip()}
+    if given_name.strip():
+        names["given_name"] = given_name.strip()
+    if not names["family_name"]:
+        raise IgnoreKey("submitter_info")
+    contributor = {
+        "person_or_org": {"type": "personal", **names},
+        "role": {"id": "contactperson"},
+    }
+    contributors = self.get("contributors", [])
+    contributors.append(contributor)
+    self["contributors"] = contributors
+    raise IgnoreKey("submitter_info")
+
+
+@model.over("languages", "^041__", override=True)
+@for_each_value
+@require(["a"])
+def language(self, key, value):
+    """Translate 041 language codes, including ``eng-fre`` / ``eng/fre`` lists.
+
+    Raises UnexpectedValue when a code of a combined value is not known to
+    pycountry; single codes are delegated to the base ``languages`` rule.
+    """
+    langs = value.get("a")
+    languages = self.get("languages", [])
+    if "-" in langs or "/" in langs:
+        # https://cds.cern.ch/record/921930/export/xm
+        for lang in re.split(r"[-/]+", langs):
+            # tolerate padding and upper case around separators ("eng / fre")
+            lang = lang.strip().lower()
+            if not lang:
+                continue
+            lang = "fra" if lang == "fre" else lang
+            field = "alpha_2" if len(lang) == 2 else "alpha_3"
+            try:
+                lang_obj = pycountry.languages.get(**{field: lang})
+                if not lang_obj:
+                    lang_obj = pycountry.languages.lookup(lang)
+                languages.append({"id": lang_obj.alpha_3.lower()})
+            except (KeyError, AttributeError, LookupError):
+                raise UnexpectedValue(field=key, subfield="a", value=langs)
+    else:
+        languages.extend(base_languages(self, key, value))
+    self["languages"] = languages
+    raise IgnoreKey("language")
+
+
+@model.over("eco_contributors", "^700__", override=True)
+@for_each_value
+@require(["a"])
+def eco_contributors(self, key, value):
+    """Translate 700 contributors, discarding a placeholder ``-`` affiliation."""
+    # work on a copy so the affiliation can be removed before processing
+    field = dict(value)
+    # Some records have "-" as affiliation: 1614471, 1953712
+    if field.get("u", "").strip() == "-":
+        del field["u"]
+    # append to the contributors already collected on the record
+    collected = self.get("contributors", [])
+    collected.append(process_contributors(key, field))
+    self["contributors"] = collected
+    raise IgnoreKey("eco_contributors")
+
+
+@model.over("field_993", "^993__", override=True)
+@for_each_value
+def field_993(self, key, value):
+    """Translate 993 subfield ``q`` into a subject (only known values allowed)."""
+    keyword = value.get("q", "")
+    if not keyword:
+        # nothing to record: avoid appending an empty subject
+        raise IgnoreKey("field_993")
+    if keyword not in ["Project Management"]:
+        raise UnexpectedValue(field=key, subfield="q", value=keyword)
+    _subjects = self.get("subjects", [])
+    _subjects.append({"subject": keyword})
+    self["subjects"] = _subjects
+    raise IgnoreKey("field_993")
diff --git a/cds_migrator_kit/rdm/streams.yaml b/cds_migrator_kit/rdm/streams.yaml
index 87e52a0a..19e66e4c 100644
--- a/cds_migrator_kit/rdm/streams.yaml
+++ b/cds_migrator_kit/rdm/streams.yaml
@@ -122,3 +122,14 @@ records:
       missing_users: cds_migrator_kit/rdm/data/users
       communities_ids:
         - ""
+  eco:
+    data_dir: cds_migrator_kit/rdm/data/eco
+    tmp_dir: cds_migrator_kit/rdm/tmp/eco
+    log_dir: cds_migrator_kit/rdm/log/eco
+    extract:
+      dirpath: cds_migrator_kit/rdm/data/eco/dump/
+    transform:
+      files_dump_dir: cds_migrator_kit/rdm/data/eco/files/
+      missing_users: cds_migrator_kit/rdm/data/users
+      communities_ids:
+        - "c8ab44fd-f130-4e52-9bff-70de14913bde"
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
index 493dceca..3a2ce538 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -83,6 +83,7 @@ cds_migrator_kit.migrator.models =
     en = cds_migrator_kit.rdm.records.transform.models.en:en_model
     annual_rep = cds_migrator_kit.rdm.records.transform.models.annual_report:annual_rep_model
     fap = cds_migrator_kit.rdm.records.transform.models.fap:fap_model
+    eco = cds_migrator_kit.rdm.records.transform.models.eco:eco_model
 cds_migrator_kit.migrator.rules.base =
     base = cds_migrator_kit.transform.xml_processing.rules.base
 cds_migrator_kit.migrator.rdm.rules.base =
@@ -164,6 +165,10 @@ cds_migrator_kit.migrator.rules.fap =
     base = cds_migrator_kit.transform.xml_processing.rules.base
     base_records = cds_migrator_kit.rdm.records.transform.xml_processing.rules.base
     fap = cds_migrator_kit.rdm.records.transform.xml_processing.rules.fap
+cds_migrator_kit.migrator.rules.eco =
+    base = cds_migrator_kit.transform.xml_processing.rules.base
+    base_records = cds_migrator_kit.rdm.records.transform.xml_processing.rules.base
+    eco = cds_migrator_kit.rdm.records.transform.xml_processing.rules.eco
 cds_migrator_kit.migrator.rules.people =
     people = cds_migrator_kit.rdm.users.transform.xml_processing.rules.people
 invenio_pidstore.minters =