From d2d6db27fe4132327fec11429caec89b1df55755 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Z=C3=BCbeyde=20Civelek?= Date: Tue, 5 May 2026 14:54:30 +0200 Subject: [PATCH] add(eco): new model and transformation rules --- .../rdm/records/transform/models/eco.py | 95 ++++++ .../rdm/records/transform/models/it.py | 2 + .../rdm/records/transform/transform.py | 2 +- .../transform/xml_processing/rules/base.py | 8 +- .../transform/xml_processing/rules/eco.py | 295 ++++++++++++++++++ cds_migrator_kit/rdm/streams.yaml | 11 + setup.cfg | 5 + 7 files changed, 415 insertions(+), 3 deletions(-) create mode 100644 cds_migrator_kit/rdm/records/transform/models/eco.py create mode 100644 cds_migrator_kit/rdm/records/transform/xml_processing/rules/eco.py diff --git a/cds_migrator_kit/rdm/records/transform/models/eco.py b/cds_migrator_kit/rdm/records/transform/models/eco.py new file mode 100644 index 00000000..883329da --- /dev/null +++ b/cds_migrator_kit/rdm/records/transform/models/eco.py @@ -0,0 +1,95 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2026 CERN. +# +# CDS-RDM is free software; you can redistribute it and/or modify it under +# the terms of the MIT License; see LICENSE file for more details. 
+
+"""CDS-RDM ECO model."""
+
+from cds_migrator_kit.rdm.records.transform.models.base_record import (
+    rdm_base_record_model,
+)
+from cds_migrator_kit.transform.overdo import CdsOverdo
+
+
+class ECOModel(CdsOverdo):
+    """Translation model for ECO records (see ``__query__`` for the selected record set)."""
+
+    __query__ = """
+    (
+        980__:POSTER
+        OR (980__:BROCHURE AND 690C_:CERNOFFICIALPRESSBROCHURE)
+        OR (
+            (980__:BROCHURE AND 690C_:CERNEXPERIMENTBROCHURE)
+            OR (
+                980__:CMSOUTREACH
+                AND (
+                    6531_.a:Brochure
+                    OR 6531_.a:brochure
+                    OR 6531_a:Brochure
+                    OR 6531_a:brochure
+                )
+            )
+        )
+        OR (980__:NOTE AND 710__.5:IR)
+    )
+    AND -595__a:Press
+    AND -980__:LHCb_Misc
+    AND -690C_a:PRIVATLAS
+    """  # NOTE(review): both "6531_.a" and "6531_a" spellings are listed — confirm both are needed
+
+    __ignore_keys__ = {
+        "0247_9",  # source of pid, only value: OSTI, 2948638, 2853279
+        "0248_a",  # NOTE(review): reason for ignoring is not recorded
+        "0248_p",  # NOTE(review): reason for ignoring is not recorded
+        "0248_q",  # NOTE(review): reason for ignoring is not recorded
+        "035__d",  # oai harvest tag
+        "035__h",  # oai harvest tag
+        "035__m",  # oai harvest tag
+        "100__m",  # email of contributor
+        "245__9",  # source of title, only value: submitter
+        "270__m",  # email of contact person - TODO: is it okay to ignore? example: 2908973
+        "270__p",  # contact person name - TODO: is it okay to ignore?
+        "300__a",  # number of pages
+        "340__a",  # Physical medium
+        "520__9",  # abstract provenance
+        "541__e",  # Original source poster https://cds.cern.ch/record/2695195/export/hm
+        "594__a",  # PUB: 2749806, 2749822
+        "6531_9",  # scheme of keywords
+        "700__m",  # email of contributor
+        "773__p",  # display name of the related link TODO: is it okay to ignore?
+        "773__y",  # year, TODO: is it okay to ignore? https://cds.cern.ch/record/1452204/export/xm
+        "773__v",  # TODO: is it okay to ignore? https://cds.cern.ch/record/1452204/export/xm
+        "852__c",  # NOTE(review): reason for ignoring is not recorded
+        "852__h",  # NOTE(review): reason for ignoring is not recorded
+        "8560_f",  # contact email
+        "8564_8",  # file id
+        "8564_s",  # bibdoc id
+        "8564_x",  # icon thumbnails sizes
+        "8564_y",  # file description - handled by files dump
+        "8564_z",  # DM metadata
+        "937__c",  # last modified by
+        "937__s",  # last modification date
+        "960__a",  # base number
+        "961__a",  # CDS modification tag
+        "961__b",  # CDS modification tag
+        "961__c",  # CDS modification tag
+        "961__h",  # CDS modification tag
+        "961__l",  # CDS modification tag
+        "961__x",  # CDS modification tag
+        "981__a",  # duplicate record id
+    }
+
+    _default_fields = {  # NOTE(review): presumably seeds every transformed record — confirm in CdsOverdo
+        "custom_fields": {},
+        "languages": [],
+        "related_identifiers": [],
+        "creators": [{"person_or_org": {"type": "organizational", "name": "CERN"}}],
+    }
+
+
+eco_model = ECOModel(
+    bases=(rdm_base_record_model,),
+    entry_point_group="cds_migrator_kit.migrator.rules.eco",  # group declared in setup.cfg by this patch
+)
diff --git a/cds_migrator_kit/rdm/records/transform/models/it.py b/cds_migrator_kit/rdm/records/transform/models/it.py index 3dec8cd0..a57b1b01 100644 --- a/cds_migrator_kit/rdm/records/transform/models/it.py +++ b/cds_migrator_kit/rdm/records/transform/models/it.py @@ -32,6 +32,8 @@ class ITModel(CdsOverdo): -980__:BOOK -690C_:YELLOWREPORT -690C_:"YELLOW REPORT" + -690C_:CERNOFFICIALPRESSBROCHURE + -690C_:CERNEXPERIMENTBROCHURE -980__:THESIS -980__:INTNOTECMSPUBL """ diff --git a/cds_migrator_kit/rdm/records/transform/transform.py b/cds_migrator_kit/rdm/records/transform/transform.py index f2fb96bc..c222a7ba 100644 --- a/cds_migrator_kit/rdm/records/transform/transform.py +++ b/cds_migrator_kit/rdm/records/transform/transform.py @@ -472,7 +472,7 @@ def field_experiments(record_json, custom_fields_dict): "cern:experiments", [] ) for experiment in experiments: - if experiment.lower().strip() == "not applicable": + if experiment.lower().strip() in ["not applicable", "select:"]: continue result = search_vocabulary(experiment, "experiments") diff --git
a/cds_migrator_kit/rdm/records/transform/xml_processing/rules/base.py b/cds_migrator_kit/rdm/records/transform/xml_processing/rules/base.py index 9a743263..4c39bf0a 100644 --- a/cds_migrator_kit/rdm/records/transform/xml_processing/rules/base.py +++ b/cds_migrator_kit/rdm/records/transform/xml_processing/rules/base.py @@ -73,8 +73,8 @@ def created(self, key, value): source = clean_val("s", value, str) # h = human catalogued # n = script catalogued or via submission - if source not in ["n", "h", "m", "r"]: - raise UnexpectedValue(subfield="s", field=key, value=value) + if source not in ["n", "h", "m", "r", "d"]: + raise UnexpectedValue(subfield="s", field=key, value=source) date_values = value.get("w") if not date_values or not date_values[0]: return datetime.date.today().isoformat() @@ -797,6 +797,10 @@ def related_identifiers_787(self, key, value): "relation_type": {"id": "references"}, "resource_type": {"id": "publication-conferencepaper"}, }, + "paper": { + "relation_type": {"id": "references"}, + "resource_type": {"id": "publication-article"}, + }, } if recid: diff --git a/cds_migrator_kit/rdm/records/transform/xml_processing/rules/eco.py b/cds_migrator_kit/rdm/records/transform/xml_processing/rules/eco.py new file mode 100644 index 00000000..bddc88e9 --- /dev/null +++ b/cds_migrator_kit/rdm/records/transform/xml_processing/rules/eco.py @@ -0,0 +1,295 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2026 CERN. +# +# CDS-RDM is free software; you can redistribute it and/or modify it under +# the terms of the MIT License; see LICENSE file for more details. 
+
+"""CDS-RDM ECO rules."""
+
+import re
+
+import pycountry
+from dojson.utils import IgnoreKey, for_each_value, force_list
+
+from cds_migrator_kit.errors import UnexpectedValue
+from cds_migrator_kit.rdm.records.transform.xml_processing.rules.it import (
+    corporate_author,
+)
+from cds_migrator_kit.transform.xml_processing.quality.decorators import (
+    require,
+)
+from cds_migrator_kit.transform.xml_processing.quality.parsers import (
+    StringValue,
+)
+from cds_migrator_kit.transform.xml_processing.rules.base import (
+    languages as base_languages,
+)
+from cds_migrator_kit.transform.xml_processing.rules.base import (
+    process_contributors,
+)
+
+from ...models.eco import eco_model as model
+from .base import identifiers
+from .base import note as base_note
+from .base import report_number, urls
+from .bulletin_issue import (
+    additional_descriptions,
+    additional_titles_bulletin,
+    rel_identifiers,
+    translated_description,
+)
+from .it import corporate_author  # NOTE(review): duplicates the absolute import of the same module above
+from .publications import internal_notes, journal, organisation, related_identifiers
+
+model.over("additional_titles", "(^246_[1_])", override=True)(
+    additional_titles_bulletin
+)
+model.over("additional_descriptions", "(^500__)")(additional_descriptions)
+model.over("additional_descriptions", "(^590__)")(translated_description)
+model.over("internal_notes", "^562__")(internal_notes)  # the rule imported from .publications
+model.over("contributors", "^901__")(organisation)
+model.over("creators", "(^110__)")(corporate_author)
+
+
+@model.over("internal_notes", "^595__")
+@for_each_value
+def internal_notes(self, key, value):
+    """Translate 595: copy each non-empty `$s` into `subjects`, then call the base note rule."""
+    # No "" default here: force_list("") is truthy and would add an empty subject.
+    subject_notes = [note for note in (force_list(value.get("s")) or []) if note]
+    if subject_notes:
+        subjects = self.get("subjects", [])
+        subjects.extend({"subject": note} for note in subject_notes)
+        self["subjects"] = subjects
+    # NOTE(review): the return value is discarded — confirm the base rule stores the note itself.
+    base_note(self, key, value)
+    raise IgnoreKey("internal_notes")
+
+
+@model.over("eco_report_number", "(^037__)|(^088__)", override=True)
+@for_each_value
+def eco_report_number(self, key, value):
+    """Translate 037/088 report numbers into `identifiers`, skipping ones already present."""
+    identifier = value.get("a", "")
+    # 088__a sometimes holds an e-mail address. TODO: how to handle?
+    if key == "088__" and "@" in identifier:
+        raise IgnoreKey("eco_report_number")
+    # force_list: accept a single dict as well as a list from the base rule.
+    new_ids = force_list(report_number(self, key, value)) or []
+    record_ids = self.get("identifiers", [])
+    for new_id in new_ids:
+        if new_id not in record_ids:  # compare item by item, not the whole list
+            record_ids.append(new_id)
+    self["identifiers"] = record_ids
+    raise IgnoreKey("eco_report_number")
+
+
+@model.over("eco_related_identifiers", "(^962__)", override=True)
+@for_each_value
+def eco_related_identifiers(self, key, value):
+    """Translate 962: with `$l` delegate to the bulletin rule, else keep the first publications result."""
+    scheme = value.get("l", "")
+    if scheme:
+        rel_identifiers(self, key, value)
+        raise IgnoreKey("eco_related_identifiers")
+    rel_identifier = related_identifiers(self, key, value)
+    if rel_identifier:
+        rel_id = rel_identifier[0]
+        rel_ids = self.get("related_identifiers", [])
+        if rel_id not in rel_ids:
+            rel_ids.append(rel_id)
+        self["related_identifiers"] = rel_ids
+    raise IgnoreKey("eco_related_identifiers")
+
+
+@model.over("eco_identifiers", "^035__", override=True)
+@for_each_value
+def eco_identifiers(self, key, value):
+    """Translate 035 via the base `identifiers` rule; "PHOPHO" identifiers are skipped for now."""
+    original_scheme = StringValue(value.get("9", "")).parse()
+    scheme = original_scheme.lower()
+
+    # TODO: handle photo identifier
+    if scheme == "phopho":
+        # Parsed for validation only; the identifier is not stored anywhere yet.
+        StringValue(value.get("a", "")).parse()
+        raise IgnoreKey("eco_identifiers")
+    identifiers(self, key, value)
+    raise IgnoreKey("eco_identifiers")
+
+
+@model.over("eco_urls", "^8564[1_]", override=True)
+@for_each_value
+def urls_eco(self, key, value):
+    """Translate 8564 URLs into `related_identifiers` (reads `$q` when present); rejects unknown `$x`."""
+    note = value.get("x", "").strip().lower()
+    if note and not ("icon" in note or note in ["pdfa", "1"]):
+        raise UnexpectedValue(field=key, subfield="x", value=note)
+    if value.get("q", ""):
+        identifier = urls(self, key, value, subfield="q")
+    else:
+        identifier = urls(self, key, value)
+    if not identifier:  # same guard as `related_ids`: the base rule produced nothing
+        raise IgnoreKey("eco_urls")
+    rel_ids = self.get("related_identifiers", [])
+    if identifier[0] not in rel_ids:
+        rel_ids.append(identifier[0])
+    self["related_identifiers"] = rel_ids
+    raise IgnoreKey("eco_urls")
+
+
+@model.over("resource_type", "^980__", override=True)
+def resource_type(self, key, value):
+    """Map 980 `$a` (or `$b` when `$a` is absent) to a resource type id; unknown values raise."""
+    value = value.get("a") if "a" in value else value.get("b")
+    if value:
+        value = value.strip().lower()
+
+    # TODO: are they correct?
+    mapping = {
+        "poster": {"id": "poster"},
+        "brochure": {"id": "publication-brochure"},
+        "note": {"id": "publication-technicalnote"},
+        "conferencepaper": {"id": "publication-conferencepaper"},
+    }
+
+    try:
+        return mapping[value]
+    except KeyError:
+        raise UnexpectedValue("Unknown resource type (ECO)", field=key, value=value)
+
+
+@model.over("collection", "^690C_", override=True)
+@for_each_value
+def collection(self, key, value):
+    """Translate 690C_a into a `collection:<name>` subject; "CERN" is ignored, unknown values raise."""
+    name = value.get("a", "")
+    normalized = name.strip().upper()
+    if normalized == "CERN":
+        raise IgnoreKey("collection")
+    # NOTE(review): the model query also selects 690C_ CERN*BROCHURE records; if that matches
+    # `$a`, those records end up in the error branch below — confirm this is intended.
+    if normalized not in ["POSTER", "PREPRINT"]:
+        raise UnexpectedValue(subfield="a", field=key, value=name)
+    subjects = self.get("subjects", [])
+    subjects.append(
+        {
+            "subject": f"collection:{name}",
+        }
+    )
+    self["subjects"] = subjects
+    raise IgnoreKey("collection")
+
+
+@model.over("related_ids", "^773__")
+@for_each_value
+def related_ids(self, key, value):
+    """Translate 773: without `$u` delegate to `journal`, else add the URL to `related_identifiers`."""
+    # TODO: how to transform? https://cds.cern.ch/record/1452204/export/xm
+    related_link = value.get("u", "")
+    if not related_link:
+        journal(self, key, value)
+        raise IgnoreKey("related_ids")
+
+    # Transform like the base `urls` rule
+    rel_ids = urls(self, key, value)
+    if not rel_ids:
+        raise IgnoreKey("related_ids")
+    rel_id = rel_ids[0]
+    related_identifiers = self.get("related_identifiers", [])
+    if rel_id not in related_identifiers:
+        related_identifiers.append(rel_id)
+    self["related_identifiers"] = related_identifiers
+
+    raise IgnoreKey("related_ids")
+
+
+@model.over("submitter_info", "^923__")
+@for_each_value
+def submitter_info(self, key, value):
+    """Translate 923__r ("Family, Given") into a `contactperson` contributor."""
+    parts = [part.strip() for part in value.get("r", "").split(",")]
+    if not any(parts):  # empty or missing `$r`: do not add a nameless contributor
+        raise IgnoreKey("submitter_info")
+    if len(parts) == 2:
+        names = {"family_name": parts[0], "given_name": parts[1]}
+    else:
+        names = {"family_name": parts[0]}
+    contributor = {
+        "person_or_org": {
+            "type": "personal",
+            **names,
+        },
+        "role": {"id": "contactperson"},
+    }
+    contributors = self.get("contributors", [])
+    contributors.append(contributor)
+    self["contributors"] = contributors
+    raise IgnoreKey("submitter_info")
+
+
+@model.over("languages", "^041__", override=True)
+@for_each_value
+@require(["a"])
+def language(self, key, value):
+    """Translate 041__a; combined values such as "eng-fre" or "eng/fre" are split per code."""
+    langs = value.get("a")
+    languages = self.get("languages", [])
+    if "-" in langs or "/" in langs:
+        # https://cds.cern.ch/record/921930/export/xm
+        for lang in re.split(r"[-/]+", langs):
+            lang = lang.strip().lower()  # tolerate "eng / fre" and upper-case codes
+            if not lang:
+                continue
+            if lang == "fre":
+                lang = "fra"
+            try:
+                # If it's a 2-letter code
+                if len(lang) == 2:
+                    lang_obj = pycountry.languages.get(alpha_2=lang)
+                else:
+                    lang_obj = pycountry.languages.get(alpha_3=lang)
+                if not lang_obj:
+                    lang_obj = pycountry.languages.lookup(lang)
+                languages.append({"id": lang_obj.alpha_3.lower()})
+            except (KeyError, AttributeError, LookupError):
+                raise UnexpectedValue(field=key, subfield="a", value=langs)
+    else:
+        new_langs = base_languages(self, key, value)
+        languages.extend(new_langs)
+    self["languages"] = languages
+    raise IgnoreKey("language")
+
+
+@model.over("eco_contributors", "^700__", override=True)
+@for_each_value
+@require(["a"])
+def eco_contributors(self, key, value):
+    """Translate 700 into a contributor, dropping a placeholder "-" affiliation first."""
+    value = dict(value)
+    affiliation = value.get("u", "").strip()
+    # Some records have "-" as affiliation: 1614471, 1953712
+    if affiliation == "-":
+        value.pop("u", None)
+    contributor = process_contributors(key, value)
+    contributors = self.get("contributors", [])
+    contributors.append(contributor)
+    self["contributors"] = contributors
+    raise IgnoreKey("eco_contributors")
+
+
+@model.over("field_993", "^993__", override=True)
+@for_each_value
+def field_993(self, key, value):
+    """Translate 993__q into a subject keyword; values outside the known list raise."""
+    keyword = value.get("q", "")
+    if keyword and keyword not in ["Project Management"]:
+        # The value is read from `$q`, so that is the subfield to report.
+        raise UnexpectedValue(field=key, subfield="q", value=keyword)
+    # A missing `$q` must not produce an empty-string subject.
+    if keyword:
+        _subjects = self.get("subjects", [])
+        _subjects.append({"subject": keyword})
+        self["subjects"] = _subjects
+    raise IgnoreKey("field_993")
diff --git a/cds_migrator_kit/rdm/streams.yaml b/cds_migrator_kit/rdm/streams.yaml index 87e52a0a..19e66e4c 100644 --- a/cds_migrator_kit/rdm/streams.yaml +++ b/cds_migrator_kit/rdm/streams.yaml @@ -122,3 +122,14 @@ records: missing_users: cds_migrator_kit/rdm/data/users communities_ids: - "" + eco: + data_dir: cds_migrator_kit/rdm/data/eco + tmp_dir: cds_migrator_kit/rdm/tmp/eco + log_dir: cds_migrator_kit/rdm/log/eco + extract: + dirpath: cds_migrator_kit/rdm/data/eco/dump/ + transform: + files_dump_dir: cds_migrator_kit/rdm/data/eco/files/ + missing_users: cds_migrator_kit/rdm/data/users + communities_ids: + - "c8ab44fd-f130-4e52-9bff-70de14913bde" \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index 493dceca..3a2ce538 100644 --- a/setup.cfg +++ b/setup.cfg @@ -83,6 +83,7
@@ cds_migrator_kit.migrator.models = en = cds_migrator_kit.rdm.records.transform.models.en:en_model annual_rep = cds_migrator_kit.rdm.records.transform.models.annual_report:annual_rep_model fap = cds_migrator_kit.rdm.records.transform.models.fap:fap_model + eco = cds_migrator_kit.rdm.records.transform.models.eco:eco_model cds_migrator_kit.migrator.rules.base = base = cds_migrator_kit.transform.xml_processing.rules.base cds_migrator_kit.migrator.rdm.rules.base = @@ -164,6 +165,10 @@ cds_migrator_kit.migrator.rules.fap = base = cds_migrator_kit.transform.xml_processing.rules.base base_records = cds_migrator_kit.rdm.records.transform.xml_processing.rules.base fap = cds_migrator_kit.rdm.records.transform.xml_processing.rules.fap +cds_migrator_kit.migrator.rules.eco = + base = cds_migrator_kit.transform.xml_processing.rules.base + base_records = cds_migrator_kit.rdm.records.transform.xml_processing.rules.base + eco = cds_migrator_kit.rdm.records.transform.xml_processing.rules.eco cds_migrator_kit.migrator.rules.people = people = cds_migrator_kit.rdm.users.transform.xml_processing.rules.people invenio_pidstore.minters =