From 0ddf7ef15546efe977ec18469011cb99393a5f61 Mon Sep 17 00:00:00 2001 From: Juraj Roka <95219754+jr-rk@users.noreply.github.com> Date: Mon, 3 Nov 2025 17:23:19 +0100 Subject: [PATCH 1/8] Script for correcting dc.date.issued formats --- tools/add_metadata/README.md | 28 +++++ tools/add_metadata/add_metadata.py | 183 +++++++++++++++++++++++++---- 2 files changed, 188 insertions(+), 23 deletions(-) diff --git a/tools/add_metadata/README.md b/tools/add_metadata/README.md index 41df652..cbf4f6f 100644 --- a/tools/add_metadata/README.md +++ b/tools/add_metadata/README.md @@ -9,6 +9,34 @@ Dry run: python add_metadata.py --dry-run --endpoint="http://dev-5.pc:86/server/api/" --to_mtd_field dc.date.issued --from_mtd_field dc.date.submitted dc.date.committed dc.date.defense dc.date ``` +## Fix date format in dc.date.issued + +This mode corrects date formats in existing `dc.date.issued` fields without using other metadata fields. + +**Behavior:** +- **Null/empty values**: Kept untouched +- **Year-only values** (`YYYY`): Kept as-is (e.g., `2020` stays `2020`) +- **Full dates with wrong format**: Converted to `YYYY-MM-DD` (e.g., `30.5.2025` → `2025-05-30`) +- **Partial dates**: Converted to `YYYY-MM-DD` with `01` for missing parts (e.g., `5/2025` → `2025-05-01`) +- **Anomalies**: Unparseable dates are logged and reported + +Dry run: +``` +python add_metadata.py --fix-date-format --dry-run +``` + +Real run: +``` +python add_metadata.py --fix-date-format +``` + +## TUL fix date format in dc.date.issued + +``` +set ENVFILE=.env-tul +python add_metadata.py --fix-date-format --endpoint="https://dspace.tul.cz/server/api/" --dry-run +``` + ## TUL update dc.date.issued ``` diff --git a/tools/add_metadata/add_metadata.py b/tools/add_metadata/add_metadata.py index d678dd7..421206d 100644 --- a/tools/add_metadata/add_metadata.py +++ b/tools/add_metadata/add_metadata.py @@ -72,6 +72,17 @@ def input(self) -> str: def value(self) -> str: return self._d + @staticmethod + def is_year_only(date_str: str) -> bool: + """Check if the string represents a year-only format (YYYY).""" + if len(date_str) != 4 or not date_str.isdigit(): + return False + try: + datetime.strptime(date_str, '%Y') + return True + except ValueError: + return False + def is_valid(self): """Check if the given string is a valid date.""" try: @@ -83,6 +94,24 @@ def is_valid(self): _logger.debug(f"[{self._d}] is not valid date. Error: {e}") return False + def is_valid_relaxed(self): + """Check if the given string is a valid date in YYYY-MM-DD or YYYY format.""" + # Check if it's already in YYYY-MM-DD format + try: + datetime.strptime(self._d, '%Y-%m-%d') + return True + except ValueError: + pass + + # Check if it's in YYYY format (year only - also valid) + if date.is_year_only(self._d): + return True + + date.invalid[self._d] += 1 + if date.invalid[self._d] == 1: + _logger.debug(f"[{self._d}] is not valid date format (expected YYYY-MM-DD or YYYY)") + return False + def parse(self) -> bool: """Convert the value to a date format. Normalize date to 'YYYY-MM-DD' format, filling missing parts with '01'.""" if len(self._d) < 1: @@ -107,6 +136,47 @@ def parse(self) -> bool: _logger.warning(f"Error converting [{self._d}] to date.") return False + def parse_relaxed(self) -> bool: + """Convert the value to a date format with relaxed rules. + - Keep YYYY format as-is (year only is valid) + - If date is not full (missing month/day), extract only the year (YYYY) + - Only full dates are converted to 'YYYY-MM-DD' format + """ + if len(self._d) < 1: + return False + + # Check if it's already year-only format (YYYY) - keep it as-is + if date.is_year_only(self._d): + return True # Year only is valid, keep as-is + + # Try full date formats only (with day, month, and year) + full_date_formats = ['%Y/%m/%d', '%d/%m/%Y', '%Y.%m.%d', '%d.%m.%Y', + '%Y-%m-%d', '%d-%m-%Y', '%d. %m. %Y'] + for fmt in full_date_formats: + try: + datetime_obj = datetime.strptime(self._d, fmt) + # Normalize to 'YYYY-MM-DD' + self._d = datetime_obj.strftime('%Y-%m-%d') + return True + except ValueError: + continue + + # If not a full date, try to extract year from partial formats + partial_formats = [('%Y-%m', '%Y'), ('%m-%Y', '%Y'), + ('%Y/%m', '%Y'), ('%m/%Y', '%Y'), + ('%Y.%m', '%Y'), ('%m.%Y', '%Y')] + for parse_fmt, output_fmt in partial_formats: + try: + datetime_obj = datetime.strptime(self._d, parse_fmt) + # Extract only the year + self._d = datetime_obj.strftime('%Y') + return True + except ValueError: + continue + + _logger.warning(f"Error converting [{self._d}] to date.") + return False + def update_item(item_d: dict): item = Item(item_d) @@ -129,16 +199,20 @@ class updater: ret_invalid_meta = 4 ret_empty_meta = 4 - def __init__(self, dspace_be, from_mtd_fields: list, to_mtd_field: list, dry_run: bool = False): + def __init__(self, dspace_be, from_mtd_fields: list, to_mtd_field: list, dry_run: bool = False, fix_date_mode: bool = False): self._dspace_be = dspace_be self._from_mtd_fields = from_mtd_fields self._to_mtd_field = to_mtd_field self._dry_run = dry_run + self._fix_date_mode = fix_date_mode self._info = { "valid": [], + "valid_year_only": [], + "null_values": [], "multiple": set(), "invalid_date": [], "invalid_date_all": set(), + "anomalies": [], "updated": [], "error_updating": [], "error_creating": [], @@ -177,12 +251,33 @@ def find_correct_metadata(self, item: dict): return None, None - def update_existing_metadata(self, item: dict, date_str: str, force: bool = False) -> int: - uuid = item['uuid'] + def _perform_update(self, item: dict, date_val: date, uuid: str, id_str: str) -> int: + """Common logic for updating item metadata in database.""" item_mtd = item["metadata"] + + # Log conversion + date.invalid_but_converted[date_val.input] += 1 + if date.invalid_but_converted[date_val.input] == 1: + _logger.info(f"{id_str}: invalid date [{date_val.input}] converted to [{date_val.value}]") + # Update the item metadata with the converted date + item_mtd[self._to_mtd_field][0]["value"] = date_val.value + item["metadata"] = item_mtd + + # Update the item in the database + updated_ok = self._dry_run or update_item(item) + if not updated_ok: + _logger.error(f"{id_str}: error updating item") + self._info["error_updating"].append((uuid, date_val.input)) + return updater.ret_failed + + self._info["updated"].append((uuid, date_val.input)) + return updater.ret_updated + + def update_existing_metadata(self, item: dict, date_str: str, force: bool = False) -> int: + uuid = item['uuid'] id_str = f"Item [{uuid}]: [{self._to_mtd_field}]" - # If there is more than one value, get only the first one + date_val = date(date_str) if not force: if date_val.is_valid(): @@ -195,24 +290,38 @@ def update_existing_metadata(self, item: dict, date_str: str, force: bool = Fals self._info["invalid_date"].append((uuid, date_val.input)) return updater.ret_invalid_meta - # Convert date to correct format if necessary - date.invalid_but_converted[date_val.input] += 1 - if date.invalid_but_converted[date_val.input] == 1: - _logger.info(f"{id_str}: invalid date [{date_val.input}] converted") + return self._perform_update(item, date_val, uuid, id_str) - # Update the item metadata with the converted date - item_mtd[self._to_mtd_field][0]["value"] = date_val.value - item["metadata"] = item_mtd + def update_existing_metadata_relaxed(self, item: dict, date_str: str, force: bool = False) -> int: + """Update existing metadata with relaxed date format rules (for --fix-date-format mode).""" + uuid = item['uuid'] + id_str = f"Item [{uuid}]: [{self._to_mtd_field}]" + + # Check for null/empty values - keep them untouched + if date_str is None or date_str.strip() == "": + self._info["null_values"].append((uuid, date_str)) + _logger.info(f"{id_str}: null/empty value - keeping as-is") + return updater.ret_already_ok + + date_val = date(date_str) + if not force: + if date_val.is_valid_relaxed(): + # Check if it's year-only format + if date.is_year_only(date_str): + self._info["valid_year_only"].append((uuid, date_val.input)) + _logger.info(f"{id_str}: year-only format [{date_str}] - keeping as-is") + else: + self._info["valid"].append((uuid, date_val.input)) + return updater.ret_already_ok - # Update the item in the database - updated_ok = self._dry_run or update_item(item) - if not updated_ok: - _logger.error(f"{id_str}: error updating item") - self._info["error_updating"].append((uuid, date_val.input)) - return updater.ret_failed + parsed_ok = date_val.parse_relaxed() + if parsed_ok is False: + _logger.error(f"{id_str}: cannot convert [{date_val.input}] to date - ANOMALY") + self._info["invalid_date"].append((uuid, date_val.input)) + self._info["anomalies"].append((uuid, date_val.input, "Cannot parse date format")) + return updater.ret_invalid_meta - self._info["updated"].append((uuid, date_val.input)) - return updater.ret_updated + return self._perform_update(item, date_val, uuid, id_str) def add_new_metadata(self, item) -> int: uuid = item['uuid'] @@ -275,7 +384,11 @@ def update(self, item: dict, force: bool = False) -> int: f"Forced metadata change but no value found for [{uuid}]") return updater.ret_empty_meta - return self.update_existing_metadata(item, val, force=force) + # Use relaxed method for fix-date-format mode + if self._fix_date_mode: + return self.update_existing_metadata_relaxed(item, val, force=force) + else: + return self.update_existing_metadata(item, val, force=force) else: return self.add_new_metadata(item) @@ -312,10 +425,12 @@ def print_info(self, show_limit=100): if __name__ == '__main__': parser = argparse.ArgumentParser(description="Add metadata for DSpace items") parser.add_argument("--to_mtd_field", - type=str, required=True, help="Metadata field to be created.") + type=str, required=False, help="Metadata field to be created.") parser.add_argument("--from_mtd_field", - type=str, nargs='+', required=True, + type=str, nargs='+', required=False, help="Metadata field(s) from which value(s) can be used.") + parser.add_argument("--fix-date-format", action='store_true', default=False, + help="Fix date format in dc.date.issued field (no other parameters needed)") parser.add_argument("--endpoint", type=str, default=env["backend"]["endpoint"]) parser.add_argument("--user", type=str, default=env["backend"]["user"]) parser.add_argument("--password", type=str, default=env["backend"]["password"]) @@ -323,6 +438,18 @@ def print_info(self, show_limit=100): parser.add_argument("--result-every-N", type=int, default=10000) parser.add_argument("--only", type=str, default=None) args = parser.parse_args() + + # Handle fix-date-format mode + if args.fix_date_format: + args.to_mtd_field = "dc.date.issued" + args.from_mtd_field = ["dc.date.issued"] + _logger.info("Fix date format mode enabled: correcting dc.date.issued") + + # Validate required arguments for non-fix-date-format mode + if not args.fix_date_format: + if args.to_mtd_field is None or args.from_mtd_field is None: + parser.error("--to_mtd_field and --from_mtd_field are required unless --fix-date-format is used") + # output args from parse_args but without passwords args_dict = vars(args).copy() args_dict.pop("password", None) @@ -341,7 +468,8 @@ def print_info(self, show_limit=100): # Initialize DSpace backend dspace_be = dspace.rest(endpoint, user, password, True) - upd = updater(dspace_be, args.from_mtd_field, args.to_mtd_field, dry_run=args.dry_run) + upd = updater(dspace_be, args.from_mtd_field, args.to_mtd_field, + dry_run=args.dry_run, fix_date_mode=args.fix_date_format) stats = additional_stats() @@ -436,6 +564,15 @@ def print_info(self, show_limit=100): for k, v in upd.info.items(): _logger.info(f"{k:20s}:{len(v):6d}: first {limit} items .. {list(v)[:limit]}...") + _logger.info(40 * "=") + _logger.info("Anomalies found:") + if len(upd.info["anomalies"]) > 0: + _logger.warning(f"Total anomalies: {len(upd.info['anomalies'])}") + for uuid, value, reason in upd.info["anomalies"][:100]: # Show first 100 + _logger.warning(f" Item [{uuid}]: value=[{value}] - {reason}") + else: + _logger.info("No anomalies found") + _logger.info(40 * "=") _logger.info("Date info") msgs = "\n\t".join(upd.cannot_parse) From 69323053d0a1bced1a2eeb42e64237b10ffdd54b Mon Sep 17 00:00:00 2001 From: Juraj Roka <95219754+jr-rk@users.noreply.github.com> Date: Wed, 5 Nov 2025 07:24:10 +0100 Subject: [PATCH 2/8] Edited the README --- tools/add_metadata/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/add_metadata/README.md b/tools/add_metadata/README.md index cbf4f6f..7313500 100644 --- a/tools/add_metadata/README.md +++ b/tools/add_metadata/README.md @@ -17,7 +17,7 @@ This mode corrects date formats in existing `dc.date.issued` fields without usin - **Null/empty values**: Kept untouched - **Year-only values** (`YYYY`): Kept as-is (e.g., `2020` stays `2020`) - **Full dates with wrong format**: Converted to `YYYY-MM-DD` (e.g., `30.5.2025` → `2025-05-30`) -- **Partial dates**: Converted to `YYYY-MM-DD` with `01` for missing parts (e.g., `5/2025` → `2025-05-01`) +- **Partial dates**: Converted to year-only format (e.g., `5/2025` → `2025`) - **Anomalies**: Unparseable dates are logged and reported Dry run: From edb3a9e28d376210d54ecc2f13cb5842fe10c275 Mon Sep 17 00:00:00 2001 From: Juraj Roka <95219754+jr-rk@users.noreply.github.com> Date: Wed, 5 Nov 2025 07:26:11 +0100 Subject: [PATCH 3/8] Edited the help text --- tools/add_metadata/add_metadata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/add_metadata/add_metadata.py b/tools/add_metadata/add_metadata.py index 421206d..90adf66 100644 --- a/tools/add_metadata/add_metadata.py +++ b/tools/add_metadata/add_metadata.py @@ -425,7 +425,7 @@ def print_info(self, show_limit=100): if __name__ == '__main__': parser = argparse.ArgumentParser(description="Add metadata for DSpace items") parser.add_argument("--to_mtd_field", - type=str, required=False, help="Metadata field to be created.") + type=str, required=False, help="Metadata field to be created or updated.") parser.add_argument("--from_mtd_field", type=str, nargs='+', required=False, help="Metadata field(s) from which value(s) can be used.") From a5b6dadbad955fffa0e32e9f2ccce9638f7ea9f9 Mon Sep 17 00:00:00 2001 From: Juraj Roka <95219754+jr-rk@users.noreply.github.com> Date: Wed, 5 Nov 2025 07:30:42 +0100 Subject: [PATCH 4/8] Edited the help text --- tools/add_metadata/add_metadata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/add_metadata/add_metadata.py b/tools/add_metadata/add_metadata.py index 90adf66..f800cfb 100644 --- a/tools/add_metadata/add_metadata.py +++ b/tools/add_metadata/add_metadata.py @@ -425,7 +425,7 @@ def print_info(self, show_limit=100): if __name__ == '__main__': parser = argparse.ArgumentParser(description="Add metadata for DSpace items") parser.add_argument("--to_mtd_field", - type=str, required=False, help="Metadata field to be created or updated.") + type=str, required=False, help="Metadata field to be created or updated (required unless --fix-date-format is used).") parser.add_argument("--from_mtd_field", type=str, nargs='+', required=False, help="Metadata field(s) from which value(s) can be used.") From e5f721d66df9a3fe7f7e5f29fe420571c9c50f00 Mon Sep 17 00:00:00 2001 From: Juraj Roka <95219754+jr-rk@users.noreply.github.com> Date: Wed, 5 Nov 2025 08:01:20 +0100 Subject: [PATCH 5/8] Reduced the unnecessary amount of text in README --- tools/add_metadata/README.md | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/tools/add_metadata/README.md b/tools/add_metadata/README.md index 7313500..fa13990 100644 --- a/tools/add_metadata/README.md +++ b/tools/add_metadata/README.md @@ -11,23 +11,8 @@ python add_metadata.py --dry-run --endpoint="http://dev-5.pc:86/server/api/" --t ## Fix date format in dc.date.issued -This mode corrects date formats in existing `dc.date.issued` fields without using other metadata fields. - -**Behavior:** -- **Null/empty values**: Kept untouched -- **Year-only values** (`YYYY`): Kept as-is (e.g., `2020` stays `2020`) -- **Full dates with wrong format**: Converted to `YYYY-MM-DD` (e.g., `30.5.2025` → `2025-05-30`) -- **Partial dates**: Converted to year-only format (e.g., `5/2025` → `2025`) -- **Anomalies**: Unparseable dates are logged and reported - -Dry run: -``` -python add_metadata.py --fix-date-format --dry-run -``` - -Real run: ``` -python add_metadata.py --fix-date-format +python add_metadata.py --fix-date-format --endpoint="https://dspace.tul.cz/server/api/" --user="..." --password="..." ``` ## TUL fix date format in dc.date.issued From 8603b9d89d9e9a4f8b319f3dbb8d4e21de1fc6cc Mon Sep 17 00:00:00 2001 From: Juraj Roka <95219754+jr-rk@users.noreply.github.com> Date: Wed, 5 Nov 2025 08:05:03 +0100 Subject: [PATCH 6/8] Reduced the unnecessary amount of text in README - ensure consistency --- tools/add_metadata/README.md | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tools/add_metadata/README.md b/tools/add_metadata/README.md index fa13990..9cd6612 100644 --- a/tools/add_metadata/README.md +++ b/tools/add_metadata/README.md @@ -9,12 +9,6 @@ Dry run: python add_metadata.py --dry-run --endpoint="http://dev-5.pc:86/server/api/" --to_mtd_field dc.date.issued --from_mtd_field dc.date.submitted dc.date.committed dc.date.defense dc.date ``` -## Fix date format in dc.date.issued - -``` -python add_metadata.py --fix-date-format --endpoint="https://dspace.tul.cz/server/api/" --user="..." --password="..." -``` - ## TUL fix date format in dc.date.issued ``` From 1e7008da976a977e59751e42b9ce4310a96b3175 Mon Sep 17 00:00:00 2001 From: Juraj Roka <95219754+jr-rk@users.noreply.github.com> Date: Thu, 13 Nov 2025 14:47:50 +0100 Subject: [PATCH 7/8] Add hybrid date format handling that preserves YYYY and YYYY-MM formats while normalizing separators and logging anomalies --- tools/add_metadata/add_metadata.py | 95 +++++++++++++++++++++++++++--- 1 file changed, 86 insertions(+), 9 deletions(-) diff --git a/tools/add_metadata/add_metadata.py b/tools/add_metadata/add_metadata.py index f800cfb..2bfa500 100644 --- a/tools/add_metadata/add_metadata.py +++ b/tools/add_metadata/add_metadata.py @@ -112,6 +112,31 @@ def is_valid_relaxed(self): _logger.debug(f"[{self._d}] is not valid date format (expected YYYY-MM-DD or YYYY)") return False + def is_valid_hybrid(self): + """Check if date is valid in YYYY-MM-DD, YYYY-MM, or YYYY format (all kept as-is).""" + # Check YYYY-MM-DD format + try: + datetime.strptime(self._d, '%Y-%m-%d') + return True + except ValueError: + pass + + # Check YYYY-MM format (partial date) + try: + datetime.strptime(self._d, '%Y-%m') + return True + except ValueError: + pass + + # Check YYYY format (year only) + if date.is_year_only(self._d): + return True + + date.invalid[self._d] += 1 + if date.invalid[self._d] == 1: + _logger.debug(f"[{self._d}] is not valid date format (expected YYYY-MM-DD, YYYY-MM, or YYYY)") + return False + def parse(self) -> bool: """Convert the value to a date format. Normalize date to 'YYYY-MM-DD' format, filling missing parts with '01'.""" if len(self._d) < 1: @@ -177,6 +202,45 @@ def parse_relaxed(self) -> bool: _logger.warning(f"Error converting [{self._d}] to date.") return False + def parse_hybrid(self) -> bool: + """Convert date with hybrid rules: + - Keep YYYY format as-is (year only) + - Keep YYYY-MM format as-is (partial date, but normalize separators) + - Convert full dates to YYYY-MM-DD format + """ + if len(self._d) < 1: + return False + + # Check if it's already year-only format (YYYY) - keep as-is + if date.is_year_only(self._d): + return True + + # Try full date formats (with day, month, and year) + full_date_formats = ['%Y/%m/%d', '%d/%m/%Y', '%Y.%m.%d', '%d.%m.%Y', + '%Y-%m-%d', '%d-%m-%Y', '%d. %m. %Y'] + for fmt in full_date_formats: + try: + datetime_obj = datetime.strptime(self._d, fmt) + # Normalize to 'YYYY-MM-DD' + self._d = datetime_obj.strftime('%Y-%m-%d') + return True + except ValueError: + continue + + # Try partial date formats (year-month) - normalize to YYYY-MM + partial_formats = ['%Y-%m', '%m-%Y', '%Y/%m', '%m/%Y', '%Y.%m', '%m.%Y'] + for fmt in partial_formats: + try: + datetime_obj = datetime.strptime(self._d, fmt) + # Normalize to 'YYYY-MM' (keep as partial date) + self._d = datetime_obj.strftime('%Y-%m') + return True + except ValueError: + continue + + _logger.warning(f"Error converting [{self._d}] to date.") + return False + def update_item(item_d: dict): item = Item(item_d) @@ -275,19 +339,35 @@ def _perform_update(self, item: dict, date_val: date, uuid: str, id_str: str) -> return updater.ret_updated def update_existing_metadata(self, item: dict, date_str: str, force: bool = False) -> int: + """Update existing metadata with hybrid rules: + - No null/empty handling (will crash on None) + - YYYY formats kept as-is + - Partial dates (YYYY-MM) kept as-is (normalized) + - Invalid dates logged as ANOMALY + """ uuid = item['uuid'] id_str = f"Item [{uuid}]: [{self._to_mtd_field}]" + # No null/empty handling - let it crash if needed date_val = date(date_str) if not force: - if date_val.is_valid(): - self._info["valid"].append((uuid, date_val.input)) + if date_val.is_valid_hybrid(): + # Check if it's year-only or partial format + if date.is_year_only(date_str): + self._info["valid_year_only"].append((uuid, date_val.input)) + _logger.info(f"{id_str}: year-only format [{date_str}] - keeping as-is") + elif len(date_str) == 7 and date_str[4] == '-': # YYYY-MM format + self._info["valid"].append((uuid, date_val.input)) + _logger.info(f"{id_str}: partial date format [{date_str}] - keeping as-is") + else: + self._info["valid"].append((uuid, date_val.input)) return updater.ret_already_ok - parsed_ok = date_val.parse() + parsed_ok = date_val.parse_hybrid() if parsed_ok is False: - _logger.error(f"{id_str}: cannot convert [{date_val.input}] to date") + _logger.error(f"{id_str}: cannot convert [{date_val.input}] to date - ANOMALY") self._info["invalid_date"].append((uuid, date_val.input)) + self._info["anomalies"].append((uuid, date_val.input, "Cannot parse date format")) return updater.ret_invalid_meta return self._perform_update(item, date_val, uuid, id_str) @@ -384,11 +464,8 @@ def update(self, item: dict, force: bool = False) -> int: f"Forced metadata change but no value found for [{uuid}]") return updater.ret_empty_meta - # Use relaxed method for fix-date-format mode - if self._fix_date_mode: - return self.update_existing_metadata_relaxed(item, val, force=force) - else: - return self.update_existing_metadata(item, val, force=force) + # Always use standard validation/parsing (no relaxed mode) + return self.update_existing_metadata(item, val, force=force) else: return self.add_new_metadata(item) From 5353f332e414ef9d28c0ac88d535bdae570db4ac Mon Sep 17 00:00:00 2001 From: Juraj Roka <95219754+jr-rk@users.noreply.github.com> Date: Thu, 13 Nov 2025 15:55:23 +0100 Subject: [PATCH 8/8] Consolidate date handling to hybrid validation and remove dead code --- tools/add_metadata/add_metadata.py | 137 ++--------------------------- 1 file changed, 5 insertions(+), 132 deletions(-) diff --git a/tools/add_metadata/add_metadata.py b/tools/add_metadata/add_metadata.py index 2bfa500..c58d771 100644 --- a/tools/add_metadata/add_metadata.py +++ b/tools/add_metadata/add_metadata.py @@ -83,35 +83,6 @@ def is_year_only(date_str: str) -> bool: except ValueError: return False - def is_valid(self): - """Check if the given string is a valid date.""" - try: - datetime.strptime(self._d, '%Y-%m-%d') - return True - except ValueError as e: - date.invalid[self._d] += 1 - if date.invalid[self._d] == 1: - _logger.debug(f"[{self._d}] is not valid date. Error: {e}") - return False - - def is_valid_relaxed(self): - """Check if the given string is a valid date in YYYY-MM-DD or YYYY format.""" - # Check if it's already in YYYY-MM-DD format - try: - datetime.strptime(self._d, '%Y-%m-%d') - return True - except ValueError: - pass - - # Check if it's in YYYY format (year only - also valid) - if date.is_year_only(self._d): - return True - - date.invalid[self._d] += 1 - if date.invalid[self._d] == 1: - _logger.debug(f"[{self._d}] is not valid date format (expected YYYY-MM-DD or YYYY)") - return False - def is_valid_hybrid(self): """Check if date is valid in YYYY-MM-DD, YYYY-MM, or YYYY format (all kept as-is).""" # Check YYYY-MM-DD format @@ -137,71 +108,6 @@ def is_valid_hybrid(self): _logger.debug(f"[{self._d}] is not valid date format (expected YYYY-MM-DD, YYYY-MM, or YYYY)") return False - def parse(self) -> bool: - """Convert the value to a date format. Normalize date to 'YYYY-MM-DD' format, filling missing parts with '01'.""" - if len(self._d) < 1: - return False - - formats = ['%Y/%m/%d', '%d/%m/%Y', '%Y.%m.%d', '%d.%m.%Y', '%Y', - '%Y-%m', '%m-%Y', '%Y/%m', '%m/%Y', '%Y.%m', '%m.%Y', '%d. %m. %Y'] - for fmt in formats: - try: - datetime_obj = datetime.strptime(self._d, fmt) - # Normalize date to 'YYYY-MM-DD' - if fmt in ['%Y-%m', '%Y/%m', '%Y.%m', '%m-%Y', "%m/%Y", "%m.%Y"]: - self._d = datetime_obj.strftime('%Y-%m-01') - elif fmt == '%Y': - self._d = datetime_obj.strftime('%Y-01-01') - else: - self._d = datetime_obj.strftime('%Y-%m-%d') - return True - except ValueError: - # The test format does not match the input date format - continue - _logger.warning(f"Error converting [{self._d}] to date.") - return False - - def parse_relaxed(self) -> bool: - """Convert the value to a date format with relaxed rules. - - Keep YYYY format as-is (year only is valid) - - If date is not full (missing month/day), extract only the year (YYYY) - - Only full dates are converted to 'YYYY-MM-DD' format - """ - if len(self._d) < 1: - return False - - # Check if it's already year-only format (YYYY) - keep it as-is - if date.is_year_only(self._d): - return True # Year only is valid, keep as-is - - # Try full date formats only (with day, month, and year) - full_date_formats = ['%Y/%m/%d', '%d/%m/%Y', '%Y.%m.%d', '%d.%m.%Y', - '%Y-%m-%d', '%d-%m-%Y', '%d. %m. %Y'] - for fmt in full_date_formats: - try: - datetime_obj = datetime.strptime(self._d, fmt) - # Normalize to 'YYYY-MM-DD' - self._d = datetime_obj.strftime('%Y-%m-%d') - return True - except ValueError: - continue - - # If not a full date, try to extract year from partial formats - partial_formats = [('%Y-%m', '%Y'), ('%m-%Y', '%Y'), - ('%Y/%m', '%Y'), ('%m/%Y', '%Y'), - ('%Y.%m', '%Y'), ('%m.%Y', '%Y')] - for parse_fmt, output_fmt in partial_formats: - try: - datetime_obj = datetime.strptime(self._d, parse_fmt) - # Extract only the year - self._d = datetime_obj.strftime('%Y') - return True - except ValueError: - continue - - _logger.warning(f"Error converting [{self._d}] to date.") - return False - def parse_hybrid(self) -> bool: """Convert date with hybrid rules: - Keep YYYY format as-is (year only) @@ -263,16 +169,14 @@ class updater: ret_invalid_meta = 4 ret_empty_meta = 4 - def __init__(self, dspace_be, from_mtd_fields: list, to_mtd_field: list, dry_run: bool = False, fix_date_mode: bool = False): + def __init__(self, dspace_be, from_mtd_fields: list, to_mtd_field: list, dry_run: bool = False): self._dspace_be = dspace_be self._from_mtd_fields = from_mtd_fields self._to_mtd_field = to_mtd_field self._dry_run = dry_run - self._fix_date_mode = fix_date_mode self._info = { "valid": [], "valid_year_only": [], - "null_values": [], "multiple": set(), "invalid_date": [], "invalid_date_all": set(), @@ -307,8 +211,8 @@ def find_correct_metadata(self, item: dict): # If there is more than one value, get only the first one meta_val = date(meta_key[0]["value"]) # Convert date if necessary - if not meta_val.is_valid(): - if not meta_val.parse(): + if not meta_val.is_valid_hybrid(): + if not meta_val.parse_hybrid(): self._info["invalid_date_all"].add(meta_val.input) continue return meta_val, id_str @@ -372,37 +276,6 @@ def update_existing_metadata(self, item: dict, date_str: str, force: bool = Fals return self._perform_update(item, date_val, uuid, id_str) - def update_existing_metadata_relaxed(self, item: dict, date_str: str, force: bool = False) -> int: - """Update existing metadata with relaxed date format rules (for --fix-date-format mode).""" - uuid = item['uuid'] - id_str = f"Item [{uuid}]: [{self._to_mtd_field}]" - - # Check for null/empty values - keep them untouched - if date_str is None or date_str.strip() == "": - self._info["null_values"].append((uuid, date_str)) - _logger.info(f"{id_str}: null/empty value - keeping as-is") - return updater.ret_already_ok - - date_val = date(date_str) - if not force: - if date_val.is_valid_relaxed(): - # Check if it's year-only format - if date.is_year_only(date_str): - self._info["valid_year_only"].append((uuid, date_val.input)) - _logger.info(f"{id_str}: year-only format [{date_str}] - keeping as-is") - else: - self._info["valid"].append((uuid, date_val.input)) - return updater.ret_already_ok - - parsed_ok = date_val.parse_relaxed() - if parsed_ok is False: - _logger.error(f"{id_str}: cannot convert [{date_val.input}] to date - ANOMALY") - self._info["invalid_date"].append((uuid, date_val.input)) - self._info["anomalies"].append((uuid, date_val.input, "Cannot parse date format")) - return updater.ret_invalid_meta - - return self._perform_update(item, date_val, uuid, id_str) - def add_new_metadata(self, item) -> int: uuid = item['uuid'] @@ -439,7 +312,7 @@ def update(self, item: dict, force: bool = False) -> int: for i in range(len(date_meta)): if len(val) == 0: date_val = date(date_meta[i]["value"]) - if date_val.is_valid() or date_val.parse(): + if date_val.is_valid_hybrid() or date_val.parse_hybrid(): val = date_val.value continue if val == '' and i == len(date_meta) - 1: @@ -546,7 +419,7 @@ def print_info(self, show_limit=100): dspace_be = dspace.rest(endpoint, user, password, True) upd = updater(dspace_be, args.from_mtd_field, args.to_mtd_field, - dry_run=args.dry_run, fix_date_mode=args.fix_date_format) + dry_run=args.dry_run) stats = additional_stats()