OSIRIS-RWD_mapping/utils.py at main · drci-foch/OSIRIS-RWD_mapping · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
"""Shared utilities for the OSIRIS-RWD pipeline."""

import csv
import json
from config import IPP_FILE, JSON_FILE, CORRESPONDENCE_FILE


# ============================================================
# Type conversion helpers
# ============================================================


def to_int(val):
    """Coerce a database numeric value (Oracle Decimal/float) to ``int``.

    ``None`` is passed through unchanged; any other value is handed to
    ``int()``, which truncates fractional parts toward zero.
    """
    return None if val is None else int(val)


def map_sex(val):
    """Translate an SI_SEXE code into the OSIRIS-RWD biologicalSex label.

    A value equal to 1 gives "MALE" and a value equal to 2 gives
    "FEMALE"; anything else (including ``None``) gives ``None``.
    """
    # Compare with == (rather than a dict lookup) so numeric types that
    # compare equal to 1 or 2 are matched exactly as before.
    for code, label in ((1, "MALE"), (2, "FEMALE")):
        if val == code:
            return label
    return None


# ============================================================
# IPP list I/O
# ============================================================


def save_ipp_list(ipp_list):
    """Write the IPP list to IPP_FILE as a single-column CSV.

    The file gets an "ipp" header row followed by one row per entry of
    *ipp_list*. A summary line is printed after the file is closed.
    """
    header = ["ipp"]
    with open(IPP_FILE, "w", newline="", encoding="utf-8") as out:
        csv_out = csv.writer(out)
        csv_out.writerow(header)
        # Each IPP becomes its own one-cell row.
        csv_out.writerows([ipp] for ipp in ipp_list)
    print(f"IPP list saved: {IPP_FILE} ({len(ipp_list)} patients)")


def load_ipp_list():
    """Load the IPP list from the CSV file at IPP_FILE.

    The file must have an "ipp" header column (the layout written by
    ``save_ipp_list``). Values are returned as strings exactly as
    stored; no zero-padding or other normalization is applied here.

    Returns:
        list[str]: one IPP per data row, in file order.

    Raises:
        KeyError: if the CSV header has no "ipp" column.
    """
    # newline="" hands line-ending handling to the csv module, as the
    # csv documentation requires for files passed to a reader.
    with open(IPP_FILE, "r", newline="", encoding="utf-8") as f:
        ipp_list = [row["ipp"] for row in csv.DictReader(f)]
    print(f"IPP list loaded: {IPP_FILE} ({len(ipp_list)} patients)")
    return ipp_list


# ============================================================
# Pseudonymization
# ============================================================


def generate_pseudonyms(ipp_list):
    """Generate sequential pseudonym IDs and save the correspondence table.

    IPPs are normalized to 10 digits (stripped, then zero-padded) to
    avoid leading-zero mismatches between databases (EDS 9-digit vs
    CHIMIO 10-digit). Pseudonyms are numbered by distinct normalized
    IPP in order of first appearance, so an IPP that occurs more than
    once (possibly under different paddings) keeps a single pseudonym
    and the numbering has no gaps.

    Args:
        ipp_list: iterable of IPP values; each is converted with ``str``.

    Returns:
        dict[str, str]: normalized IPP -> pseudo_id ("PSCC1", "PSCC2", ...).

    Side effects:
        Overwrites CORRESPONDENCE_FILE with an "ipp,pseudo_id" CSV and
        prints a confirmation line.
    """
    correspondence = {}
    for ipp in ipp_list:
        normalized = str(ipp).strip().zfill(10)
        # Number from the count of distinct keys rather than the raw list
        # position: a repeated IPP would otherwise overwrite its earlier
        # pseudonym and leave a hole in the PSCC sequence.
        if normalized not in correspondence:
            correspondence[normalized] = f"PSCC{len(correspondence) + 1}"

    with open(CORRESPONDENCE_FILE, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["ipp", "pseudo_id"])
        writer.writerows(correspondence.items())

    print(f"Correspondence table saved: {CORRESPONDENCE_FILE}")
    return correspondence


def load_pseudonyms():
    """Load an existing correspondence table from CORRESPONDENCE_FILE.

    The CSV must have "ipp" and "pseudo_id" header columns. IPPs are
    normalized to 10 digits (stripped, then zero-padded) on load so
    lookups match regardless of how the value was padded in the file.
    If several rows normalize to the same IPP, the last row wins.

    Returns:
        dict[str, str]: normalized IPP -> pseudo_id.

    Raises:
        KeyError: if the CSV header lacks "ipp" or "pseudo_id".
    """
    correspondence = {}
    # newline="" hands line-ending handling to the csv module, as the
    # csv documentation requires for files passed to a reader.
    with open(CORRESPONDENCE_FILE, "r", newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            ipp = str(row["ipp"]).strip().zfill(10)
            correspondence[ipp] = row["pseudo_id"]
    print(f"Correspondence table loaded: {CORRESPONDENCE_FILE}")
    return correspondence


# ============================================================
# JSON I/O
# ============================================================


def save_patients_json(patients):
    """Serialize *patients* to JSON_FILE as the OSIRIS-RWD JSON export.

    The output is UTF-8, indented by two spaces, with non-ASCII
    characters written as-is rather than escaped. A summary line is
    printed after the file is closed.
    """
    dump_options = {"indent": 2, "ensure_ascii": False}
    with open(JSON_FILE, mode="w", encoding="utf-8") as out:
        json.dump(patients, out, **dump_options)
    print(f"JSON saved: {JSON_FILE} ({len(patients)} patients)")


def load_patients_json():
    """Read the OSIRIS-RWD JSON export at JSON_FILE and return its content.

    The decoded value is returned as parsed; a summary line with its
    length is printed first.
    """
    with open(JSON_FILE, mode="r", encoding="utf-8") as src:
        raw_text = src.read()
        patients = json.loads(raw_text)
    print(f"JSON loaded: {JSON_FILE} ({len(patients)} patients)")
    return patients