-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
113 lines (84 loc) · 3.51 KB
/
utils.py
File metadata and controls
113 lines (84 loc) · 3.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
"""Shared utilities for the OSIRIS-RWD pipeline."""
import csv
import json
from config import IPP_FILE, JSON_FILE, CORRESPONDENCE_FILE
# ============================================================
# Type conversion helpers
# ============================================================
def to_int(val):
    """Return ``int(val)``, passing ``None`` through unchanged.

    Intended for numeric values coming back from Oracle (Decimal or float),
    where a missing value arrives as ``None``. Conversion uses the built-in
    ``int``, so fractional parts are truncated toward zero.
    """
    return None if val is None else int(val)
def map_sex(val):
    """Translate an SI_SEXE code into the OSIRIS-RWD biologicalSex string.

    Code 1 yields "MALE" and code 2 yields "FEMALE"; any other value
    (including ``None``) yields ``None``.
    """
    # Compare with == rather than hashing into a dict so that any value that
    # compares equal to 1 or 2 (e.g. a Decimal) is matched the same way.
    for code, label in ((1, "MALE"), (2, "FEMALE")):
        if val == code:
            return label
    return None
# ============================================================
# IPP list I/O
# ============================================================
def save_ipp_list(ipp_list):
    """Write the IPP list to IPP_FILE as a one-column CSV.

    The file starts with an "ipp" header row, followed by one row per entry
    of ``ipp_list`` in iteration order. A summary line is printed afterwards.
    """
    with open(IPP_FILE, "w", newline="", encoding="utf-8") as out:
        sink = csv.writer(out)
        sink.writerow(["ipp"])
        # One single-cell row per IPP.
        sink.writerows([value] for value in ipp_list)
    print(f"IPP list saved: {IPP_FILE} ({len(ipp_list)} patients)")
def load_ipp_list():
    """Load the IPP list from CSV.

    Reads IPP_FILE, which must have an "ipp" header column.

    Returns:
        list[str]: The "ipp" column values in file order, exactly as stored
        in the file (no zero-padding or other normalization is applied).

    Raises:
        OSError: If IPP_FILE cannot be opened.
        KeyError: If the file has rows but no "ipp" column.
    """
    # newline="" hands raw line endings to the csv module, as its
    # documentation requires for any file object passed to a reader;
    # otherwise quoted fields containing newlines are misparsed.
    with open(IPP_FILE, "r", newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        ipp_list = [row["ipp"] for row in reader]
    print(f"IPP list loaded: {IPP_FILE} ({len(ipp_list)} patients)")
    return ipp_list
# ============================================================
# Pseudonymization
# ============================================================
def generate_pseudonyms(ipp_list):
    """Generate sequential pseudonym IDs and save the correspondence table.

    IPPs are normalized to 10 digits (stripped, then zero-padded) to avoid
    leading-zero mismatches between databases (EDS 9-digit vs CHIMIO
    10-digit). Pseudonyms are numbered in order of first appearance.

    If several input values normalize to the same IPP, they share the
    pseudonym assigned at the first occurrence and do not consume a number,
    so the IDs stay contiguous ("PSCC1", "PSCC2", ...) with no gaps.

    The table is written to CORRESPONDENCE_FILE as CSV with the header
    "ipp,pseudo_id", one row per unique normalized IPP.

    Args:
        ipp_list: Iterable of IPP values; each is converted with ``str``.

    Returns:
        dict[str, str]: Mapping of normalized IPP -> pseudo_id.
    """
    correspondence = {}
    for ipp in ipp_list:
        normalized = str(ipp).strip().zfill(10)
        # Number by unique count rather than input position: a repeated IPP
        # (e.g. "123456789" and "0123456789") must neither overwrite the ID
        # already assigned nor leave a hole in the sequence.
        if normalized not in correspondence:
            correspondence[normalized] = f"PSCC{len(correspondence) + 1}"
    with open(CORRESPONDENCE_FILE, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["ipp", "pseudo_id"])
        writer.writerows(correspondence.items())
    print(f"Correspondence table saved: {CORRESPONDENCE_FILE}")
    return correspondence
def load_pseudonyms():
    """Load an existing correspondence table from CSV.

    Reads CORRESPONDENCE_FILE, which must have "ipp" and "pseudo_id" header
    columns. IPPs are normalized to 10 digits (stripped, then zero-padded)
    on load. If two rows normalize to the same IPP, the later row wins.

    Returns:
        dict[str, str]: Mapping of normalized IPP -> pseudo_id.

    Raises:
        OSError: If CORRESPONDENCE_FILE cannot be opened.
        KeyError: If a row lacks the "ipp" or "pseudo_id" column.
    """
    correspondence = {}
    # newline="" hands raw line endings to the csv module, as its
    # documentation requires for any file object passed to a reader.
    with open(CORRESPONDENCE_FILE, "r", newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            ipp = str(row["ipp"]).strip().zfill(10)
            correspondence[ipp] = row["pseudo_id"]
    print(f"Correspondence table loaded: {CORRESPONDENCE_FILE}")
    return correspondence
# ============================================================
# JSON I/O
# ============================================================
def save_patients_json(patients):
    """Save the patient list to the OSIRIS-RWD JSON file.

    The data is written to JSON_FILE as UTF-8, indented by two spaces, with
    non-ASCII characters kept as-is (``ensure_ascii=False``).

    Args:
        patients: JSON-serializable collection of patient records; must
            support ``len`` for the summary message.

    Raises:
        TypeError: If ``patients`` contains a value the json module cannot
            serialize. In that case JSON_FILE is left untouched.
        OSError: If JSON_FILE cannot be opened for writing.
    """
    # Serialize fully before opening the file: opening with "w" truncates
    # it immediately, so a serialization error mid-dump would otherwise
    # replace a previously valid file with a partial, unparseable one.
    payload = json.dumps(patients, indent=2, ensure_ascii=False)
    with open(JSON_FILE, "w", encoding="utf-8") as f:
        f.write(payload)
    print(f"JSON saved: {JSON_FILE} ({len(patients)} patients)")
def load_patients_json():
    """Read JSON_FILE (UTF-8) and return the decoded patient collection.

    A summary line with the number of loaded entries is printed before
    returning.
    """
    with open(JSON_FILE, "r", encoding="utf-8") as src:
        raw_text = src.read()
    patients = json.loads(raw_text)
    print(f"JSON loaded: {JSON_FILE} ({len(patients)} patients)")
    return patients