-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path03DiseaseControlStatisticalAnalysis.py
More file actions
103 lines (86 loc) · 3.3 KB
/
03DiseaseControlStatisticalAnalysis.py
File metadata and controls
103 lines (86 loc) · 3.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import pandas as pd
import numpy as np
from functools import reduce
from scipy.stats import pearsonr
from scipy import stats
import statsmodels.api as sm
from statsmodels.tools.sm_exceptions import PerfectSeparationError
def run_logit(data, prediction_col, covariates=["Female", "Age"]):
"""
Perform logistic regression on the specified variable and return statistical results including odds ratio (OR), p-value, confidence interval (CI).
If errors such as matrix singularity occur, return all fields as "SM".
"""
try:
X_full = data[[prediction_col] + covariates].astype(float)
X_full = sm.add_constant(X_full)
y = data["Label"]
model_full = sm.Logit(y, X_full).fit(disp=0)
p_values = model_full.pvalues
z_values = model_full.params / model_full.bse
coef = model_full.params.get(prediction_col, None)
se = model_full.bse.get(prediction_col, None)
OR = np.exp(coef) if coef is not None else None
lower_CI = np.exp(coef - 1.96 * se) if coef is not None else None
upper_CI = np.exp(coef + 1.96 * se) if coef is not None else None
return {
"summary": model_full.summary(),
"logOR": coef,
"se_logOR": se,
"z_value": z_values.get(prediction_col, None),
"p_value": p_values.get(prediction_col, None),
"OR": OR,
"95% CI": (lower_CI, upper_CI),
}
except (np.linalg.LinAlgError, PerfectSeparationError) as e:
print(f"[Skip] {prediction_col} - Singular matrix or perfect separation")
return {
"summary": "SM",
"logOR": "SM",
"se_logOR": "SM",
"z_value": "SM",
"p_value": "SM",
"OR": "SM",
"95% CI": ("SM", "SM"),
}
AgeAcc = pd.read_pickle("./AgeAccelerationResidual.pickle")
Confunder = pd.read_pickle("./Confunder.pickle")
Log_p_List = []
Log_ZScore_List = []
pList = []
meanControllist = []
meanCaseList= []
logOR_List = []
OR_List = []
CI_List = []
colNeed = AgeAcc.columns.drop(["Tag"])
for col in colNeed:
if col != "Label":
# print(col)
data = AgeAcc[[col]].join(Confunder[["Age", "Female", "Label"]])
Result_Dict = run_logit(data, col)
control = data[data.Label.eq(0)]
case = data[data.Label.eq(1)]
_, p = stats.mannwhitneyu(control[col].dropna(), case[col].dropna())
meanControl = round(control[col].median(),3)
meanCase = round(case[col].median(),3)
differ = meanCase - meanControl
Log_p_List.append(Result_Dict["p_value"])
Log_ZScore_List.append(Result_Dict["z_value"])
OR_List.append(Result_Dict["OR"])
logOR_List.append(Result_Dict["logOR"])
CI_List.append(Result_Dict["95% CI"])
pList.append(p)
meanControllist.append(meanControl)
meanCaseList.append(differ)
Data = pd.DataFrame(data=[Log_p_List, Log_ZScore_List, logOR_List, OR_List, CI_List, pList, meanCaseList],
columns=AgeAcc.columns,
index=[
"Log_P",
"Log_ZScore",
"logOR",
"OR",
"CI",
"MW_P",
"Differ"
])
Data