Skin-PathwayAge/03DiseaseControlStatisticalAnalysis.py at master · BioTransAI/Skin-PathwayAge · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import pandas as pd
import numpy as np
from functools import reduce
from scipy.stats import pearsonr
from scipy import stats
import statsmodels.api as sm
from statsmodels.tools.sm_exceptions import PerfectSeparationError

def run_logit(data, prediction_col, covariates=["Female", "Age"]):
    """
    Perform logistic regression on the specified variable and return statistical results including odds ratio (OR), p-value, confidence interval (CI).
    If errors such as matrix singularity occur, return all fields as "SM".
    """

    try:
        X_full = data[[prediction_col] + covariates].astype(float)
        X_full = sm.add_constant(X_full)
        y = data["Label"]
        model_full = sm.Logit(y, X_full).fit(disp=0)

        p_values = model_full.pvalues
        z_values = model_full.params / model_full.bse
        coef = model_full.params.get(prediction_col, None)
        se = model_full.bse.get(prediction_col, None)
        OR = np.exp(coef) if coef is not None else None
        lower_CI = np.exp(coef - 1.96 * se) if coef is not None else None
        upper_CI = np.exp(coef + 1.96 * se) if coef is not None else None

        return {
            "summary": model_full.summary(),
            "logOR": coef,
            "se_logOR": se,
            "z_value": z_values.get(prediction_col, None),
            "p_value": p_values.get(prediction_col, None),
            "OR": OR,
            "95% CI": (lower_CI, upper_CI),

        }

    except (np.linalg.LinAlgError, PerfectSeparationError) as e:
        print(f"[Skip] {prediction_col} - Singular matrix or perfect separation")
        return {
            "summary": "SM",
            "logOR": "SM",
            "se_logOR": "SM",
            "z_value": "SM",
            "p_value": "SM",
            "OR": "SM",
            "95% CI": ("SM", "SM"),
        }


AgeAcc = pd.read_pickle("./AgeAccelerationResidual.pickle")
Confunder = pd.read_pickle("./Confunder.pickle")

Log_p_List = []
Log_ZScore_List = []
pList = []
meanControllist = []
meanCaseList= []
logOR_List = []
OR_List = []
CI_List = []

colNeed = AgeAcc.columns.drop(["Tag"])

for col in colNeed:
    if col != "Label":

        # print(col)
        data = AgeAcc[[col]].join(Confunder[["Age", "Female", "Label"]])
        Result_Dict = run_logit(data, col)
        control = data[data.Label.eq(0)]
        case = data[data.Label.eq(1)]
        _, p = stats.mannwhitneyu(control[col].dropna(), case[col].dropna())
        meanControl = round(control[col].median(),3)
        meanCase = round(case[col].median(),3)
        differ = meanCase - meanControl
        Log_p_List.append(Result_Dict["p_value"])
        Log_ZScore_List.append(Result_Dict["z_value"])
        OR_List.append(Result_Dict["OR"])
        logOR_List.append(Result_Dict["logOR"])
        CI_List.append(Result_Dict["95% CI"])
        pList.append(p)
        meanControllist.append(meanControl)
        meanCaseList.append(differ)

Data = pd.DataFrame(data=[Log_p_List, Log_ZScore_List, logOR_List, OR_List, CI_List, pList, meanCaseList],
                    columns=AgeAcc.columns,
                    index=[
                        "Log_P",
                        "Log_ZScore",
                        "logOR",
                        "OR",
                        "CI",
                        "MW_P",
                        "Differ"
                    ])
Data