Survival
In [1]:
Copied!
import os
import sys

import pandas as pd

# Resolve the directory two levels above the current working directory; it is
# used both as an import root and as the parent of the `data` folder.
project_root = os.path.abspath(os.path.join(os.getcwd(), "../../"))

# Guard the append so that re-running this cell does not keep stacking
# duplicate entries onto sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

data_dir = os.path.join(project_root, 'data')
data_file_path = os.path.join(data_dir, 'RADCURE_processed_clinical.csv')

# Read the CSV with its first column as the index and drop the "Study ID"
# column. The result is reassigned rather than mutated in place, so `df` is
# always derived directly from the file on every run of the cell.
df = pd.read_csv(data_file_path, index_col=0).drop(columns=["Study ID"])
import os
import sys

import pandas as pd

# Resolve the directory two levels above the current working directory; it is
# used both as an import root and as the parent of the `data` folder.
project_root = os.path.abspath(os.path.join(os.getcwd(), "../../"))

# Guard the append so that re-running this cell does not keep stacking
# duplicate entries onto sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

data_dir = os.path.join(project_root, 'data')
data_file_path = os.path.join(data_dir, 'RADCURE_processed_clinical.csv')

# Read the CSV with its first column as the index and drop the "Study ID"
# column. The result is reassigned rather than mutated in place, so `df` is
# always derived directly from the file on every run of the cell.
df = pd.read_csv(data_file_path, index_col=0).drop(columns=["Study ID"])
In [2]:
Copied!
from jarvais.analyzer import Analyzer
from rich import print  # NOTE: shadows the built-in print for the rest of the notebook

# Columns passed to the Analyzer as categorical. The list includes 'death',
# which is also used as the target variable below.
CATEGORICAL_COLUMNS = [
    "Sex",
    "T Stage",
    "N Stage",
    "Stage",
    "Smoking Status",
    "Disease Site",
    "death",
    "HPV Combined",
    "Chemotherapy",
]

# Columns passed to the Analyzer as continuous.
CONTINUOUS_COLUMNS = [
    "survival_time",
    "age at dx",
    "Dose",
]

# Configure the analyzer on the loaded frame; its outputs go under
# ./survival_outputs/analyzer.
analyzer = Analyzer(
    data=df,
    output_dir='./survival_outputs/analyzer',
    categorical_columns=CATEGORICAL_COLUMNS,
    continuous_columns=CONTINUOUS_COLUMNS,
    target_variable='death',
    task='classification',
)

# Print the analyzer object with rich's print, then run the analysis.
print(analyzer)
analyzer.run()
from jarvais.analyzer import Analyzer
from rich import print  # NOTE: shadows the built-in print for the rest of the notebook

# Columns passed to the Analyzer as categorical. The list includes 'death',
# which is also used as the target variable below.
CATEGORICAL_COLUMNS = [
    "Sex",
    "T Stage",
    "N Stage",
    "Stage",
    "Smoking Status",
    "Disease Site",
    "death",
    "HPV Combined",
    "Chemotherapy",
]

# Columns passed to the Analyzer as continuous.
CONTINUOUS_COLUMNS = [
    "survival_time",
    "age at dx",
    "Dose",
]

# Configure the analyzer on the loaded frame; its outputs go under
# ./survival_outputs/analyzer.
analyzer = Analyzer(
    data=df,
    output_dir='./survival_outputs/analyzer',
    categorical_columns=CATEGORICAL_COLUMNS,
    continuous_columns=CONTINUOUS_COLUMNS,
    target_variable='death',
    task='classification',
)

# Print the analyzer object with rich's print, then run the analysis.
print(analyzer)
analyzer.run()
/home/joshua-siraj/Documents/CDI/jarvais/.pixi/envs/dev/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm 14:41:01 [warning ] Date columns not specified. Inferring from remaining columns. [jarvais] call=analyzer.__init__:76
Analyzer( AnalyzerSettings( output_dir=PosixPath('survival_outputs/analyzer'), categorical_columns=[ 'Sex', 'T Stage', 'N Stage', 'Stage', 'Smoking Status', 'Disease Site', 'death', 'HPV Combined', 'Chemotherapy' ], continuous_columns=['survival_time', 'age at dx', 'Dose'], date_columns=[], task='classification', target_variable='death', generate_report=True, settings_path=None, settings_schema_path=None, missingness=MissingnessModule( categorical_strategy={ 'Sex': 'unknown', 'T Stage': 'unknown', 'N Stage': 'unknown', 'Stage': 'unknown', 'Smoking Status': 'unknown', 'Disease Site': 'unknown', 'death': 'unknown', 'HPV Combined': 'unknown', 'Chemotherapy': 'unknown' }, continuous_strategy={'survival_time': 'median', 'age at dx': 'median', 'Dose': 'median'}, enabled=True ), outlier=OutlierModule( categorical_strategy={ 'Sex': 'frequency', 'T Stage': 'frequency', 'N Stage': 'frequency', 'Stage': 'frequency', 'Smoking Status': 'frequency', 'Disease Site': 'frequency', 'death': 'frequency', 'HPV Combined': 'frequency', 'Chemotherapy': 'frequency' }, continuous_strategy={'survival_time': 'none', 'age at dx': 'none', 'Dose': 'none'}, threshold=0.01, enabled=True ), encoding=OneHotEncodingModule( columns=[ 'Sex', 'T Stage', 'N Stage', 'Stage', 'Smoking Status', 'Disease Site', 'HPV Combined', 'Chemotherapy' ], target_variable='death', prefix_sep='|', enabled=True ), visualization=VisualizationModule( plots=['corr', 'pairplot', 'umap', 'frequency_table', 'multiplot'], enabled=True ) ) )
[info ] Performing missingness analysis... [jarvais] call=missingness.__call__:43 [info ] Performing outlier analysis... [jarvais] call=outlier.__call__:53 [info ] Plotting Correlation Matrix... [jarvais] call=visualization.__call__:115
+--------------------------+-------------------+-----------+-------------+ | | | Missing | Overall | +==========================+===================+===========+=============+ | n | | | 3346 | +--------------------------+-------------------+-----------+-------------+ | survival_time, mean (SD) | | 0 | 4.1 (2.7) | +--------------------------+-------------------+-----------+-------------+ | age at dx, mean (SD) | | 0 | 62.3 (11.6) | +--------------------------+-------------------+-----------+-------------+ | Dose, mean (SD) | | 0 | 66.7 (5.8) | +--------------------------+-------------------+-----------+-------------+ | Sex, n (%) | Female | | 686 (20.5) | +--------------------------+-------------------+-----------+-------------+ | | Male | | 2660 (79.5) | +--------------------------+-------------------+-----------+-------------+ | T Stage, n (%) | None | | 12 (0.4) | +--------------------------+-------------------+-----------+-------------+ | | T0 | | 167 (5.0) | +--------------------------+-------------------+-----------+-------------+ | | T1 | | 454 (13.6) | +--------------------------+-------------------+-----------+-------------+ | | T1 (2) | | 1 (0.0) | +--------------------------+-------------------+-----------+-------------+ | | T1a | | 179 (5.3) | +--------------------------+-------------------+-----------+-------------+ | | T1b | | 88 (2.6) | +--------------------------+-------------------+-----------+-------------+ | | T2 | | 927 (27.7) | +--------------------------+-------------------+-----------+-------------+ | | T2 (2) | | 1 (0.0) | +--------------------------+-------------------+-----------+-------------+ | | T2a | | 4 (0.1) | +--------------------------+-------------------+-----------+-------------+ | | T2b | | 5 (0.1) | +--------------------------+-------------------+-----------+-------------+ | | T3 | | 861 (25.7) | +--------------------------+-------------------+-----------+-------------+ | | T3 (2) | | 3 (0.1) | 
+--------------------------+-------------------+-----------+-------------+ | | T4 | | 116 (3.5) | +--------------------------+-------------------+-----------+-------------+ | | T4a | | 358 (10.7) | +--------------------------+-------------------+-----------+-------------+ | | T4b | | 121 (3.6) | +--------------------------+-------------------+-----------+-------------+ | | TX | | 4 (0.1) | +--------------------------+-------------------+-----------+-------------+ | | Tis | | 44 (1.3) | +--------------------------+-------------------+-----------+-------------+ | | rT0 | | 1 (0.0) | +--------------------------+-------------------+-----------+-------------+ | N Stage, n (%) | N0 | | 1147 (34.3) | +--------------------------+-------------------+-----------+-------------+ | | N1 | | 344 (10.3) | +--------------------------+-------------------+-----------+-------------+ | | N2 | | 182 (5.4) | +--------------------------+-------------------+-----------+-------------+ | | N2a | | 125 (3.7) | +--------------------------+-------------------+-----------+-------------+ | | N2b | | 791 (23.6) | +--------------------------+-------------------+-----------+-------------+ | | N2c | | 532 (15.9) | +--------------------------+-------------------+-----------+-------------+ | | N3 | | 170 (5.1) | +--------------------------+-------------------+-----------+-------------+ | | N3a | | 13 (0.4) | +--------------------------+-------------------+-----------+-------------+ | | N3b | | 28 (0.8) | +--------------------------+-------------------+-----------+-------------+ | | NX | | 1 (0.0) | +--------------------------+-------------------+-----------+-------------+ | | None | | 13 (0.4) | +--------------------------+-------------------+-----------+-------------+ | Stage, n (%) | 0 | | 44 (1.3) | +--------------------------+-------------------+-----------+-------------+ | | I | | 352 (10.5) | +--------------------------+-------------------+-----------+-------------+ | | IB | | 1 (0.0) | 
+--------------------------+-------------------+-----------+-------------+ | | II | | 400 (12.0) | +--------------------------+-------------------+-----------+-------------+ | | IIA | | 2 (0.1) | +--------------------------+-------------------+-----------+-------------+ | | IIB | | 1 (0.0) | +--------------------------+-------------------+-----------+-------------+ | | III | | 605 (18.1) | +--------------------------+-------------------+-----------+-------------+ | | IIIA | | 2 (0.1) | +--------------------------+-------------------+-----------+-------------+ | | IIIC | | 2 (0.1) | +--------------------------+-------------------+-----------+-------------+ | | IV | | 12 (0.4) | +--------------------------+-------------------+-----------+-------------+ | | IVA | | 1581 (47.3) | +--------------------------+-------------------+-----------+-------------+ | | IVB | | 309 (9.2) | +--------------------------+-------------------+-----------+-------------+ | | IVC | | 2 (0.1) | +--------------------------+-------------------+-----------+-------------+ | | None | | 27 (0.8) | +--------------------------+-------------------+-----------+-------------+ | | X | | 6 (0.2) | +--------------------------+-------------------+-----------+-------------+ | Smoking Status, n (%) | Current | | 1139 (34.0) | +--------------------------+-------------------+-----------+-------------+ | | Ex-smoker | | 1290 (38.6) | +--------------------------+-------------------+-----------+-------------+ | | Non-smoker | | 872 (26.1) | +--------------------------+-------------------+-----------+-------------+ | | unknown | | 45 (1.3) | +--------------------------+-------------------+-----------+-------------+ | Disease Site, n (%) | benign tumor | | 1 (0.0) | +--------------------------+-------------------+-----------+-------------+ | | esophagus | | 33 (1.0) | +--------------------------+-------------------+-----------+-------------+ | | hypopharynx | | 162 (4.8) | 
+--------------------------+-------------------+-----------+-------------+ | | lacrimal gland | | 1 (0.0) | +--------------------------+-------------------+-----------+-------------+ | | larynx | | 877 (26.2) | +--------------------------+-------------------+-----------+-------------+ | | lip & oral cavity | | 100 (3.0) | +--------------------------+-------------------+-----------+-------------+ | | nasal cavity | | 62 (1.9) | +--------------------------+-------------------+-----------+-------------+ | | nasopharynx | | 355 (10.6) | +--------------------------+-------------------+-----------+-------------+ | | orbit | | 1 (0.0) | +--------------------------+-------------------+-----------+-------------+ | | oropharynx | | 1501 (44.9) | +--------------------------+-------------------+-----------+-------------+ | | other | | 2 (0.1) | +--------------------------+-------------------+-----------+-------------+ | | paraganglioma | | 7 (0.2) | +--------------------------+-------------------+-----------+-------------+ | | paranasal sinus | | 28 (0.8) | +--------------------------+-------------------+-----------+-------------+ | | salivary glands | | 4 (0.1) | +--------------------------+-------------------+-----------+-------------+ | | sarcoma | | 20 (0.6) | +--------------------------+-------------------+-----------+-------------+ | | skin | | 24 (0.7) | +--------------------------+-------------------+-----------+-------------+ | | unknown | | 168 (5.0) | +--------------------------+-------------------+-----------+-------------+ | death, n (%) | 0 | | 2288 (68.4) | +--------------------------+-------------------+-----------+-------------+ | | 1 | | 1058 (31.6) | +--------------------------+-------------------+-----------+-------------+ | HPV Combined, n (%) | 1.0 | | 1139 (34.0) | +--------------------------+-------------------+-----------+-------------+ | | None | | 2207 (66.0) | +--------------------------+-------------------+-----------+-------------+ | Chemotherapy, 
n (%) | 0 | | 1923 (57.5) | +--------------------------+-------------------+-----------+-------------+ | | 1 | | 1423 (42.5) | +--------------------------+-------------------+-----------+-------------+ Outlier Report: - No Outliers found in Sex - Outliers found in T Stage: ['nan: 12 out of 3346', 'T2b: 5 out of 3346', 'T2a: 4 out of 3346', 'TX: 4 out of 3346', 'T3 (2): 3 out of 3346', 'T2 (2): 1 out of 3346', 'T1 (2): 1 out of 3346', 'rT0: 1 out of 3346'] - Outliers found in N Stage: ['N3b: 28 out of 3346', 'N3a: 13 out of 3346', 'nan: 13 out of 3346', 'NX: 1 out of 3346'] - Outliers found in Stage: ['nan: 27 out of 3346', 'IV: 12 out of 3346', 'X: 6 out of 3346', 'IIA: 2 out of 3346', 'IIIA: 2 out of 3346', 'IIIC: 2 out of 3346', 'IVC: 2 out of 3346', 'IB: 1 out of 3346', 'IIB: 1 out of 3346'] - No Outliers found in Smoking Status - Outliers found in Disease Site: ['paranasal sinus: 28 out of 3346', 'skin: 24 out of 3346', 'sarcoma: 20 out of 3346', 'paraganglioma: 7 out of 3346', 'salivary glands: 4 out of 3346', 'other: 2 out of 3346', 'benign tumor: 1 out of 3346', 'lacrimal gland: 1 out of 3346', 'orbit: 1 out of 3346'] - No Outliers found in death - No Outliers found in HPV Combined - No Outliers found in Chemotherapy
[info ] Plotting Pairplot... [jarvais] call=visualization.__call__:118 14:41:04 [info ] Plotting UMAP... [jarvais] call=visualization.__call__:124 14:41:13 [info ] Plotting Frequency Table... [jarvais] call=visualization.__call__:121 14:41:22 [info ] Plotting Multiplot... [jarvais] call=visualization.__call__:136 Font MPDFAA+Inter28ptBold is missing the following glyphs: ' ' (\n)
In [3]:
Copied!
from jarvais.trainer import TrainerSupervised

# Read updated_data.csv from the analyzer's output directory and rename the
# two outcome columns to 'time' and 'event', the names passed to
# trainer.run below.
df = (
    pd.read_csv('./survival_outputs/analyzer/updated_data.csv')
    .rename(columns={'survival_time': 'time', 'death': 'event'})
)

# NOTE(review): the trainer writes under './radcure_outputs/...' while the
# input is read from './survival_outputs/...' -- confirm this directory is the
# intended one for this notebook.
trainer = TrainerSupervised(
    task='survival',
    output_dir='./radcure_outputs/ED_trainer_explainer',
)

# Second argument lists the outcome columns: event indicator and time.
trainer.run(df, ['event', 'time'])
from jarvais.trainer import TrainerSupervised

# Read updated_data.csv from the analyzer's output directory and rename the
# two outcome columns to 'time' and 'event', the names passed to
# trainer.run below.
df = (
    pd.read_csv('./survival_outputs/analyzer/updated_data.csv')
    .rename(columns={'survival_time': 'time', 'death': 'event'})
)

# NOTE(review): the trainer writes under './radcure_outputs/...' while the
# input is read from './survival_outputs/...' -- confirm this directory is the
# intended one for this notebook.
trainer = TrainerSupervised(
    task='survival',
    output_dir='./radcure_outputs/ED_trainer_explainer',
)

# Second argument lists the outcome columns: event indicator and time.
trainer.run(df, ['event', 'time'])
Training MTLR... Best trial: Params: C1: 0.01 dropout: 0.3292835147016483 dims: [64, 64] Training DeepSurv... Best trial: Params: l2_reg: 0.006451742136969566 dropout: 0.360636124048913 dims: [256, 256, 256] Training CoxPH... Training GradientBoosting... Training RandomForest... Training SVM... Consolidated C-index Scores: MTLR: 0.6482 DeepSurv: 0.6863 CoxPH: 0.7212 GradientBoosting: 0.7152 RandomForest: 0.7156 SVM: 0.7156
In [4]:
Copied!
from jarvais.explainer import Explainer
# Build an Explainer from the `trainer` object created in the training cell;
# this cell therefore requires that cell to have been run first.
exp = Explainer.from_trainer(trainer)
# Run the explainer. The recorded output for this cell shows subgroup
# analyses ("Possible Bias Detected") for 'Disease Site' and 'Smoking Status'.
exp.run()
from jarvais.explainer import Explainer
# Build an Explainer from the `trainer` object created in the training cell;
# this cell therefore requires that cell to have been run first.
exp = Explainer.from_trainer(trainer)
# Run the explainer. The recorded output for this cell shows subgroup
# analyses ("Possible Bias Detected") for 'Disease Site' and 'Smoking Status'.
exp.run()
⚠️ **Possible Bias Detected in Disease Site** ⚠️ === Subgroup Analysis for 'Disease Site' Using Cox Proportional Hazards Model === Model Statistics: AIC (Partial): 2393.09 Log-Likelihood: -1187.55 Log-Likelihood Ratio p-value: 0.0003 Concordance Index (C-index): 0.59 Model Coefficients: +--------------------------------+---------------+------------------+ | Feature | Coefficient | Standard Error | +================================+===============+==================+ | Disease Site_Other | 0.424 | 4.571 | +--------------------------------+---------------+------------------+ | Disease Site_esophagus | 0.795 | 4.577 | +--------------------------------+---------------+------------------+ | Disease Site_hypopharynx | 0.746 | 4.556 | +--------------------------------+---------------+------------------+ | Disease Site_larynx | 0.202 | 4.552 | +--------------------------------+---------------+------------------+ | Disease Site_lip & oral cavity | 0.666 | 4.561 | +--------------------------------+---------------+------------------+ | Disease Site_nasal cavity | -1.205 | 4.653 | +--------------------------------+---------------+------------------+ | Disease Site_nasopharynx | -0.925 | 4.558 | +--------------------------------+---------------+------------------+ | Disease Site_oropharynx | -0.046 | 4.551 | +--------------------------------+---------------+------------------+ | Disease Site_unknown | 0.142 | 4.557 | +--------------------------------+---------------+------------------+ ⚠️ **Possible Bias Detected in Smoking Status** ⚠️ === Subgroup Analysis for 'Smoking Status' Using Cox Proportional Hazards Model === Model Statistics: AIC (Partial): 2375.84 Log-Likelihood: -1183.92 Log-Likelihood Ratio p-value: 0.0000 Concordance Index (C-index): 0.61 Model Coefficients: +---------------------------+---------------+------------------+ | Feature | Coefficient | Standard Error | +===========================+===============+==================+ | Smoking Status_Current | 0.508 | 
4.720 | +---------------------------+---------------+------------------+ | Smoking Status_Ex-smoker | 0.070 | 4.720 | +---------------------------+---------------+------------------+ | Smoking Status_Non-smoker | -0.710 | 4.721 | +---------------------------+---------------+------------------+ | Smoking Status_unknown | 0.417 | 4.753 | +---------------------------+---------------+------------------+