Survival
In [1]:
Copied!
# Load the processed clinical CSV and expose it as `df` with the survival
# columns renamed to `time` / `event`.
import os
import sys

import pandas as pd

# The project root is taken to be two levels above the current working directory.
project_root = os.path.abspath(os.path.join(os.getcwd(), "../../"))
# Guard the append so re-running this cell does not stack duplicate sys.path entries.
if project_root not in sys.path:
    sys.path.append(project_root)

data_dir = os.path.join(project_root, 'data')
data_file_path = os.path.join(data_dir, 'RADCURE_processed_clinical.csv')

# Build the frame in one chain instead of mutating it in place, so the cell
# produces the same `df` from the file every time it is executed.
df = (
    pd.read_csv(data_file_path, index_col=0)
    .drop(columns=["Study ID"])
    .rename(columns={'survival_time': 'time', 'death': 'event'})
)
# Load the processed clinical CSV and expose it as `df` with the survival
# columns renamed to `time` / `event`.
import os
import sys

import pandas as pd

# The project root is taken to be two levels above the current working directory.
project_root = os.path.abspath(os.path.join(os.getcwd(), "../../"))
# Guard the append so re-running this cell does not stack duplicate sys.path entries.
if project_root not in sys.path:
    sys.path.append(project_root)

data_dir = os.path.join(project_root, 'data')
data_file_path = os.path.join(data_dir, 'RADCURE_processed_clinical.csv')

# Build the frame in one chain instead of mutating it in place, so the cell
# produces the same `df` from the file every time it is executed.
df = (
    pd.read_csv(data_file_path, index_col=0)
    .drop(columns=["Study ID"])
    .rename(columns={'survival_time': 'time', 'death': 'event'})
)
In [6]:
Copied!
from jarvais.analyzer import Analyzer
# NOTE: this rebinds the name `print` to rich's print for the rest of the session.
from rich import print

# Configure the Analyzer for the survival task on `df`.
# 'event' is listed with the categorical columns and is also the target variable.
analyzer = Analyzer(
    data=df,
    task='survival',
    target_variable='event',
    output_dir='./survival_outputs/analyzer',
    categorical_columns=[
        "Sex", "T Stage", "N Stage", "Stage", "Smoking Status",
        "Disease Site", "event", "HPV Combined", "Chemotherapy",
    ],
    continuous_columns=["time", "age at dx", "Dose"],
)

# Show the resolved settings, then execute the analysis.
print(analyzer)
analyzer.run()
from jarvais.analyzer import Analyzer
# NOTE: this rebinds the name `print` to rich's print for the rest of the session.
from rich import print

# Configure the Analyzer for the survival task on `df`.
# 'event' is listed with the categorical columns and is also the target variable.
analyzer = Analyzer(
    data=df,
    task='survival',
    target_variable='event',
    output_dir='./survival_outputs/analyzer',
    categorical_columns=[
        "Sex", "T Stage", "N Stage", "Stage", "Smoking Status",
        "Disease Site", "event", "HPV Combined", "Chemotherapy",
    ],
    continuous_columns=["time", "age at dx", "Dose"],
)

# Show the resolved settings, then execute the analysis.
print(analyzer)
analyzer.run()
13:14:57 [warning ] Date columns not specified. Inferring from remaining columns. [jarvais] call=analyzer.__init__:76
Analyzer( AnalyzerSettings( output_dir=PosixPath('survival_outputs/analyzer'), categorical_columns=[ 'Sex', 'T Stage', 'N Stage', 'Stage', 'Smoking Status', 'Disease Site', 'event', 'HPV Combined', 'Chemotherapy' ], continuous_columns=['time', 'age at dx', 'Dose'], date_columns=[], task='survival', target_variable='event', generate_report=True, settings_path=None, settings_schema_path=None, missingness=MissingnessModule( categorical_strategy={ 'Sex': 'unknown', 'T Stage': 'unknown', 'N Stage': 'unknown', 'Stage': 'unknown', 'Smoking Status': 'unknown', 'Disease Site': 'unknown', 'event': 'unknown', 'HPV Combined': 'unknown', 'Chemotherapy': 'unknown' }, continuous_strategy={'time': 'median', 'age at dx': 'median', 'Dose': 'median'}, enabled=True ), outlier=OutlierModule( categorical_strategy={ 'Sex': 'frequency', 'T Stage': 'frequency', 'N Stage': 'frequency', 'Stage': 'frequency', 'Smoking Status': 'frequency', 'Disease Site': 'frequency', 'event': 'frequency', 'HPV Combined': 'frequency', 'Chemotherapy': 'frequency' }, continuous_strategy={'time': 'none', 'age at dx': 'none', 'Dose': 'none'}, threshold=0.01, enabled=True, categorical_mapping={} ), encoding=OneHotEncodingModule( columns=[ 'Sex', 'T Stage', 'N Stage', 'Stage', 'Smoking Status', 'Disease Site', 'HPV Combined', 'Chemotherapy' ], target_variable='event', prefix_sep='|', enabled=True ), visualization=VisualizationModule( plots=['corr', 'pairplot', 'umap', 'frequency_table', 'multiplot', 'kaplan_meier'], enabled=True ) ) )
[info ] Performing missingness analysis... [jarvais] call=missingness.__call__:43 [info ] Performing outlier analysis... [jarvais] call=outlier.__call__:60 13:14:58 [info ] Plotting Correlation Matrix... [jarvais] call=visualization.__call__:115
+-----------------------+-------------------+-----------+-------------+ | | | Missing | Overall | +=======================+===================+===========+=============+ | n | | | 3346 | +-----------------------+-------------------+-----------+-------------+ | time, mean (SD) | | 0 | 4.1 (2.7) | +-----------------------+-------------------+-----------+-------------+ | age at dx, mean (SD) | | 0 | 62.3 (11.6) | +-----------------------+-------------------+-----------+-------------+ | Dose, mean (SD) | | 0 | 66.7 (5.8) | +-----------------------+-------------------+-----------+-------------+ | Sex, n (%) | Female | | 686 (20.5) | +-----------------------+-------------------+-----------+-------------+ | | Male | | 2660 (79.5) | +-----------------------+-------------------+-----------+-------------+ | T Stage, n (%) | None | | 12 (0.4) | +-----------------------+-------------------+-----------+-------------+ | | T0 | | 167 (5.0) | +-----------------------+-------------------+-----------+-------------+ | | T1 | | 454 (13.6) | +-----------------------+-------------------+-----------+-------------+ | | T1 (2) | | 1 (0.0) | +-----------------------+-------------------+-----------+-------------+ | | T1a | | 179 (5.3) | +-----------------------+-------------------+-----------+-------------+ | | T1b | | 88 (2.6) | +-----------------------+-------------------+-----------+-------------+ | | T2 | | 927 (27.7) | +-----------------------+-------------------+-----------+-------------+ | | T2 (2) | | 1 (0.0) | +-----------------------+-------------------+-----------+-------------+ | | T2a | | 4 (0.1) | +-----------------------+-------------------+-----------+-------------+ | | T2b | | 5 (0.1) | +-----------------------+-------------------+-----------+-------------+ | | T3 | | 861 (25.7) | +-----------------------+-------------------+-----------+-------------+ | | T3 (2) | | 3 (0.1) | +-----------------------+-------------------+-----------+-------------+ | | T4 | | 116 (3.5) | 
+-----------------------+-------------------+-----------+-------------+ | | T4a | | 358 (10.7) | +-----------------------+-------------------+-----------+-------------+ | | T4b | | 121 (3.6) | +-----------------------+-------------------+-----------+-------------+ | | TX | | 4 (0.1) | +-----------------------+-------------------+-----------+-------------+ | | Tis | | 44 (1.3) | +-----------------------+-------------------+-----------+-------------+ | | rT0 | | 1 (0.0) | +-----------------------+-------------------+-----------+-------------+ | N Stage, n (%) | N0 | | 1147 (34.3) | +-----------------------+-------------------+-----------+-------------+ | | N1 | | 344 (10.3) | +-----------------------+-------------------+-----------+-------------+ | | N2 | | 182 (5.4) | +-----------------------+-------------------+-----------+-------------+ | | N2a | | 125 (3.7) | +-----------------------+-------------------+-----------+-------------+ | | N2b | | 791 (23.6) | +-----------------------+-------------------+-----------+-------------+ | | N2c | | 532 (15.9) | +-----------------------+-------------------+-----------+-------------+ | | N3 | | 170 (5.1) | +-----------------------+-------------------+-----------+-------------+ | | N3a | | 13 (0.4) | +-----------------------+-------------------+-----------+-------------+ | | N3b | | 28 (0.8) | +-----------------------+-------------------+-----------+-------------+ | | NX | | 1 (0.0) | +-----------------------+-------------------+-----------+-------------+ | | None | | 13 (0.4) | +-----------------------+-------------------+-----------+-------------+ | Stage, n (%) | 0 | | 44 (1.3) | +-----------------------+-------------------+-----------+-------------+ | | I | | 352 (10.5) | +-----------------------+-------------------+-----------+-------------+ | | IB | | 1 (0.0) | +-----------------------+-------------------+-----------+-------------+ | | II | | 400 (12.0) | 
+-----------------------+-------------------+-----------+-------------+ | | IIA | | 2 (0.1) | +-----------------------+-------------------+-----------+-------------+ | | IIB | | 1 (0.0) | +-----------------------+-------------------+-----------+-------------+ | | III | | 605 (18.1) | +-----------------------+-------------------+-----------+-------------+ | | IIIA | | 2 (0.1) | +-----------------------+-------------------+-----------+-------------+ | | IIIC | | 2 (0.1) | +-----------------------+-------------------+-----------+-------------+ | | IV | | 12 (0.4) | +-----------------------+-------------------+-----------+-------------+ | | IVA | | 1581 (47.3) | +-----------------------+-------------------+-----------+-------------+ | | IVB | | 309 (9.2) | +-----------------------+-------------------+-----------+-------------+ | | IVC | | 2 (0.1) | +-----------------------+-------------------+-----------+-------------+ | | None | | 27 (0.8) | +-----------------------+-------------------+-----------+-------------+ | | X | | 6 (0.2) | +-----------------------+-------------------+-----------+-------------+ | Smoking Status, n (%) | Current | | 1139 (34.0) | +-----------------------+-------------------+-----------+-------------+ | | Ex-smoker | | 1290 (38.6) | +-----------------------+-------------------+-----------+-------------+ | | Non-smoker | | 872 (26.1) | +-----------------------+-------------------+-----------+-------------+ | | unknown | | 45 (1.3) | +-----------------------+-------------------+-----------+-------------+ | Disease Site, n (%) | benign tumor | | 1 (0.0) | +-----------------------+-------------------+-----------+-------------+ | | esophagus | | 33 (1.0) | +-----------------------+-------------------+-----------+-------------+ | | hypopharynx | | 162 (4.8) | +-----------------------+-------------------+-----------+-------------+ | | lacrimal gland | | 1 (0.0) | +-----------------------+-------------------+-----------+-------------+ | | larynx | | 877 
(26.2) | +-----------------------+-------------------+-----------+-------------+ | | lip & oral cavity | | 100 (3.0) | +-----------------------+-------------------+-----------+-------------+ | | nasal cavity | | 62 (1.9) | +-----------------------+-------------------+-----------+-------------+ | | nasopharynx | | 355 (10.6) | +-----------------------+-------------------+-----------+-------------+ | | orbit | | 1 (0.0) | +-----------------------+-------------------+-----------+-------------+ | | oropharynx | | 1501 (44.9) | +-----------------------+-------------------+-----------+-------------+ | | other | | 2 (0.1) | +-----------------------+-------------------+-----------+-------------+ | | paraganglioma | | 7 (0.2) | +-----------------------+-------------------+-----------+-------------+ | | paranasal sinus | | 28 (0.8) | +-----------------------+-------------------+-----------+-------------+ | | salivary glands | | 4 (0.1) | +-----------------------+-------------------+-----------+-------------+ | | sarcoma | | 20 (0.6) | +-----------------------+-------------------+-----------+-------------+ | | skin | | 24 (0.7) | +-----------------------+-------------------+-----------+-------------+ | | unknown | | 168 (5.0) | +-----------------------+-------------------+-----------+-------------+ | event, n (%) | 0 | | 2288 (68.4) | +-----------------------+-------------------+-----------+-------------+ | | 1 | | 1058 (31.6) | +-----------------------+-------------------+-----------+-------------+ | HPV Combined, n (%) | 1.0 | | 1139 (34.0) | +-----------------------+-------------------+-----------+-------------+ | | None | | 2207 (66.0) | +-----------------------+-------------------+-----------+-------------+ | Chemotherapy, n (%) | 0 | | 1923 (57.5) | +-----------------------+-------------------+-----------+-------------+ | | 1 | | 1423 (42.5) | +-----------------------+-------------------+-----------+-------------+ Outlier Report: - No Outliers found in Sex - Outliers 
found in T Stage: ['nan: 12 out of 3346', 'T2b: 5 out of 3346', 'T2a: 4 out of 3346', 'TX: 4 out of 3346', 'T3 (2): 3 out of 3346', 'T2 (2): 1 out of 3346', 'T1 (2): 1 out of 3346', 'rT0: 1 out of 3346'] - Outliers found in N Stage: ['N3b: 28 out of 3346', 'N3a: 13 out of 3346', 'nan: 13 out of 3346', 'NX: 1 out of 3346'] - Outliers found in Stage: ['nan: 27 out of 3346', 'IV: 12 out of 3346', 'X: 6 out of 3346', 'IIA: 2 out of 3346', 'IIIA: 2 out of 3346', 'IIIC: 2 out of 3346', 'IVC: 2 out of 3346', 'IB: 1 out of 3346', 'IIB: 1 out of 3346'] - No Outliers found in Smoking Status - Outliers found in Disease Site: ['paranasal sinus: 28 out of 3346', 'skin: 24 out of 3346', 'sarcoma: 20 out of 3346', 'paraganglioma: 7 out of 3346', 'salivary glands: 4 out of 3346', 'other: 2 out of 3346', 'benign tumor: 1 out of 3346', 'lacrimal gland: 1 out of 3346', 'orbit: 1 out of 3346'] - No Outliers found in event - No Outliers found in HPV Combined - No Outliers found in Chemotherapy
[info ] Plotting Pairplot... [jarvais] call=visualization.__call__:118 13:15:00 [info ] Plotting UMAP... [jarvais] call=visualization.__call__:124 13:15:06 [info ] Plotting Frequency Table... [jarvais] call=visualization.__call__:121 13:15:16 [info ] Plotting Kaplan Meier Curves... [jarvais] call=visualization.__call__:127 13:15:19 [info ] Plotting Multiplot... [jarvais] call=visualization.__call__:136 Font MPDFAA+Inter28ptBold is missing the following glyphs: ' ' (\n)
In [7]:
Copied!
from jarvais.trainer import TrainerSupervised

# Cast the 'event' column of the analyzer's data to int before it is handed to the trainer.
analyzer.data['event'] = analyzer.data['event'].astype(int)

# Survival trainer targeting the ('event', 'time') pair.
trainer = TrainerSupervised(
    task="survival",
    target_variable=['event', 'time'],
    output_dir="./outputs/trainer",
)

# Show the resolved settings, then train on the analyzer's data.
print(trainer)
trainer.run(analyzer.data)
from jarvais.trainer import TrainerSupervised

# Cast the 'event' column of the analyzer's data to int before it is handed to the trainer.
analyzer.data['event'] = analyzer.data['event'].astype(int)

# Survival trainer targeting the ('event', 'time') pair.
trainer = TrainerSupervised(
    task="survival",
    target_variable=['event', 'time'],
    output_dir="./outputs/trainer",
)

# Show the resolved settings, then train on the analyzer's data.
print(trainer)
trainer.run(analyzer.data)
TrainerSupervised( TrainerSettings( output_dir=PosixPath('outputs/trainer'), target_variable=['event', 'time'], task='survival', stratify_on=None, test_size=0.2, random_state=42, explain=False, reduction_module=FeatureReductionModule(method=None, task='survival', keep_k=2, enabled=True), trainer_module=SurvivalTrainerModule( output_dir=PosixPath('outputs/trainer'), classical_models=['CoxPH', 'RandomForest', 'GradientBoosting', 'SVM'], deep_models=['MTLR', 'DeepSurv'], eval_metric='c_index', random_seed=42 ) ) )
13:15:34 [info ] Skipping feature reduction. [jarvais] call=feature_reduction.__call__:39 [info ] Training MTLR... [jarvais] call=train.train_mtlr:66 NaN or Inf found in input tensor. NaN or Inf found in input tensor. NaN or Inf found in input tensor. NaN or Inf found in input tensor. NaN or Inf found in input tensor. NaN or Inf found in input tensor. NaN or Inf found in input tensor. NaN or Inf found in input tensor. NaN or Inf found in input tensor. [W 2025-06-09 13:15:35,681] Trial 1 failed with parameters: {'C1': 100.0, 'dropout': 0.2873687420594126, 'dims': [512, 512]} because of the following error: The value nan is not acceptable. [W 2025-06-09 13:15:35,682] Trial 1 failed with value nan. 13:16:24 [info ] Best trial: C1: 0.01, dropout: 0.38638712110505574, dims: [32, 32] [jarvais] call=train.train_mtlr:70 13:16:25 [info ] Training DeepSurv... [jarvais] call=train.train_deepsurv:54 13:16:57 [info ] Best trial: l2_reg: 0.00628564041712275, dropout: 0.2974641242409045, dims: [256, 256, 256] [jarvais] call=train.train_deepsurv:57 13:16:59 [info ] Training CoxPH model... [jarvais] call=survival_trainer.fit:195 13:17:00 [info ] Training GradientBoosting model... [jarvais] call=survival_trainer.fit:195 13:17:07 [info ] Training RandomForest model... [jarvais] call=survival_trainer.fit:195 13:17:11 [info ] Training SVM model... [jarvais] call=survival_trainer.fit:195
Model Leaderboard ---------------- +------------------+----------------+----------------+----------------+ | model | test_score | val_score | train_score | +==================+================+================+================+ | MTLR | C_INDEX: 0.723 | C_INDEX: 0.744 | C_INDEX: 0.776 | +------------------+----------------+----------------+----------------+ | CoxPH | C_INDEX: 0.721 | N/A | C_INDEX: 0.777 | +------------------+----------------+----------------+----------------+ | RandomForest | C_INDEX: 0.716 | N/A | C_INDEX: 0.853 | +------------------+----------------+----------------+----------------+ | SVM | C_INDEX: 0.716 | N/A | C_INDEX: 0.783 | +------------------+----------------+----------------+----------------+ | GradientBoosting | C_INDEX: 0.715 | N/A | C_INDEX: 0.79 | +------------------+----------------+----------------+----------------+ | DeepSurv | C_INDEX: 0.69 | C_INDEX: 0.712 | C_INDEX: 0.772 | +------------------+----------------+----------------+----------------+
In [8]:
Copied!
# Build an Explainer from the `trainer` object created in the training cell and run it.
# NOTE(review): the recorded output for this cell shows per-subgroup Cox model
# summaries ("Possible Bias Detected ..."); confirm what else run() writes to disk.
from jarvais.explainer import Explainer
# from_trainer is called on the class itself, so the explainer takes its inputs from `trainer`.
exp = Explainer.from_trainer(trainer)
exp.run()
# Build an Explainer from the `trainer` object created in the training cell and run it.
# NOTE(review): the recorded output for this cell shows per-subgroup Cox model
# summaries ("Possible Bias Detected ..."); confirm what else run() writes to disk.
from jarvais.explainer import Explainer
# from_trainer is called on the class itself, so the explainer takes its inputs from `trainer`.
exp = Explainer.from_trainer(trainer)
exp.run()
⚠️ **Possible Bias Detected in Smoking Status** ⚠️ === Subgroup Analysis for 'Smoking Status' Using Cox Proportional Hazards Model === Model Statistics: AIC (Partial): 2375.84 Log-Likelihood: -1183.92 Log-Likelihood Ratio p-value: 0.0000 Concordance Index (C-index): 0.61 Model Coefficients: +---------------------------+---------------+------------------+ | Feature | Coefficient | Standard Error | +===========================+===============+==================+ | Smoking Status_Current | 0.508 | 4.720 | +---------------------------+---------------+------------------+ | Smoking Status_Ex-smoker | 0.070 | 4.720 | +---------------------------+---------------+------------------+ | Smoking Status_Non-smoker | -0.710 | 4.721 | +---------------------------+---------------+------------------+ | Smoking Status_unknown | 0.417 | 4.753 | +---------------------------+---------------+------------------+ ⚠️ **Possible Bias Detected in Disease Site** ⚠️ === Subgroup Analysis for 'Disease Site' Using Cox Proportional Hazards Model === Model Statistics: AIC (Partial): 2393.09 Log-Likelihood: -1187.55 Log-Likelihood Ratio p-value: 0.0003 Concordance Index (C-index): 0.59 Model Coefficients: +--------------------------------+---------------+------------------+ | Feature | Coefficient | Standard Error | +================================+===============+==================+ | Disease Site_Other | 0.424 | 4.571 | +--------------------------------+---------------+------------------+ | Disease Site_esophagus | 0.795 | 4.577 | +--------------------------------+---------------+------------------+ | Disease Site_hypopharynx | 0.746 | 4.556 | +--------------------------------+---------------+------------------+ | Disease Site_larynx | 0.202 | 4.552 | +--------------------------------+---------------+------------------+ | Disease Site_lip & oral cavity | 0.666 | 4.561 | +--------------------------------+---------------+------------------+ | Disease Site_nasal cavity | -1.205 | 4.653 | 
+--------------------------------+---------------+------------------+ | Disease Site_nasopharynx | -0.925 | 4.558 | +--------------------------------+---------------+------------------+ | Disease Site_oropharynx | -0.046 | 4.551 | +--------------------------------+---------------+------------------+ | Disease Site_unknown | 0.142 | 4.557 | +--------------------------------+---------------+------------------+