Survival
In [1]:
Copied!
import os
import sys

import pandas as pd

# Locate the project root two levels above the current working directory and
# put it on the import path. The membership check keeps re-running this cell
# from adding the same entry to sys.path again.
project_root = os.path.abspath(os.path.join(os.getcwd(), "../../"))
if project_root not in sys.path:
    sys.path.append(project_root)

data_dir = os.path.join(project_root, 'data')
data_file_path = os.path.join(data_dir, 'RADCURE_processed_clinical.csv')

# Load the clinical table (first CSV column becomes the index), drop the
# "Study ID" column, and rename the survival columns to `time` / `event`,
# the names the later cells refer to. Built as one chain so the cell is
# idempotent and never mutates a previously created frame in place.
df = (
    pd.read_csv(data_file_path, index_col=0)
    .drop(columns=["Study ID"])
    .rename(columns={'survival_time': 'time', 'death': 'event'})
)
In [2]:
Copied!
from jarvais.analyzer import Analyzer
# rich's print is used for the settings display below. Note that this shadows
# the built-in `print` for every cell that runs after this one.
from rich import print

# Configure the analyzer for the survival task. Both survival columns are
# declared explicitly: `event` among the categorical columns and `time` among
# the continuous ones; `event` is also the target variable.
analyzer = Analyzer(
    data=df,
    output_dir='./survival_outputs/analyzer',
    categorical_columns=[
        "Sex",
        "T Stage",
        "N Stage",
        "Stage",
        "Smoking Status",
        "Disease Site",
        "event",
        "HPV Combined",
        "Chemotherapy",
    ],
    continuous_columns=[
        "time",
        "age at dx",
        "Dose",
    ],
    target_variable='event',
    task='survival',
)

# Show the resolved settings, then run the analysis once.
print(analyzer)
analyzer.run()
/home/joshua-siraj/Documents/CDI/jarvais/.pixi/envs/dev/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm 16:28:19 [warning ] Date columns not specified. Inferring from remaining columns. [jarvais] call=analyzer.__init__:85
Analyzer( AnalyzerSettings( output_dir=PosixPath('survival_outputs/analyzer'), categorical_columns=[ 'Sex', 'T Stage', 'N Stage', 'Stage', 'Smoking Status', 'Disease Site', 'event', 'HPV Combined', 'Chemotherapy' ], continuous_columns=['time', 'age at dx', 'Dose'], date_columns=[], task='survival', target_variable='event', generate_report=True, settings_path=None, settings_schema_path=None, missingness=MissingnessModule( enabled=True, categorical_strategy={ 'Sex': 'unknown', 'T Stage': 'unknown', 'N Stage': 'unknown', 'Stage': 'unknown', 'Smoking Status': 'unknown', 'Disease Site': 'unknown', 'event': 'unknown', 'HPV Combined': 'unknown', 'Chemotherapy': 'unknown' }, continuous_strategy={'time': 'median', 'age at dx': 'median', 'Dose': 'median'} ), outlier=OutlierModule( enabled=True, categorical_strategy={ 'Sex': 'frequency', 'T Stage': 'frequency', 'N Stage': 'frequency', 'Stage': 'frequency', 'Smoking Status': 'frequency', 'Disease Site': 'frequency', 'event': 'frequency', 'HPV Combined': 'frequency', 'Chemotherapy': 'frequency' }, continuous_strategy={'time': 'none', 'age at dx': 'none', 'Dose': 'none'}, threshold=0.01, categorical_mapping={}, group_outliers=True ), visualization=DataVisualizationModule( enabled=True, plots=['corr', 'pairplot', 'umap', 'frequency_table', 'multiplot', 'kaplan_meier'], save_to_json=False ), boolean=BooleanEncodingModule(enabled=True, columns=[]), dashboard=DashboardModule(enabled=True, n_top=10, significance_threshold=0.05) ) )
[info ] Performing missingness analysis... [jarvais] call=missingness.__call__:40 [info ] Performing outlier analysis... [jarvais] call=outlier.__call__:63 [info ] Plotting Correlation Matrix... [jarvais] call=visualization.__call__:122
+-----------------------+-------------------+-----------+-------------+ | | | Missing | Overall | +=======================+===================+===========+=============+ | n | | | 3346 | +-----------------------+-------------------+-----------+-------------+ | time, mean (SD) | | 0 | 4.1 (2.7) | +-----------------------+-------------------+-----------+-------------+ | age at dx, mean (SD) | | 0 | 62.3 (11.6) | +-----------------------+-------------------+-----------+-------------+ | Dose, mean (SD) | | 0 | 66.7 (5.8) | +-----------------------+-------------------+-----------+-------------+ | Sex, n (%) | Female | | 686 (20.5) | +-----------------------+-------------------+-----------+-------------+ | | Male | | 2660 (79.5) | +-----------------------+-------------------+-----------+-------------+ | T Stage, n (%) | None | | 12 (0.4) | +-----------------------+-------------------+-----------+-------------+ | | T0 | | 167 (5.0) | +-----------------------+-------------------+-----------+-------------+ | | T1 | | 454 (13.6) | +-----------------------+-------------------+-----------+-------------+ | | T1 (2) | | 1 (0.0) | +-----------------------+-------------------+-----------+-------------+ | | T1a | | 179 (5.3) | +-----------------------+-------------------+-----------+-------------+ | | T1b | | 88 (2.6) | +-----------------------+-------------------+-----------+-------------+ | | T2 | | 927 (27.7) | +-----------------------+-------------------+-----------+-------------+ | | T2 (2) | | 1 (0.0) | +-----------------------+-------------------+-----------+-------------+ | | T2a | | 4 (0.1) | +-----------------------+-------------------+-----------+-------------+ | | T2b | | 5 (0.1) | +-----------------------+-------------------+-----------+-------------+ | | T3 | | 861 (25.7) | +-----------------------+-------------------+-----------+-------------+ | | T3 (2) | | 3 (0.1) | +-----------------------+-------------------+-----------+-------------+ | | T4 | | 116 (3.5) | 
+-----------------------+-------------------+-----------+-------------+ | | T4a | | 358 (10.7) | +-----------------------+-------------------+-----------+-------------+ | | T4b | | 121 (3.6) | +-----------------------+-------------------+-----------+-------------+ | | TX | | 4 (0.1) | +-----------------------+-------------------+-----------+-------------+ | | Tis | | 44 (1.3) | +-----------------------+-------------------+-----------+-------------+ | | rT0 | | 1 (0.0) | +-----------------------+-------------------+-----------+-------------+ | N Stage, n (%) | N0 | | 1147 (34.3) | +-----------------------+-------------------+-----------+-------------+ | | N1 | | 344 (10.3) | +-----------------------+-------------------+-----------+-------------+ | | N2 | | 182 (5.4) | +-----------------------+-------------------+-----------+-------------+ | | N2a | | 125 (3.7) | +-----------------------+-------------------+-----------+-------------+ | | N2b | | 791 (23.6) | +-----------------------+-------------------+-----------+-------------+ | | N2c | | 532 (15.9) | +-----------------------+-------------------+-----------+-------------+ | | N3 | | 170 (5.1) | +-----------------------+-------------------+-----------+-------------+ | | N3a | | 13 (0.4) | +-----------------------+-------------------+-----------+-------------+ | | N3b | | 28 (0.8) | +-----------------------+-------------------+-----------+-------------+ | | NX | | 1 (0.0) | +-----------------------+-------------------+-----------+-------------+ | | None | | 13 (0.4) | +-----------------------+-------------------+-----------+-------------+ | Stage, n (%) | 0 | | 44 (1.3) | +-----------------------+-------------------+-----------+-------------+ | | I | | 352 (10.5) | +-----------------------+-------------------+-----------+-------------+ | | IB | | 1 (0.0) | +-----------------------+-------------------+-----------+-------------+ | | II | | 400 (12.0) | 
+-----------------------+-------------------+-----------+-------------+ | | IIA | | 2 (0.1) | +-----------------------+-------------------+-----------+-------------+ | | IIB | | 1 (0.0) | +-----------------------+-------------------+-----------+-------------+ | | III | | 605 (18.1) | +-----------------------+-------------------+-----------+-------------+ | | IIIA | | 2 (0.1) | +-----------------------+-------------------+-----------+-------------+ | | IIIC | | 2 (0.1) | +-----------------------+-------------------+-----------+-------------+ | | IV | | 12 (0.4) | +-----------------------+-------------------+-----------+-------------+ | | IVA | | 1581 (47.3) | +-----------------------+-------------------+-----------+-------------+ | | IVB | | 309 (9.2) | +-----------------------+-------------------+-----------+-------------+ | | IVC | | 2 (0.1) | +-----------------------+-------------------+-----------+-------------+ | | None | | 27 (0.8) | +-----------------------+-------------------+-----------+-------------+ | | X | | 6 (0.2) | +-----------------------+-------------------+-----------+-------------+ | Smoking Status, n (%) | Current | | 1139 (34.0) | +-----------------------+-------------------+-----------+-------------+ | | Ex-smoker | | 1290 (38.6) | +-----------------------+-------------------+-----------+-------------+ | | Non-smoker | | 872 (26.1) | +-----------------------+-------------------+-----------+-------------+ | | unknown | | 45 (1.3) | +-----------------------+-------------------+-----------+-------------+ | Disease Site, n (%) | benign tumor | | 1 (0.0) | +-----------------------+-------------------+-----------+-------------+ | | esophagus | | 33 (1.0) | +-----------------------+-------------------+-----------+-------------+ | | hypopharynx | | 162 (4.8) | +-----------------------+-------------------+-----------+-------------+ | | lacrimal gland | | 1 (0.0) | +-----------------------+-------------------+-----------+-------------+ | | larynx | | 877 
(26.2) | +-----------------------+-------------------+-----------+-------------+ | | lip & oral cavity | | 100 (3.0) | +-----------------------+-------------------+-----------+-------------+ | | nasal cavity | | 62 (1.9) | +-----------------------+-------------------+-----------+-------------+ | | nasopharynx | | 355 (10.6) | +-----------------------+-------------------+-----------+-------------+ | | orbit | | 1 (0.0) | +-----------------------+-------------------+-----------+-------------+ | | oropharynx | | 1501 (44.9) | +-----------------------+-------------------+-----------+-------------+ | | other | | 2 (0.1) | +-----------------------+-------------------+-----------+-------------+ | | paraganglioma | | 7 (0.2) | +-----------------------+-------------------+-----------+-------------+ | | paranasal sinus | | 28 (0.8) | +-----------------------+-------------------+-----------+-------------+ | | salivary glands | | 4 (0.1) | +-----------------------+-------------------+-----------+-------------+ | | sarcoma | | 20 (0.6) | +-----------------------+-------------------+-----------+-------------+ | | skin | | 24 (0.7) | +-----------------------+-------------------+-----------+-------------+ | | unknown | | 168 (5.0) | +-----------------------+-------------------+-----------+-------------+ | event, n (%) | 0 | | 2288 (68.4) | +-----------------------+-------------------+-----------+-------------+ | | 1 | | 1058 (31.6) | +-----------------------+-------------------+-----------+-------------+ | HPV Combined, n (%) | 1.0 | | 1139 (34.0) | +-----------------------+-------------------+-----------+-------------+ | | None | | 2207 (66.0) | +-----------------------+-------------------+-----------+-------------+ | Chemotherapy, n (%) | 0 | | 1923 (57.5) | +-----------------------+-------------------+-----------+-------------+ | | 1 | | 1423 (42.5) | +-----------------------+-------------------+-----------+-------------+ Outlier Report: - No Outliers found in Sex - Outliers 
found in T Stage: ['nan: 12 out of 3346', 'T2b: 5 out of 3346', 'T2a: 4 out of 3346', 'TX: 4 out of 3346', 'T3 (2): 3 out of 3346', 'T2 (2): 1 out of 3346', 'T1 (2): 1 out of 3346', 'rT0: 1 out of 3346'] - Outliers found in N Stage: ['N3b: 28 out of 3346', 'N3a: 13 out of 3346', 'nan: 13 out of 3346', 'NX: 1 out of 3346'] - Outliers found in Stage: ['nan: 27 out of 3346', 'IV: 12 out of 3346', 'X: 6 out of 3346', 'IIA: 2 out of 3346', 'IIIA: 2 out of 3346', 'IIIC: 2 out of 3346', 'IVC: 2 out of 3346', 'IB: 1 out of 3346', 'IIB: 1 out of 3346'] - No Outliers found in Smoking Status - Outliers found in Disease Site: ['paranasal sinus: 28 out of 3346', 'skin: 24 out of 3346', 'sarcoma: 20 out of 3346', 'paraganglioma: 7 out of 3346', 'salivary glands: 4 out of 3346', 'other: 2 out of 3346', 'benign tumor: 1 out of 3346', 'lacrimal gland: 1 out of 3346', 'orbit: 1 out of 3346'] - No Outliers found in event - No Outliers found in HPV Combined - No Outliers found in Chemotherapy
16:28:20 [info ] Plotting Pairplot... [jarvais] call=visualization.__call__:125 16:28:22 [info ] Plotting UMAP... [jarvais] call=visualization.__call__:131 16:28:31 [info ] Plotting Frequency Table... [jarvais] call=visualization.__call__:128 16:28:40 [info ] Plotting Kaplan Meier Curves... [jarvais] call=visualization.__call__:137 16:28:43 [info ] Plotting Multiplot... [jarvais] call=visualization.__call__:146 16:29:00 [info ] Computing statistical ranking for dashboard... [jarvais] call=dashboard.__call__:77 [info ] Analyzing statistical significance for 9 categorical × 3 continuous variables [jarvais] call=statistical_ranking.find_top_multiplots:73 [info ] Found 27 total comparisons, 24 significant (p < 0.05) [jarvais] call=statistical_ranking.find_top_multiplots:126 [info ] Most significant p-value: 0.00e+00 [jarvais] call=statistical_ranking.find_top_multiplots:131 [info ] Generating dashboard plot of significant multiplots... [jarvais] call=dashboard.__call__:98 Font MPDFAA+Inter28ptBold is missing the following glyphs: ' ' (\n)
In [3]:
Copied!
from jarvais.trainer import TrainerSupervised

# Cast the event indicator to plain integers before handing the data to the
# trainer.
# NOTE(review): presumably required because the analyzer leaves `event` as a
# categorical column - confirm against the Analyzer implementation.
analyzer.data['event'] = analyzer.data['event'].astype(int)

# NOTE(review): the analyzer writes under ./survival_outputs while the trainer
# writes under ./outputs - confirm the differing roots are intentional.
trainer = TrainerSupervised(
    output_dir="./outputs/trainer",
    target_variable=['event', 'time'],
    task="survival",
)

# Show the resolved trainer settings, then train once on the analyzer's data.
print(trainer)
trainer.run(analyzer.data)
TrainerSupervised( TrainerSettings( output_dir=PosixPath('outputs/trainer'), target_variable=['event', 'time'], task='survival', stratify_on=None, test_size=0.2, random_state=42, explain=False, encoding_module=OneHotEncodingModule(columns=None, prefix_sep='|', enabled=True), reduction_module=FeatureReductionModule(method=None, task='survival', keep_k=2, enabled=True), trainer_module=SurvivalTrainerModule( output_dir=PosixPath('outputs/trainer'), classical_models=['CoxPH', 'RandomForest', 'GradientBoosting', 'SVM'], deep_models=['MTLR', 'DeepSurv'], eval_metric='c_index', random_seed=42 ) ) )
16:29:16 [info ] Skipping feature reduction. [jarvais] call=feature_reduction.__call__:39 [info ] Training MTLR... [jarvais] call=train.train_mtlr:66 NaN or Inf found in input tensor. NaN or Inf found in input tensor. NaN or Inf found in input tensor. NaN or Inf found in input tensor. NaN or Inf found in input tensor. NaN or Inf found in input tensor. NaN or Inf found in input tensor. NaN or Inf found in input tensor. NaN or Inf found in input tensor. [W 2025-11-18 16:29:17,736] Trial 1 failed with parameters: {'C1': 100.0, 'dropout': 0.2873687420594126, 'dims': [512, 512]} because of the following error: The value nan is not acceptable. [W 2025-11-18 16:29:17,737] Trial 1 failed with value nan. NaN or Inf found in input tensor. NaN or Inf found in input tensor. NaN or Inf found in input tensor. NaN or Inf found in input tensor. NaN or Inf found in input tensor. NaN or Inf found in input tensor. NaN or Inf found in input tensor. NaN or Inf found in input tensor. NaN or Inf found in input tensor. [W 2025-11-18 16:29:21,726] Trial 11 failed with parameters: {'C1': 0.01, 'dropout': 0.42901791493317987, 'dims': [512, 512]} because of the following error: The value nan is not acceptable. [W 2025-11-18 16:29:21,726] Trial 11 failed with value nan. NaN or Inf found in input tensor. NaN or Inf found in input tensor. NaN or Inf found in input tensor. NaN or Inf found in input tensor. NaN or Inf found in input tensor. NaN or Inf found in input tensor. NaN or Inf found in input tensor. NaN or Inf found in input tensor. NaN or Inf found in input tensor. [W 2025-11-18 16:29:22,421] Trial 12 failed with parameters: {'C1': 0.01, 'dropout': 0.4656969000433397, 'dims': [512, 512]} because of the following error: The value nan is not acceptable. [W 2025-11-18 16:29:22,422] Trial 12 failed with value nan. 16:29:54 [info ] Best trial: C1: 0.01, dropout: 0.26737812933879473, dims: [64, 64] [jarvais] call=train.train_mtlr:70 16:29:55 [info ] Training DeepSurv... 
[jarvais] call=train.train_deepsurv:54 16:30:38 [info ] Best trial: l2_reg: 0.004861934760070567, dropout: 0.2495290056735473, dims: [256, 256, 256] [jarvais] call=train.train_deepsurv:57 16:30:40 [info ] Training CoxPH model... [jarvais] call=survival_trainer.fit:217 [info ] Training GradientBoosting model... [jarvais] call=survival_trainer.fit:217 16:30:48 [info ] Training RandomForest model... [jarvais] call=survival_trainer.fit:217 16:30:51 [info ] Training SVM model... [jarvais] call=survival_trainer.fit:217
Model Leaderboard ---------------- +------------------+----------------+----------------+----------------+ | model | test_score | val_score | train_score | +==================+================+================+================+ | CoxPH | C_INDEX: 0.721 | N/A | C_INDEX: 0.777 | +------------------+----------------+----------------+----------------+ | SVM | C_INDEX: 0.716 | N/A | C_INDEX: 0.783 | +------------------+----------------+----------------+----------------+ | RandomForest | C_INDEX: 0.715 | N/A | C_INDEX: 0.855 | +------------------+----------------+----------------+----------------+ | GradientBoosting | C_INDEX: 0.715 | N/A | C_INDEX: 0.79 | +------------------+----------------+----------------+----------------+ | DeepSurv | C_INDEX: 0.702 | C_INDEX: 0.728 | C_INDEX: 0.775 | +------------------+----------------+----------------+----------------+ | MTLR | C_INDEX: 0.667 | C_INDEX: 0.627 | C_INDEX: 0.688 | +------------------+----------------+----------------+----------------+
In [4]:
Copied!
from jarvais.explainer import Explainer

# Columns handed to the explainer as `sensitive_features`; the recorded output
# of this cell shows bias-audit reports keyed by these column names.
sensitive_features = ['N Stage', 'Disease Site', 'Sex']

explainer = Explainer(
    output_dir="./outputs/explainer",
    sensitive_features=sensitive_features,
)

# Run the explainer once against the trainer fitted in the previous cell.
explainer.run(trainer)
16:30:58 [info ] Running Bias Audit Module... [jarvais] call=bias_audit.__call__:57
⚠️ **Possible Bias Detected in N Stage** ⚠️
=== Subgroup Analysis for 'N Stage' Using Cox Proportional Hazards Model ===
Model Statistics:
AIC (Partial): 2406.53
Log-Likelihood: -1195.26
Log-Likelihood Ratio p-value: 0.0474
Concordance Index (C-index): 0.56
Model Coefficients:
+---------------+---------------+------------------+
| Feature | Coefficient | Standard Error |
+===============+===============+==================+
| N Stage_N0 | 0.108 | 4.363 |
+---------------+---------------+------------------+
| N Stage_N1 | -0.186 | 4.368 |
+---------------+---------------+------------------+
| N Stage_N2 | -0.638 | 4.376 |
+---------------+---------------+------------------+
| N Stage_N2a | -0.732 | 4.382 |
+---------------+---------------+------------------+
| N Stage_N2b | 0.031 | 4.363 |
+---------------+---------------+------------------+
| N Stage_N2c | 0.106 | 4.364 |
+---------------+---------------+------------------+
| N Stage_N3 | 0.702 | 4.367 |
+---------------+---------------+------------------+
| N Stage_Other | 0.094 | 4.399 |
+---------------+---------------+------------------+
⚠️ **Possible Bias Detected in Disease Site** ⚠️
=== Subgroup Analysis for 'Disease Site' Using Cox Proportional Hazards Model ===
Model Statistics:
AIC (Partial): 2393.09
Log-Likelihood: -1187.55
Log-Likelihood Ratio p-value: 0.0003
Concordance Index (C-index): 0.59
Model Coefficients:
+--------------------------------+---------------+------------------+
| Feature | Coefficient | Standard Error |
+================================+===============+==================+
| Disease Site_Other | 0.424 | 4.571 |
+--------------------------------+---------------+------------------+
| Disease Site_esophagus | 0.795 | 4.577 |
+--------------------------------+---------------+------------------+
| Disease Site_hypopharynx | 0.746 | 4.556 |
+--------------------------------+---------------+------------------+
| Disease Site_larynx | 0.202 | 4.552 |
+--------------------------------+---------------+------------------+
| Disease Site_lip & oral cavity | 0.666 | 4.561 |
+--------------------------------+---------------+------------------+
| Disease Site_nasal cavity | -1.205 | 4.653 |
+--------------------------------+---------------+------------------+
| Disease Site_nasopharynx | -0.925 | 4.558 |
+--------------------------------+---------------+------------------+
| Disease Site_oropharynx | -0.046 | 4.551 |
+--------------------------------+---------------+------------------+
| Disease Site_unknown | 0.142 | 4.557 |
+--------------------------------+---------------+------------------+
[info ] Running Visualization Module... [jarvais] call=interpretation.__call__:38 16:31:27 [info ] Running Feature Importance Module... [jarvais] call=importance.__call__:25