Regression
In [1]:
Copied!
import sys
import pandas as pd
import os
project_root = os.path.abspath(os.path.join(os.getcwd(), "../../"))
sys.path.append(project_root)
data_dir = os.path.join(project_root, 'data')
data_file_path = os.path.join(data_dir, 'RADCURE_processed_clinical.csv')
df = pd.read_csv(data_file_path, index_col=0)
df.drop(columns=["Study ID", "survival_time", "death"], inplace=True)
import sys
import pandas as pd
import os
project_root = os.path.abspath(os.path.join(os.getcwd(), "../../"))
sys.path.append(project_root)
data_dir = os.path.join(project_root, 'data')
data_file_path = os.path.join(data_dir, 'RADCURE_processed_clinical.csv')
df = pd.read_csv(data_file_path, index_col=0)
df.drop(columns=["Study ID", "survival_time", "death"], inplace=True)
In [2]:
Copied!
from jarvais.analyzer import Analyzer
from rich import print
analyzer = Analyzer(
data=df,
output_dir='./outputs/analyzer',
categorical_columns= [
"Sex",
"T Stage",
"N Stage",
"Stage",
"Smoking Status",
"Disease Site",
"HPV Combined",
"Chemotherapy"
],
continuous_columns = [
"age at dx",
"Dose"
],
target_variable='Dose',
task='classification'
)
analyzer.encoding_module.enabled = False # AutoGluon will handle encoding
print(analyzer)
analyzer.run()
from jarvais.analyzer import Analyzer
from rich import print
analyzer = Analyzer(
data=df,
output_dir='./outputs/analyzer',
categorical_columns= [
"Sex",
"T Stage",
"N Stage",
"Stage",
"Smoking Status",
"Disease Site",
"HPV Combined",
"Chemotherapy"
],
continuous_columns = [
"age at dx",
"Dose"
],
target_variable='Dose',
task='classification'
)
analyzer.encoding_module.enabled = False # AutoGluon will handle encoding
print(analyzer)
analyzer.run()
/home/joshua-siraj/Documents/CDI/jarvais/.pixi/envs/dev/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm 15:02:49 [warning ] Date columns not specified. Inferring from remaining columns. [jarvais] call=analyzer.__init__:76
Analyzer( AnalyzerSettings( output_dir=PosixPath('outputs/analyzer'), categorical_columns=[ 'Sex', 'T Stage', 'N Stage', 'Stage', 'Smoking Status', 'Disease Site', 'HPV Combined', 'Chemotherapy' ], continuous_columns=['age at dx', 'Dose'], date_columns=[], task='classification', target_variable='Dose', generate_report=True, settings_path=None, settings_schema_path=None, missingness=MissingnessModule( categorical_strategy={ 'Sex': 'unknown', 'T Stage': 'unknown', 'N Stage': 'unknown', 'Stage': 'unknown', 'Smoking Status': 'unknown', 'Disease Site': 'unknown', 'HPV Combined': 'unknown', 'Chemotherapy': 'unknown' }, continuous_strategy={'age at dx': 'median', 'Dose': 'median'}, enabled=True ), outlier=OutlierModule( categorical_strategy={ 'Sex': 'frequency', 'T Stage': 'frequency', 'N Stage': 'frequency', 'Stage': 'frequency', 'Smoking Status': 'frequency', 'Disease Site': 'frequency', 'HPV Combined': 'frequency', 'Chemotherapy': 'frequency' }, continuous_strategy={'age at dx': 'none', 'Dose': 'none'}, threshold=0.01, enabled=True ), encoding=OneHotEncodingModule( columns=[ 'Sex', 'T Stage', 'N Stage', 'Stage', 'Smoking Status', 'Disease Site', 'HPV Combined', 'Chemotherapy' ], target_variable='Dose', prefix_sep='|', enabled=False ), visualization=VisualizationModule( plots=['corr', 'pairplot', 'umap', 'frequency_table', 'multiplot'], enabled=True ) ) )
[info ] Performing missingness analysis... [jarvais] call=missingness.__call__:43 [info ] Performing outlier analysis... [jarvais] call=outlier.__call__:53 [info ] Plotting Correlation Matrix... [jarvais] call=visualization.__call__:115
+-----------------------+-------------------+-----------+-------------+ | | | Missing | Overall | +=======================+===================+===========+=============+ | n | | | 3346 | +-----------------------+-------------------+-----------+-------------+ | age at dx, mean (SD) | | 0 | 62.3 (11.6) | +-----------------------+-------------------+-----------+-------------+ | Dose, mean (SD) | | 0 | 66.7 (5.8) | +-----------------------+-------------------+-----------+-------------+ | Sex, n (%) | Female | | 686 (20.5) | +-----------------------+-------------------+-----------+-------------+ | | Male | | 2660 (79.5) | +-----------------------+-------------------+-----------+-------------+ | T Stage, n (%) | None | | 12 (0.4) | +-----------------------+-------------------+-----------+-------------+ | | T0 | | 167 (5.0) | +-----------------------+-------------------+-----------+-------------+ | | T1 | | 454 (13.6) | +-----------------------+-------------------+-----------+-------------+ | | T1 (2) | | 1 (0.0) | +-----------------------+-------------------+-----------+-------------+ | | T1a | | 179 (5.3) | +-----------------------+-------------------+-----------+-------------+ | | T1b | | 88 (2.6) | +-----------------------+-------------------+-----------+-------------+ | | T2 | | 927 (27.7) | +-----------------------+-------------------+-----------+-------------+ | | T2 (2) | | 1 (0.0) | +-----------------------+-------------------+-----------+-------------+ | | T2a | | 4 (0.1) | +-----------------------+-------------------+-----------+-------------+ | | T2b | | 5 (0.1) | +-----------------------+-------------------+-----------+-------------+ | | T3 | | 861 (25.7) | +-----------------------+-------------------+-----------+-------------+ | | T3 (2) | | 3 (0.1) | +-----------------------+-------------------+-----------+-------------+ | | T4 | | 116 (3.5) | +-----------------------+-------------------+-----------+-------------+ | | T4a | | 358 (10.7) | +-----------------------+-------------------+-----------+-------------+ | | T4b | | 121 (3.6) | +-----------------------+-------------------+-----------+-------------+ | | TX | | 4 (0.1) | +-----------------------+-------------------+-----------+-------------+ | | Tis | | 44 (1.3) | +-----------------------+-------------------+-----------+-------------+ | | rT0 | | 1 (0.0) | +-----------------------+-------------------+-----------+-------------+ | N Stage, n (%) | N0 | | 1147 (34.3) | +-----------------------+-------------------+-----------+-------------+ | | N1 | | 344 (10.3) | +-----------------------+-------------------+-----------+-------------+ | | N2 | | 182 (5.4) | +-----------------------+-------------------+-----------+-------------+ | | N2a | | 125 (3.7) | +-----------------------+-------------------+-----------+-------------+ | | N2b | | 791 (23.6) | +-----------------------+-------------------+-----------+-------------+ | | N2c | | 532 (15.9) | +-----------------------+-------------------+-----------+-------------+ | | N3 | | 170 (5.1) | +-----------------------+-------------------+-----------+-------------+ | | N3a | | 13 (0.4) | +-----------------------+-------------------+-----------+-------------+ | | N3b | | 28 (0.8) | +-----------------------+-------------------+-----------+-------------+ | | NX | | 1 (0.0) | +-----------------------+-------------------+-----------+-------------+ | | None | | 13 (0.4) | +-----------------------+-------------------+-----------+-------------+ | Stage, n (%) | 0 | | 44 (1.3) | +-----------------------+-------------------+-----------+-------------+ | | I | | 352 (10.5) | +-----------------------+-------------------+-----------+-------------+ | | IB | | 1 (0.0) | +-----------------------+-------------------+-----------+-------------+ | | II | | 400 (12.0) | +-----------------------+-------------------+-----------+-------------+ | | IIA | | 2 (0.1) | +-----------------------+-------------------+-----------+-------------+ | | IIB | | 1 (0.0) | +-----------------------+-------------------+-----------+-------------+ | | III | | 605 (18.1) | +-----------------------+-------------------+-----------+-------------+ | | IIIA | | 2 (0.1) | +-----------------------+-------------------+-----------+-------------+ | | IIIC | | 2 (0.1) | +-----------------------+-------------------+-----------+-------------+ | | IV | | 12 (0.4) | +-----------------------+-------------------+-----------+-------------+ | | IVA | | 1581 (47.3) | +-----------------------+-------------------+-----------+-------------+ | | IVB | | 309 (9.2) | +-----------------------+-------------------+-----------+-------------+ | | IVC | | 2 (0.1) | +-----------------------+-------------------+-----------+-------------+ | | None | | 27 (0.8) | +-----------------------+-------------------+-----------+-------------+ | | X | | 6 (0.2) | +-----------------------+-------------------+-----------+-------------+ | Smoking Status, n (%) | Current | | 1139 (34.0) | +-----------------------+-------------------+-----------+-------------+ | | Ex-smoker | | 1290 (38.6) | +-----------------------+-------------------+-----------+-------------+ | | Non-smoker | | 872 (26.1) | +-----------------------+-------------------+-----------+-------------+ | | unknown | | 45 (1.3) | +-----------------------+-------------------+-----------+-------------+ | Disease Site, n (%) | benign tumor | | 1 (0.0) | +-----------------------+-------------------+-----------+-------------+ | | esophagus | | 33 (1.0) | +-----------------------+-------------------+-----------+-------------+ | | hypopharynx | | 162 (4.8) | +-----------------------+-------------------+-----------+-------------+ | | lacrimal gland | | 1 (0.0) | +-----------------------+-------------------+-----------+-------------+ | | larynx | | 877 (26.2) | +-----------------------+-------------------+-----------+-------------+ | | lip & oral cavity | | 100 (3.0) | +-----------------------+-------------------+-----------+-------------+ | | nasal cavity | | 62 (1.9) | +-----------------------+-------------------+-----------+-------------+ | | nasopharynx | | 355 (10.6) | +-----------------------+-------------------+-----------+-------------+ | | orbit | | 1 (0.0) | +-----------------------+-------------------+-----------+-------------+ | | oropharynx | | 1501 (44.9) | +-----------------------+-------------------+-----------+-------------+ | | other | | 2 (0.1) | +-----------------------+-------------------+-----------+-------------+ | | paraganglioma | | 7 (0.2) | +-----------------------+-------------------+-----------+-------------+ | | paranasal sinus | | 28 (0.8) | +-----------------------+-------------------+-----------+-------------+ | | salivary glands | | 4 (0.1) | +-----------------------+-------------------+-----------+-------------+ | | sarcoma | | 20 (0.6) | +-----------------------+-------------------+-----------+-------------+ | | skin | | 24 (0.7) | +-----------------------+-------------------+-----------+-------------+ | | unknown | | 168 (5.0) | +-----------------------+-------------------+-----------+-------------+ | HPV Combined, n (%) | 1.0 | | 1139 (34.0) | +-----------------------+-------------------+-----------+-------------+ | | None | | 2207 (66.0) | +-----------------------+-------------------+-----------+-------------+ | Chemotherapy, n (%) | 0 | | 1923 (57.5) | +-----------------------+-------------------+-----------+-------------+ | | 1 | | 1423 (42.5) | +-----------------------+-------------------+-----------+-------------+ Outlier Report: - No Outliers found in Sex - Outliers found in T Stage: ['nan: 12 out of 3346', 'T2b: 5 out of 3346', 'T2a: 4 out of 3346', 'TX: 4 out of 3346', 'T3 (2): 3 out of 3346', 'T2 (2): 1 out of 3346', 'T1 (2): 1 out of 3346', 'rT0: 1 out of 3346'] - Outliers found in N Stage: ['N3b: 28 out of 3346', 'N3a: 13 out of 3346', 'nan: 13 out of 3346', 'NX: 1 out of 3346'] - Outliers found in Stage: ['nan: 27 out of 3346', 'IV: 12 out of 3346', 'X: 6 out of 3346', 'IIA: 2 out of 3346', 'IIIA: 2 out of 3346', 'IIIC: 2 out of 3346', 'IVC: 2 out of 3346', 'IB: 1 out of 3346', 'IIB: 1 out of 3346'] - No Outliers found in Smoking Status - Outliers found in Disease Site: ['paranasal sinus: 28 out of 3346', 'skin: 24 out of 3346', 'sarcoma: 20 out of 3346', 'paraganglioma: 7 out of 3346', 'salivary glands: 4 out of 3346', 'other: 2 out of 3346', 'benign tumor: 1 out of 3346', 'lacrimal gland: 1 out of 3346', 'orbit: 1 out of 3346'] - No Outliers found in HPV Combined - No Outliers found in Chemotherapy
[info ] Plotting Pairplot... [jarvais] call=visualization.__call__:118 15:02:50 [info ] Plotting UMAP... [jarvais] call=visualization.__call__:124 15:03:00 [info ] Plotting Frequency Table... [jarvais] call=visualization.__call__:121 15:03:07 [info ] Plotting Multiplot... [jarvais] call=visualization.__call__:136 15:03:11 [warning ] One-hot encoding is disabled. [jarvais] call=encoding.__call__:40 Font MPDFAA+Inter28ptBold is missing the following glyphs: ' ' (\n)
In [3]:
Copied!
from jarvais.trainer import TrainerSupervised
df = pd.read_csv('./outputs/analyzer/updated_data.csv')
trainer = TrainerSupervised(task='regression', output_dir='./outputs/trainer')
trainer.run(df, 'Dose')
from jarvais.trainer import TrainerSupervised
df = pd.read_csv('./outputs/analyzer/updated_data.csv')
trainer = TrainerSupervised(task='regression', output_dir='./outputs/trainer')
trainer.run(df, 'Dose')
Training fold 1/5... Fold 1 score: 0.6668817529452749 Training fold 2/5... Fold 2 score: 0.5944331638262828 Training fold 3/5... Fold 3 score: 0.6045543370974688 Training fold 4/5... Fold 4 score: 0.6360018803349704 Training fold 5/5... Fold 5 score: 0.590947874967624 Model Leaderboard (Displays values in "mean [min, max]" format across training folds) ------------------------------------------------------------------------------------ +-----------------------+----------------------------+----------------------------+----------------------------+ | model | score_test | score_val | score_train | +=======================+============================+============================+============================+ | WeightedEnsemble_L2 | R2 0.62 [0.61, 0.63] | R2 0.62 [0.59, 0.67] | R2 0.71 [0.69, 0.76] | | | RMSE: -3.55 [-3.6, -3.49] | RMSE: -3.54 [-3.87, -3.37] | RMSE: -3.09 [-3.21, -2.79] | +-----------------------+----------------------------+----------------------------+----------------------------+ | LightGBMXT | R2 0.61 [0.6, 0.62] | R2 0.6 [0.58, 0.64] | R2 0.68 [0.65, 0.71] | | | RMSE: -3.61 [-3.64, -3.58] | RMSE: -3.61 [-3.88, -3.47] | RMSE: -3.27 [-3.39, -3.04] | +-----------------------+----------------------------+----------------------------+----------------------------+ | LightGBM | R2 0.61 [0.6, 0.61] | R2 0.58 [0.54, 0.64] | R2 0.7 [0.68, 0.71] | | | RMSE: -3.63 [-3.67, -3.59] | RMSE: -3.7 [-4.09, -3.52] | RMSE: -3.14 [-3.2, -3.1] | +-----------------------+----------------------------+----------------------------+----------------------------+ | CatBoost | R2 0.61 [0.59, 0.63] | R2 0.6 [0.57, 0.65] | R2 0.67 [0.65, 0.7] | | | RMSE: -3.62 [-3.69, -3.5] | RMSE: -3.63 [-3.97, -3.47] | RMSE: -3.29 [-3.43, -3.12] | +-----------------------+----------------------------+----------------------------+----------------------------+ | NeuralNetFastAI | R2 0.61 [0.59, 0.62] | R2 0.6 [0.56, 0.66] | R2 0.65 [0.61, 0.68] | | | RMSE: -3.6 [-3.7, -3.54] | RMSE: -3.63 [-3.94, -3.42] | RMSE: -3.39 [-3.56, -3.19] | +-----------------------+----------------------------+----------------------------+----------------------------+ | XGBoost | R2 0.6 [0.58, 0.62] | R2 0.58 [0.54, 0.62] | R2 0.73 [0.69, 0.82] | | | RMSE: -3.63 [-3.73, -3.56] | RMSE: -3.74 [-4.06, -3.54] | RMSE: -2.95 [-3.15, -2.43] | +-----------------------+----------------------------+----------------------------+----------------------------+ | RandomForestMSE | R2 0.58 [0.57, 0.59] | R2 0.55 [0.5, 0.59] | R2 0.93 [0.93, 0.94] | | | RMSE: -3.74 [-3.78, -3.7] | RMSE: -3.86 [-4.19, -3.72] | RMSE: -1.5 [-1.54, -1.42] | +-----------------------+----------------------------+----------------------------+----------------------------+ | ExtraTreesMSE | R2 0.57 [0.56, 0.58] | R2 0.54 [0.5, 0.6] | R2 0.93 [0.93, 0.94] | | | RMSE: -3.79 [-3.83, -3.74] | RMSE: -3.89 [-4.21, -3.7] | RMSE: -1.51 [-1.53, -1.43] | +-----------------------+----------------------------+----------------------------+----------------------------+ | LightGBMLarge | R2 0.57 [0.56, 0.58] | R2 0.56 [0.52, 0.59] | R2 0.83 [0.79, 0.86] | | | RMSE: -3.78 [-3.83, -3.73] | RMSE: -3.83 [-4.13, -3.72] | RMSE: -2.37 [-2.69, -2.16] | +-----------------------+----------------------------+----------------------------+----------------------------+ | NeuralNetTorch | R2 0.54 [0.52, 0.55] | R2 0.51 [0.44, 0.6] | R2 0.57 [0.51, 0.6] | | | RMSE: -3.93 [-3.99, -3.86] | RMSE: -4.01 [-4.52, -3.67] | RMSE: -3.78 [-3.97, -3.65] | +-----------------------+----------------------------+----------------------------+----------------------------+ | SimpleRegressionModel | R2 0.41 [0.4, 0.41] | R2 0.4 [0.38, 0.43] | R2 0.4 [0.39, 0.41] | | | RMSE: -4.44 [-4.45, -4.44] | RMSE: -4.46 [-4.77, -4.22] | RMSE: -4.44 [-4.5, -4.37] | +-----------------------+----------------------------+----------------------------+----------------------------+ | KNeighborsDist | R2 -0.13 [-0.17, -0.1] | R2 -0.19 [-0.3, -0.09] | R2 0.27 [0.26, 0.27] | | | RMSE: -6.14 [-6.23, -6.05] | RMSE: -6.27 [-6.54, -6.06] | RMSE: -4.93 [-5.01, -4.83] | +-----------------------+----------------------------+----------------------------+----------------------------+ | KNeighborsUnif | R2 -0.08 [-0.1, -0.05] | R2 -0.11 [-0.14, -0.05] | R2 0.19 [0.19, 0.21] | | | RMSE: -6.01 [-6.05, -5.93] | RMSE: -6.04 [-6.43, -5.68] | RMSE: -5.16 [-5.25, -5.05] | +-----------------------+----------------------------+----------------------------+----------------------------+
In [4]:
Copied!
from jarvais.explainer import Explainer
sensitive_features = {k: trainer.X_test[k] for k in ['N Stage', 'Disease Site', 'Sex']}
exp = Explainer.from_trainer(trainer, sensitive_features=sensitive_features)
exp.run()
from jarvais.explainer import Explainer
sensitive_features = {k: trainer.X_test[k] for k in ['N Stage', 'Disease Site', 'Sex']}
exp = Explainer.from_trainer(trainer, sensitive_features=sensitive_features)
exp.run()
⚠️ **Possible Bias Detected in N Stage** ⚠️ === Subgroup Analysis for 'N Stage' Using OLS Regression === Model Statistics: R-squared: 0.137 F-statistic: 15.034 F-statistic p-value: 0.0000 AIC: 3174.85 Log-Likelihood: -1579.42 Model Coefficients: +---------------+---------------+------------------+ | Feature | Coefficient | Standard Error | +===============+===============+==================+ | const | 1.937 | 0.139 | +---------------+---------------+------------------+ | N Stage_N0 | 1.571 | 0.206 | +---------------+---------------+------------------+ | N Stage_N1 | 0.829 | 0.329 | +---------------+---------------+------------------+ | N Stage_N2 | -1.328 | 0.380 | +---------------+---------------+------------------+ | N Stage_N2a | 0.692 | 0.450 | +---------------+---------------+------------------+ | N Stage_N2b | -0.428 | 0.221 | +---------------+---------------+------------------+ | N Stage_N2c | -0.065 | 0.264 | +---------------+---------------+------------------+ | N Stage_N3 | -1.225 | 0.403 | +---------------+---------------+------------------+ | N Stage_Other | 1.891 | 0.769 | +---------------+---------------+------------------+
=== Subgroup Analysis for 'N Stage' using FairLearn === +--------------------------+-------------------+-------------------+-------------------+------------------+ | | N0 | N1 | N2 | N2a | +==========================+===================+===================+===================+==================+ | mean_prediction | 63.11251277751751 | 67.65180969238281 | 70.02266953631145 | 67.6926611491612 | +--------------------------+-------------------+-------------------+-------------------+------------------+ | Relative mean_prediction | 1.000 ✅ | 1.072 ✅ | 1.109 ✅ | 1.073 ✅ | +--------------------------+-------------------+-------------------+-------------------+------------------+ ⚠️ **Possible Bias Detected in Disease Site** ⚠️ === Subgroup Analysis for 'Disease Site' Using OLS Regression === Model Statistics: R-squared: 0.156 F-statistic: 15.229 F-statistic p-value: 0.0000 AIC: 3162.36 Log-Likelihood: -1572.18 Model Coefficients: +--------------------------------+---------------+------------------+ | Feature | Coefficient | Standard Error | +================================+===============+==================+ | const | 2.913 | 0.158 | +--------------------------------+---------------+------------------+ | Disease Site_Other | 2.073 | 0.609 | +--------------------------------+---------------+------------------+ | Disease Site_esophagus | 4.174 | 0.820 | +--------------------------------+---------------+------------------+ | Disease Site_hypopharynx | -0.884 | 0.411 | +--------------------------------+---------------+------------------+ | Disease Site_larynx | 0.181 | 0.237 | +--------------------------------+---------------+------------------+ | Disease Site_lip & oral cavity | 1.705 | 0.521 | +--------------------------------+---------------+------------------+ | Disease Site_nasal cavity | -0.596 | 0.629 | +--------------------------------+---------------+------------------+ | Disease Site_nasopharynx | -2.388 | 0.315 | +--------------------------------+---------------+------------------+ | Disease Site_oropharynx | -1.038 | 0.206 | +--------------------------------+---------------+------------------+ | Disease Site_unknown | -0.314 | 0.389 | +--------------------------------+---------------+------------------+
=== Subgroup Analysis for 'Disease Site' using FairLearn === +--------------------------+-------------------+-------------------+-------------------+--------------------+ | | Other | esophagus | hypopharynx | larynx | +==========================+===================+===================+===================+====================+ | mean_prediction | 65.05188573201498 | 66.81039047241211 | 67.64559576246474 | 62.185764588505386 | +--------------------------+-------------------+-------------------+-------------------+--------------------+ | Relative mean_prediction | 0.947 ✅ | 0.972 ✅ | 0.984 ✅ | 0.905 ✅ | +--------------------------+-------------------+-------------------+-------------------+--------------------+