Classification
In [1]:
Copied!
import os
import sys

import pandas as pd

# Resolve a root directory two levels above the current working directory
# and make it importable.
# NOTE(review): assumes the notebook is launched from a folder two levels
# below the repository root — confirm against the docs layout.
project_root = os.path.abspath(os.path.join(os.getcwd(), "../../"))
if project_root not in sys.path:
    # Guarded so that re-running this cell does not stack duplicate entries.
    sys.path.append(project_root)

data_dir = os.path.join(project_root, 'data')
data_file_path = os.path.join(data_dir, 'RADCURE_processed_clinical.csv')

# Load the clinical table (first CSV column becomes the index) and drop the
# two columns that are not used downstream. The drop is chained rather than
# done in place so the cell yields the same `df` on every run.
# NOTE(review): "survival_time" is presumably removed because it would leak
# the "death" outcome into the classifier — confirm.
df = pd.read_csv(data_file_path, index_col=0).drop(
    columns=["Study ID", "survival_time"]
)
In [2]:
Copied!
from jarvais.analyzer import Analyzer
from rich import print  # shadows the builtin print; the trainer cell also prints through it

# Configure the exploratory analysis. "death" is both listed as a categorical
# column and set as the target variable.
analyzer = Analyzer(
    data=df,
    output_dir='./outputs/analyzer',
    categorical_columns=[
        "Sex",
        "T Stage",
        "N Stage",
        "Stage",
        "Smoking Status",
        "Disease Site",
        "death",
        "HPV Combined",
        "Chemotherapy",
    ],
    continuous_columns=[
        "age at dx",
        "Dose",
    ],
    target_variable='death',
    task='classification',
)
analyzer.encoding_module.enabled = False  # AutoGluon will handle encoding

# Show the resolved settings, then run the analysis exactly once.
print(analyzer)
analyzer.run()
/home/joshua-siraj/Documents/CDI/jarvais/.pixi/envs/dev/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm 12:50:06 [warning ] Date columns not specified. Inferring from remaining columns. [jarvais] call=analyzer.__init__:76
Analyzer( AnalyzerSettings( output_dir=PosixPath('outputs/analyzer'), categorical_columns=[ 'Sex', 'T Stage', 'N Stage', 'Stage', 'Smoking Status', 'Disease Site', 'death', 'HPV Combined', 'Chemotherapy' ], continuous_columns=['age at dx', 'Dose'], date_columns=[], task='classification', target_variable='death', generate_report=True, settings_path=None, settings_schema_path=None, missingness=MissingnessModule( categorical_strategy={ 'Sex': 'unknown', 'T Stage': 'unknown', 'N Stage': 'unknown', 'Stage': 'unknown', 'Smoking Status': 'unknown', 'Disease Site': 'unknown', 'death': 'unknown', 'HPV Combined': 'unknown', 'Chemotherapy': 'unknown' }, continuous_strategy={'age at dx': 'median', 'Dose': 'median'}, enabled=True ), outlier=OutlierModule( categorical_strategy={ 'Sex': 'frequency', 'T Stage': 'frequency', 'N Stage': 'frequency', 'Stage': 'frequency', 'Smoking Status': 'frequency', 'Disease Site': 'frequency', 'death': 'frequency', 'HPV Combined': 'frequency', 'Chemotherapy': 'frequency' }, continuous_strategy={'age at dx': 'none', 'Dose': 'none'}, threshold=0.01, enabled=True, categorical_mapping={} ), encoding=OneHotEncodingModule( columns=[ 'Sex', 'T Stage', 'N Stage', 'Stage', 'Smoking Status', 'Disease Site', 'HPV Combined', 'Chemotherapy' ], target_variable='death', prefix_sep='|', enabled=False ), visualization=VisualizationModule( plots=['corr', 'pairplot', 'umap', 'frequency_table', 'multiplot'], enabled=True ) ) )
[info ] Performing missingness analysis... [jarvais] call=missingness.__call__:43 [info ] Performing outlier analysis... [jarvais] call=outlier.__call__:60 [info ] Plotting Correlation Matrix... [jarvais] call=visualization.__call__:115
+-----------------------+-------------------+-----------+-------------+ | | | Missing | Overall | +=======================+===================+===========+=============+ | n | | | 3346 | +-----------------------+-------------------+-----------+-------------+ | age at dx, mean (SD) | | 0 | 62.3 (11.6) | +-----------------------+-------------------+-----------+-------------+ | Dose, mean (SD) | | 0 | 66.7 (5.8) | +-----------------------+-------------------+-----------+-------------+ | Sex, n (%) | Female | | 686 (20.5) | +-----------------------+-------------------+-----------+-------------+ | | Male | | 2660 (79.5) | +-----------------------+-------------------+-----------+-------------+ | T Stage, n (%) | None | | 12 (0.4) | +-----------------------+-------------------+-----------+-------------+ | | T0 | | 167 (5.0) | +-----------------------+-------------------+-----------+-------------+ | | T1 | | 454 (13.6) | +-----------------------+-------------------+-----------+-------------+ | | T1 (2) | | 1 (0.0) | +-----------------------+-------------------+-----------+-------------+ | | T1a | | 179 (5.3) | +-----------------------+-------------------+-----------+-------------+ | | T1b | | 88 (2.6) | +-----------------------+-------------------+-----------+-------------+ | | T2 | | 927 (27.7) | +-----------------------+-------------------+-----------+-------------+ | | T2 (2) | | 1 (0.0) | +-----------------------+-------------------+-----------+-------------+ | | T2a | | 4 (0.1) | +-----------------------+-------------------+-----------+-------------+ | | T2b | | 5 (0.1) | +-----------------------+-------------------+-----------+-------------+ | | T3 | | 861 (25.7) | +-----------------------+-------------------+-----------+-------------+ | | T3 (2) | | 3 (0.1) | +-----------------------+-------------------+-----------+-------------+ | | T4 | | 116 (3.5) | +-----------------------+-------------------+-----------+-------------+ | | T4a | | 358 (10.7) | 
+-----------------------+-------------------+-----------+-------------+ | | T4b | | 121 (3.6) | +-----------------------+-------------------+-----------+-------------+ | | TX | | 4 (0.1) | +-----------------------+-------------------+-----------+-------------+ | | Tis | | 44 (1.3) | +-----------------------+-------------------+-----------+-------------+ | | rT0 | | 1 (0.0) | +-----------------------+-------------------+-----------+-------------+ | N Stage, n (%) | N0 | | 1147 (34.3) | +-----------------------+-------------------+-----------+-------------+ | | N1 | | 344 (10.3) | +-----------------------+-------------------+-----------+-------------+ | | N2 | | 182 (5.4) | +-----------------------+-------------------+-----------+-------------+ | | N2a | | 125 (3.7) | +-----------------------+-------------------+-----------+-------------+ | | N2b | | 791 (23.6) | +-----------------------+-------------------+-----------+-------------+ | | N2c | | 532 (15.9) | +-----------------------+-------------------+-----------+-------------+ | | N3 | | 170 (5.1) | +-----------------------+-------------------+-----------+-------------+ | | N3a | | 13 (0.4) | +-----------------------+-------------------+-----------+-------------+ | | N3b | | 28 (0.8) | +-----------------------+-------------------+-----------+-------------+ | | NX | | 1 (0.0) | +-----------------------+-------------------+-----------+-------------+ | | None | | 13 (0.4) | +-----------------------+-------------------+-----------+-------------+ | Stage, n (%) | 0 | | 44 (1.3) | +-----------------------+-------------------+-----------+-------------+ | | I | | 352 (10.5) | +-----------------------+-------------------+-----------+-------------+ | | IB | | 1 (0.0) | +-----------------------+-------------------+-----------+-------------+ | | II | | 400 (12.0) | +-----------------------+-------------------+-----------+-------------+ | | IIA | | 2 (0.1) | 
+-----------------------+-------------------+-----------+-------------+ | | IIB | | 1 (0.0) | +-----------------------+-------------------+-----------+-------------+ | | III | | 605 (18.1) | +-----------------------+-------------------+-----------+-------------+ | | IIIA | | 2 (0.1) | +-----------------------+-------------------+-----------+-------------+ | | IIIC | | 2 (0.1) | +-----------------------+-------------------+-----------+-------------+ | | IV | | 12 (0.4) | +-----------------------+-------------------+-----------+-------------+ | | IVA | | 1581 (47.3) | +-----------------------+-------------------+-----------+-------------+ | | IVB | | 309 (9.2) | +-----------------------+-------------------+-----------+-------------+ | | IVC | | 2 (0.1) | +-----------------------+-------------------+-----------+-------------+ | | None | | 27 (0.8) | +-----------------------+-------------------+-----------+-------------+ | | X | | 6 (0.2) | +-----------------------+-------------------+-----------+-------------+ | Smoking Status, n (%) | Current | | 1139 (34.0) | +-----------------------+-------------------+-----------+-------------+ | | Ex-smoker | | 1290 (38.6) | +-----------------------+-------------------+-----------+-------------+ | | Non-smoker | | 872 (26.1) | +-----------------------+-------------------+-----------+-------------+ | | unknown | | 45 (1.3) | +-----------------------+-------------------+-----------+-------------+ | Disease Site, n (%) | benign tumor | | 1 (0.0) | +-----------------------+-------------------+-----------+-------------+ | | esophagus | | 33 (1.0) | +-----------------------+-------------------+-----------+-------------+ | | hypopharynx | | 162 (4.8) | +-----------------------+-------------------+-----------+-------------+ | | lacrimal gland | | 1 (0.0) | +-----------------------+-------------------+-----------+-------------+ | | larynx | | 877 (26.2) | +-----------------------+-------------------+-----------+-------------+ | | lip & 
oral cavity | | 100 (3.0) | +-----------------------+-------------------+-----------+-------------+ | | nasal cavity | | 62 (1.9) | +-----------------------+-------------------+-----------+-------------+ | | nasopharynx | | 355 (10.6) | +-----------------------+-------------------+-----------+-------------+ | | orbit | | 1 (0.0) | +-----------------------+-------------------+-----------+-------------+ | | oropharynx | | 1501 (44.9) | +-----------------------+-------------------+-----------+-------------+ | | other | | 2 (0.1) | +-----------------------+-------------------+-----------+-------------+ | | paraganglioma | | 7 (0.2) | +-----------------------+-------------------+-----------+-------------+ | | paranasal sinus | | 28 (0.8) | +-----------------------+-------------------+-----------+-------------+ | | salivary glands | | 4 (0.1) | +-----------------------+-------------------+-----------+-------------+ | | sarcoma | | 20 (0.6) | +-----------------------+-------------------+-----------+-------------+ | | skin | | 24 (0.7) | +-----------------------+-------------------+-----------+-------------+ | | unknown | | 168 (5.0) | +-----------------------+-------------------+-----------+-------------+ | death, n (%) | 0 | | 2288 (68.4) | +-----------------------+-------------------+-----------+-------------+ | | 1 | | 1058 (31.6) | +-----------------------+-------------------+-----------+-------------+ | HPV Combined, n (%) | 1.0 | | 1139 (34.0) | +-----------------------+-------------------+-----------+-------------+ | | None | | 2207 (66.0) | +-----------------------+-------------------+-----------+-------------+ | Chemotherapy, n (%) | 0 | | 1923 (57.5) | +-----------------------+-------------------+-----------+-------------+ | | 1 | | 1423 (42.5) | +-----------------------+-------------------+-----------+-------------+ Outlier Report: - No Outliers found in Sex - Outliers found in T Stage: ['nan: 12 out of 3346', 'T2b: 5 out of 3346', 'T2a: 4 out of 3346', 'TX: 4 
out of 3346', 'T3 (2): 3 out of 3346', 'T2 (2): 1 out of 3346', 'T1 (2): 1 out of 3346', 'rT0: 1 out of 3346'] - Outliers found in N Stage: ['N3b: 28 out of 3346', 'N3a: 13 out of 3346', 'nan: 13 out of 3346', 'NX: 1 out of 3346'] - Outliers found in Stage: ['nan: 27 out of 3346', 'IV: 12 out of 3346', 'X: 6 out of 3346', 'IIA: 2 out of 3346', 'IIIA: 2 out of 3346', 'IIIC: 2 out of 3346', 'IVC: 2 out of 3346', 'IB: 1 out of 3346', 'IIB: 1 out of 3346'] - No Outliers found in Smoking Status - Outliers found in Disease Site: ['paranasal sinus: 28 out of 3346', 'skin: 24 out of 3346', 'sarcoma: 20 out of 3346', 'paraganglioma: 7 out of 3346', 'salivary glands: 4 out of 3346', 'other: 2 out of 3346', 'benign tumor: 1 out of 3346', 'lacrimal gland: 1 out of 3346', 'orbit: 1 out of 3346'] - No Outliers found in death - No Outliers found in HPV Combined - No Outliers found in Chemotherapy
[info ] Plotting Pairplot... [jarvais] call=visualization.__call__:118 12:50:07 [info ] Plotting UMAP... [jarvais] call=visualization.__call__:124 12:50:18 [info ] Plotting Frequency Table... [jarvais] call=visualization.__call__:121 12:50:26 [info ] Plotting Multiplot... [jarvais] call=visualization.__call__:136 12:50:30 [warning ] One-hot encoding is disabled. [jarvais] call=encoding.__call__:40 Font MPDFAA+Inter28ptBold is missing the following glyphs: ' ' (\n)
In [3]:
Copied!
from jarvais.trainer import TrainerSupervised

# Binary classifier on "death", trained with 2-fold cross-validation.
trainer = TrainerSupervised(
    output_dir="./outputs/trainer",
    target_variable="death",
    task="binary",
    k_folds=2,
)
print(trainer)

# Cast the target to int before training.
# NOTE(review): presumably needed because the analyzer treated "death" as a
# categorical column, leaving it with a non-integer dtype — confirm.
analyzer.data['death'] = analyzer.data['death'].astype(int)

# Train once on the analyzer's processed data.
trainer.run(analyzer.data)
TrainerSupervised( TrainerSettings( output_dir=PosixPath('outputs/trainer'), target_variable='death', task='binary', stratify_on=None, test_size=0.2, random_state=42, explain=False, reduction_module=FeatureReductionModule(method=None, task='binary', keep_k=2, enabled=True), trainer_module=AutogluonTabularWrapper( output_dir=PosixPath('outputs/trainer'), target_variable='death', task='binary', eval_metric='roc_auc', k_folds=2, extra_metrics=['f1', 'auprc'], kwargs={} ) ) )
12:50:37 [info ] Skipping feature reduction. [jarvais] call=feature_reduction.__call__:39 [info ] Training fold 1/2... [jarvais] call=autogluon_trainer._train_autogluon_with_cv:192 12:51:06 [info ] Fold 1/2 score: 0.7761862315751399 (roc_auc) [jarvais] call=autogluon_trainer._train_autogluon_with_cv:209 [info ] Training fold 2/2... [jarvais] call=autogluon_trainer._train_autogluon_with_cv:192 12:51:31 [info ] Fold 2/2 score: 0.750053611337183 (roc_auc) [jarvais] call=autogluon_trainer._train_autogluon_with_cv:209
Model Leaderboard ---------------- +-----------------------+---------------------------+---------------------------+---------------------------+ | model | score_train | score_val | score_test | +=======================+===========================+===========================+===========================+ | NeuralNetTorch | ROC_AUC 0.79 [0.78, 0.79] | ROC_AUC 0.8 [0.78, 0.83] | ROC_AUC 0.76 [0.75, 0.76] | | | F1: 0.57 [0.56, 0.58] | F1: 0.6 [0.57, 0.62] | F1: 0.54 [0.52, 0.55] | | | AUPRC: 0.65 [0.64, 0.66] | AUPRC: 0.67 [0.65, 0.69] | AUPRC: 0.62 [0.61, 0.63] | +-----------------------+---------------------------+---------------------------+---------------------------+ | WeightedEnsemble_L2 | ROC_AUC 0.82 [0.81, 0.83] | ROC_AUC 0.82 [0.78, 0.87] | ROC_AUC 0.75 [0.74, 0.76] | | | F1: 0.56 [0.55, 0.58] | F1: 0.56 [0.5, 0.63] | F1: 0.48 [0.48, 0.48] | | | AUPRC: 0.68 [0.67, 0.68] | AUPRC: 0.67 [0.61, 0.74] | AUPRC: 0.6 [0.6, 0.6] | +-----------------------+---------------------------+---------------------------+---------------------------+ | CatBoost | ROC_AUC 0.8 [0.8, 0.8] | ROC_AUC 0.82 [0.78, 0.86] | ROC_AUC 0.75 [0.73, 0.76] | | | F1: 0.56 [0.55, 0.56] | F1: 0.57 [0.52, 0.63] | F1: 0.49 [0.48, 0.49] | | | AUPRC: 0.66 [0.66, 0.66] | AUPRC: 0.67 [0.63, 0.71] | AUPRC: 0.6 [0.6, 0.6] | +-----------------------+---------------------------+---------------------------+---------------------------+ | LightGBMXT | ROC_AUC 0.8 [0.79, 0.81] | ROC_AUC 0.81 [0.78, 0.84] | ROC_AUC 0.74 [0.74, 0.75] | | | F1: 0.55 [0.54, 0.56] | F1: 0.56 [0.54, 0.58] | F1: 0.45 [0.45, 0.45] | | | AUPRC: 0.66 [0.65, 0.67] | AUPRC: 0.66 [0.64, 0.68] | AUPRC: 0.57 [0.57, 0.57] | +-----------------------+---------------------------+---------------------------+---------------------------+ | SimpleRegressionModel | ROC_AUC 0.77 [0.77, 0.77] | ROC_AUC 0.79 [0.78, 0.79] | ROC_AUC 0.73 [0.73, 0.74] | | | F1: 0.52 [0.51, 0.52] | F1: 0.54 [0.53, 0.56] | F1: 0.47 [0.46, 0.48] | | | AUPRC: 0.63 [0.63, 0.63] | 
AUPRC: 0.65 [0.64, 0.66] | AUPRC: 0.59 [0.58, 0.6] | +-----------------------+---------------------------+---------------------------+---------------------------+ | NeuralNetFastAI | ROC_AUC 0.77 [0.77, 0.77] | ROC_AUC 0.79 [0.75, 0.82] | ROC_AUC 0.73 [0.73, 0.74] | | | F1: 0.54 [0.53, 0.55] | F1: 0.56 [0.52, 0.59] | F1: 0.46 [0.45, 0.47] | | | AUPRC: 0.64 [0.64, 0.64] | AUPRC: 0.66 [0.62, 0.69] | AUPRC: 0.58 [0.57, 0.59] | +-----------------------+---------------------------+---------------------------+---------------------------+ | XGBoost | ROC_AUC 0.81 [0.8, 0.82] | ROC_AUC 0.81 [0.75, 0.88] | ROC_AUC 0.73 [0.72, 0.74] | | | F1: 0.56 [0.53, 0.58] | F1: 0.56 [0.5, 0.62] | F1: 0.45 [0.45, 0.45] | | | AUPRC: 0.67 [0.65, 0.68] | AUPRC: 0.66 [0.6, 0.72] | AUPRC: 0.57 [0.56, 0.58] | +-----------------------+---------------------------+---------------------------+---------------------------+ | LightGBM | ROC_AUC 0.82 [0.82, 0.82] | ROC_AUC 0.82 [0.75, 0.89] | ROC_AUC 0.73 [0.71, 0.75] | | | F1: 0.57 [0.55, 0.59] | F1: 0.58 [0.51, 0.66] | F1: 0.45 [0.44, 0.45] | | | AUPRC: 0.68 [0.67, 0.69] | AUPRC: 0.69 [0.62, 0.76] | AUPRC: 0.57 [0.56, 0.58] | +-----------------------+---------------------------+---------------------------+---------------------------+ | RandomForestEntr | ROC_AUC 0.88 [0.87, 0.89] | ROC_AUC 0.86 [0.75, 0.97] | ROC_AUC 0.71 [0.7, 0.73] | | | F1: 0.72 [0.71, 0.72] | F1: 0.71 [0.52, 0.91] | F1: 0.49 [0.48, 0.5] | | | AUPRC: 0.77 [0.77, 0.77] | AUPRC: 0.77 [0.61, 0.93] | AUPRC: 0.58 [0.57, 0.59] | +-----------------------+---------------------------+---------------------------+---------------------------+ | RandomForestGini | ROC_AUC 0.88 [0.87, 0.89] | ROC_AUC 0.86 [0.75, 0.97] | ROC_AUC 0.71 [0.69, 0.72] | | | F1: 0.72 [0.71, 0.73] | F1: 0.72 [0.52, 0.91] | F1: 0.48 [0.47, 0.49] | | | AUPRC: 0.77 [0.77, 0.78] | AUPRC: 0.77 [0.61, 0.93] | AUPRC: 0.57 [0.57, 0.58] | 
+-----------------------+---------------------------+---------------------------+---------------------------+ | ExtraTreesGini | ROC_AUC 0.87 [0.86, 0.88] | ROC_AUC 0.86 [0.75, 0.97] | ROC_AUC 0.7 [0.69, 0.71] | | | F1: 0.71 [0.7, 0.72] | F1: 0.71 [0.51, 0.91] | F1: 0.49 [0.47, 0.51] | | | AUPRC: 0.76 [0.75, 0.77] | AUPRC: 0.76 [0.59, 0.92] | AUPRC: 0.58 [0.56, 0.59] | +-----------------------+---------------------------+---------------------------+---------------------------+ | ExtraTreesEntr | ROC_AUC 0.87 [0.86, 0.88] | ROC_AUC 0.86 [0.75, 0.96] | ROC_AUC 0.7 [0.69, 0.71] | | | F1: 0.71 [0.7, 0.72] | F1: 0.72 [0.52, 0.91] | F1: 0.47 [0.46, 0.49] | | | AUPRC: 0.76 [0.76, 0.77] | AUPRC: 0.77 [0.6, 0.93] | AUPRC: 0.56 [0.55, 0.58] | +-----------------------+---------------------------+---------------------------+---------------------------+ | LightGBMLarge | ROC_AUC 0.83 [0.82, 0.83] | ROC_AUC 0.81 [0.69, 0.94] | ROC_AUC 0.69 [0.67, 0.72] | | | F1: 0.58 [0.57, 0.59] | F1: 0.58 [0.38, 0.77] | F1: 0.36 [0.36, 0.37] | | | AUPRC: 0.7 [0.68, 0.71] | AUPRC: 0.68 [0.52, 0.85] | AUPRC: 0.5 [0.49, 0.52] | +-----------------------+---------------------------+---------------------------+---------------------------+ | KNeighborsUnif | ROC_AUC 0.67 [0.66, 0.68] | ROC_AUC 0.66 [0.59, 0.73] | ROC_AUC 0.61 [0.6, 0.62] | | | F1: 0.42 [0.39, 0.44] | F1: 0.42 [0.37, 0.46] | F1: 0.37 [0.35, 0.38] | | | AUPRC: 0.54 [0.52, 0.55] | AUPRC: 0.53 [0.49, 0.58] | AUPRC: 0.49 [0.48, 0.5] | +-----------------------+---------------------------+---------------------------+---------------------------+ | KNeighborsDist | ROC_AUC 0.73 [0.73, 0.73] | ROC_AUC 0.72 [0.55, 0.89] | ROC_AUC 0.57 [0.54, 0.59] | | | F1: 0.51 [0.5, 0.52] | F1: 0.5 [0.32, 0.68] | F1: 0.35 [0.33, 0.38] | | | AUPRC: 0.61 [0.61, 0.62] | AUPRC: 0.6 [0.44, 0.76] | AUPRC: 0.47 [0.45, 0.49] | +-----------------------+---------------------------+---------------------------+---------------------------+
In [4]:
Copied!
from jarvais.explainer import Explainer

# Columns of the held-out test features to audit for subgroup bias.
sensitive_features = {
    column: trainer.X_test[column]
    for column in ['N Stage', 'Disease Site', 'Sex']
}

# Build the explainer from the fitted trainer and run it exactly once; the
# captured progress bar shows a single run taking several minutes.
exp = Explainer.from_trainer(trainer, sensitive_features=sensitive_features)
exp.run()
⚠️ **Possible Bias Detected in N Stage** ⚠️ === Subgroup Analysis for 'N Stage' Using OLS Regression === Model Statistics: R-squared: 0.031 F-statistic: 3.055 F-statistic p-value: 0.0036 AIC: 874.42 Log-Likelihood: -429.21 Model Coefficients: +---------------+---------------+------------------+ | Feature | Coefficient | Standard Error | +===============+===============+==================+ | const | 0.457 | 0.024 | +---------------+---------------+------------------+ | N Stage_N0 | 0.118 | 0.035 | +---------------+---------------+------------------+ | N Stage_N1 | 0.085 | 0.055 | +---------------+---------------+------------------+ | N Stage_N2 | -0.196 | 0.071 | +---------------+---------------+------------------+ | N Stage_N2a | -0.050 | 0.081 | +---------------+---------------+------------------+ | N Stage_N2b | 0.053 | 0.042 | +---------------+---------------+------------------+ | N Stage_N2c | 0.150 | 0.046 | +---------------+---------------+------------------+ | N Stage_N3 | 0.180 | 0.082 | +---------------+---------------+------------------+ | N Stage_Other | 0.117 | 0.125 | +---------------+---------------+------------------+
=== Subgroup Analysis for 'N Stage' using FairLearn === +------------------------------+---------------------+---------------------+----------------------+----------------------+ | | N0 | N1 | N2 | N2a | +==============================+=====================+=====================+======================+======================+ | mean_prediction | 0.1394422310756972 | 0.23880597014925373 | 0.05405405405405406 | 0.10714285714285714 | +------------------------------+---------------------+---------------------+----------------------+----------------------+ | false_positive_rate | 0.09714285714285714 | 0.09090909090909091 | 0.030303030303030304 | 0.045454545454545456 | +------------------------------+---------------------+---------------------+----------------------+----------------------+ | Relative mean_prediction | 1.000 ✅ | 1.713 ✅ | 0.388 ✅ | 0.768 ✅ | +------------------------------+---------------------+---------------------+----------------------+----------------------+ | Relative false_positive_rate | 1.000 ✅ | 0.936 ✅ | 0.312 ✅ | 0.468 ✅ | +------------------------------+---------------------+---------------------+----------------------+----------------------+ ⚠️ **Possible Bias Detected in Disease Site** ⚠️ === Subgroup Analysis for 'Disease Site' Using OLS Regression === Model Statistics: R-squared: 0.032 F-statistic: 2.719 F-statistic p-value: 0.0059 AIC: 876.02 Log-Likelihood: -429.01 Model Coefficients: +--------------------------------+---------------+------------------+ | Feature | Coefficient | Standard Error | +================================+===============+==================+ | const | 0.521 | 0.030 | +--------------------------------+---------------+------------------+ | Disease Site_Other | 0.246 | 0.108 | +--------------------------------+---------------+------------------+ | Disease Site_esophagus | 0.235 | 0.159 | +--------------------------------+---------------+------------------+ | Disease Site_hypopharynx | 0.147 | 0.080 | 
+--------------------------------+---------------+------------------+ | Disease Site_larynx | 0.028 | 0.043 | +--------------------------------+---------------+------------------+ | Disease Site_lip & oral cavity | 0.142 | 0.102 | +--------------------------------+---------------+------------------+ | Disease Site_nasal cavity | -0.023 | 0.123 | +--------------------------------+---------------+------------------+ | Disease Site_nasopharynx | -0.140 | 0.058 | +--------------------------------+---------------+------------------+ | Disease Site_oropharynx | 0.034 | 0.039 | +--------------------------------+---------------+------------------+ | Disease Site_unknown | -0.149 | 0.081 | +--------------------------------+---------------+------------------+
=== Subgroup Analysis for 'Disease Site' using FairLearn === +------------------------------+---------------------+---------------------+--------------------+---------------------+ | | Other | esophagus | hypopharynx | larynx | +==============================+=====================+=====================+====================+=====================+ | mean_prediction | 0.1875 | 0.14285714285714285 | 0.5483870967741935 | 0.1891891891891892 | +------------------------------+---------------------+---------------------+--------------------+---------------------+ | false_positive_rate | 0.14285714285714285 | 0.0 | 0.4444444444444444 | 0.13970588235294118 | +------------------------------+---------------------+---------------------+--------------------+---------------------+ | Relative mean_prediction | 0.957 ✅ | 0.729 ✅ | 2.798 ✅ | 0.965 ✅ | +------------------------------+---------------------+---------------------+--------------------+---------------------+ | Relative false_positive_rate | 1.381 ✅ | 0.000 ✅ | 4.296 ✅ | 1.350 ✅ | +------------------------------+---------------------+---------------------+--------------------+---------------------+ Since target_class not specified, SHAP will explain predictions for each class
100%|██████████| 100/100 [04:35<00:00, 2.76s/it]