TrainerSupervised

The TrainerSupervised class is part of the jarvais.trainer module.

jarvais.trainer.TrainerSupervised

TrainerSupervised class for supervised jarvAIs workflows.

This class provides functionality for feature reduction, training models (e.g., AutoGluon, survival models), and performing inference. It supports various tasks such as binary/multiclass classification, regression, and survival analysis.

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| `task` | `str` | Type of task. Must be one of `{'binary', 'multiclass', 'regression', 'survival'}`; if `None`, AutoGluon infers the task. |
| `reduction_method` | `str \| None` | Feature reduction method. Supported methods: `{'mrmr', 'variance_threshold', 'corr', 'chi2'}`. |
| `keep_k` | `int` | Number of features to retain during reduction. |
| `output_dir` | `str \| Path` | Directory for saving outputs. Defaults to the current working directory. |

Example

```python
from jarvais.trainer import TrainerSupervised

trainer = TrainerSupervised(
    task="binary",
    reduction_method="mrmr",
    keep_k=10,
    output_dir="./results"
)
trainer.run(data=my_data, target_variable="target")

predictions = trainer.infer(new_data)
```
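For survival analysis, `target_variable` is passed as a list (see the `isinstance(target_variable, list)` branch in `run`), and the survival training path expects `time` and `event` columns. A minimal sketch, assuming a DataFrame `df` that contains those two columns alongside its features:

```python
from jarvais.trainer import TrainerSupervised

# Hypothetical survival dataset: `df` must include 'time' and 'event' columns.
trainer = TrainerSupervised(task="survival", output_dir="./survival_results")
trainer.run(data=df, target_variable=["time", "event"])
```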
Source code in `src/jarvais/trainer/trainer.py`:

````python
class TrainerSupervised:
    """
    TrainerSupervised class for supervised jarvAIs workflows.

    This class provides functionality for feature reduction, training models (e.g., AutoGluon, survival models), 
    and performing inference. It supports various tasks such as binary/multiclass classification, regression, 
    and survival analysis.

    Attributes:
        task (str, optional): Type of task. Must be one of {'binary', 'multiclass', 'regression', 'survival'};
            if None, AutoGluon infers the task.
        reduction_method (str | None, optional): Feature reduction method. Supported methods include 
            {'mrmr', 'variance_threshold', 'corr', 'chi2'}.
        keep_k (int, optional): Number of features to retain during reduction.
        output_dir (str | Path, optional): Directory for saving outputs. Defaults to the current working directory.

    Example:
        ```python
        from jarvais.trainer import TrainerSupervised

        trainer = TrainerSupervised(
            task="binary",
            reduction_method="mrmr",
            keep_k=10,
            output_dir="./results"
        )
        trainer.run(data=my_data, target_variable="target")

        predictions = trainer.infer(new_data)
        ```
    """
    def __init__(
            self,
            task: str | None = None,
            reduction_method: str | None = None,
            keep_k: int = 2,
            output_dir: str | Path | None = None
        ) -> None:

        self.task = task
        self.reduction_method = reduction_method
        self.keep_k = keep_k

        if task not in ['binary', 'multiclass', 'regression', 'survival', None]:
            raise ValueError(
                "Invalid task parameter. Choose one of: 'binary', 'multiclass', 'regression', 'survival'. "
                "Providing None lets AutoGluon infer the task."
            )

        self.output_dir = Path.cwd() if output_dir is None else Path(output_dir)
        self.output_dir.mkdir(exist_ok=True, parents=True)

    def _feature_reduction(self, X: pd.DataFrame, y: pd.DataFrame | pd.Series) -> pd.DataFrame:
        """
        Reduce features based on the specified reduction method. 

        One-hot encoding applied before reduction and reverted afterward.
        """
        # Step 1: Identify categorical columns
        categorical_columns = X.select_dtypes(include=['object', 'category']).columns.tolist()

        mapping = {}

        def find_category_mappings(df, variable):
            # Map each observed category to a stable integer code
            return {k: i for i, k in enumerate(df[variable].dropna().unique())}

        def integer_encode(df, variable, ordinal_mapping):
            df[variable] = df[variable].map(ordinal_mapping)

        # Step 2: Integer-encode categorical columns
        for variable in categorical_columns:
            mapping[variable] = find_category_mappings(X, variable)
            integer_encode(X, variable, mapping[variable])

        # Step 3: Perform feature reduction
        if self.reduction_method == 'mrmr':
            X_reduced = mrmr_reduction(self.task, X, y, self.keep_k)
        elif self.reduction_method == 'variance_threshold':
            X_reduced = var_reduction(X, y)
        elif self.reduction_method == 'corr':
            X_reduced = kbest_reduction(self.task, X, y, self.keep_k)
        elif self.reduction_method == 'chi2':
            if self.task not in ['binary', 'multiclass']:
                raise ValueError('chi-squared reduction can only be done with classification tasks')
            X_reduced = chi2_reduction(X, y, self.keep_k)
        else:
            raise ValueError(f'Unsupported reduction method: {self.reduction_method}')

        # Step 4: Revert integer encoding for retained categorical columns
        for col in categorical_columns:
            if col in X_reduced.columns:
                inv_map = {v: k for k, v in mapping[col].items()}
                X_reduced[col] = X_reduced[col].map(inv_map)

        return X_reduced

    def _train_autogluon_with_cv(self) -> None:
        self.predictors, leaderboard, self.best_fold, self.X_val, self.y_val = train_autogluon_with_cv(
            pd.concat([self.X_train, self.y_train], axis=1),
            pd.concat([self.X_test, self.y_test], axis=1),
            target_variable=self.target_variable,
            task=self.task,
            extra_metrics=self.extra_metrics,
            eval_metric=self.eval_metric,
            num_folds=self.k_folds,
            output_dir=(self.output_dir / 'autogluon_models'),
            **self.kwargs
        )

        self.predictor = self.predictors[self.best_fold]
        self.trainer_config['best_fold'] = self.best_fold

        # Update train data to remove validation
        self.X_train = self.X_train[~self.X_train.index.isin(self.X_val.index)]
        self.y_train = self.y_train[~self.y_train.index.isin(self.y_val.index)]

        print('\nModel Leaderboard (Displays values in "mean [min, max]" format across training folds)\n------------------------------------------------------------------------------------')
        print(tabulate(
            leaderboard.sort_values(by='score_test', ascending=False)[self.show_leaderboard],
            tablefmt = "grid",
            headers="keys",
            showindex=False
        ))

    def _train_autogluon(self) -> None:
        self.predictor = TabularPredictor(
            label=self.target_variable, 
            problem_type=self.task, 
            eval_metric=self.eval_metric,
            path=(self.output_dir / 'autogluon_models' / 'autogluon_models_best_fold'),
            log_to_file=False,
        ).fit(
            pd.concat([self.X_train, self.y_train], axis=1),
            **self.kwargs
        )

        self.X_val, self.y_val = self.predictor.load_data_internal(data='val', return_y=True)
        # Update train data to remove validation
        self.X_train = self.X_train[~self.X_train.index.isin(self.X_val.index)]
        self.y_train = self.y_train[~self.y_train.index.isin(self.y_val.index)]

        train_leaderboard = self.predictor.leaderboard(
            pd.concat([self.X_train, self.y_train], axis=1),
            extra_metrics=self.extra_metrics).round(2)
        val_leaderboard = self.predictor.leaderboard(
            pd.concat([self.X_val, self.y_val], axis=1),
            extra_metrics=self.extra_metrics).round(2)
        test_leaderboard = self.predictor.leaderboard(
            pd.concat([self.X_test, self.y_test], axis=1),
            extra_metrics=self.extra_metrics).round(2)

        leaderboard = pd.merge(
            pd.merge(
                format_leaderboard(train_leaderboard, self.extra_metrics, 'score_train'),
                format_leaderboard(val_leaderboard, self.extra_metrics, 'score_val'),
                on='model'
            ),
            format_leaderboard(test_leaderboard, self.extra_metrics, 'score_test'),
            on='model'
        )

        print('\nModel Leaderboard\n----------------')
        print(tabulate(
            leaderboard.sort_values(by='score_test', ascending=False)[self.show_leaderboard],
            tablefmt = "grid",
            headers="keys",
            showindex=False))

    def run(
            self,
            data: pd.DataFrame,
            target_variable: str,
            test_size: float = 0.2,
            exclude: List[str] | None = None,
            stratify_on: str | None = None,
            explain: bool = False,
            k_folds: int = 5,
            **kwargs: dict
        ) -> None:
        """
        Execute the jarvAIs Trainer pipeline on the given dataset.

        Args:
            data (pd.DataFrame): The input dataset containing features and target.
            target_variable (str): The name of the target variable in the dataset.
            test_size (float, optional): Proportion of the dataset to include in the test split. 
                Must be between 0 and 1. Default is 0.2.
            exclude (list of str, optional): List of columns to exclude from the feature set. 
                Default is an empty list.
            stratify_on (str, optional): Column to use for stratification, if any. 
                Must be compatible with `target_variable`.
            explain (bool, optional): Whether to generate explainability reports for the model. 
                Default is False.
            k_folds (int, optional): Number of folds for cross-validation. If 1, uses AutoGluon-specific validation. 
                Default is 5.
            kwargs (dict, optional): Additional arguments passed to the AutoGluon predictor's `fit` method.
        """
        self.trainer_config = dict()
        self.trainer_config['task'] = self.task
        self.trainer_config['output_dir'] = self.output_dir.as_posix()

        self.target_variable = target_variable
        self.trainer_config['target_variable'] = target_variable
        self.k_folds = k_folds
        self.trainer_config['k_folds'] = k_folds
        self.kwargs = kwargs

        self.trainer_config['test_size'] = test_size
        self.trainer_config['stratify_on'] = stratify_on

        # Initialize mutable defaults
        if exclude is None:
            exclude = []

        if isinstance(target_variable, list): # Happens for survival data
            exclude += target_variable
        else:
            exclude.append(target_variable)

        try:
            X = data.drop(columns=exclude)
            y = data[target_variable]
        except KeyError as e:
            raise ValueError(f"Invalid column specified: {e}") from e

        # Optional feature reduction
        if getattr(self, "reduction_method", None):
            print(f"Applying {self.reduction_method} for feature reduction")
            X = self._feature_reduction(X, y)
            print(f"Features retained: {list(X.columns)}")

            self.feature_names = list(X.columns)
            self.trainer_config['reduction_method'] = self.reduction_method
            self.trainer_config['reduced_feature_set'] = self.feature_names

        if self.task in {'binary', 'multiclass'}:
            stratify_col = (
                y.astype(str) + '_' + data[stratify_on].astype(str)
                if stratify_on is not None
                else y
            )
        else:
            stratify_col = None

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=test_size, stratify=stratify_col, random_state=42)

        if self.task == 'survival':
            self.predictors, scores, data_train, data_val = train_survival_models(
                self.X_train, 
                self.y_train, 
                self.X_test, 
                self.y_test, 
                self.output_dir
            )
            self.predictor = self.predictors[max(scores, key=scores.get)]
            self.trainer_config['survival_models_info'] = scores

            self.X_train, self.y_train = data_train.drop(columns=['time', 'event']), data_train[['time', 'event']] 
            self.X_val, self.y_val = data_val.drop(columns=['time', 'event']), data_val[['time', 'event']] 
        else:
            (self.output_dir / 'autogluon_models').mkdir(exist_ok=True, parents=True)

            if self.task in ['binary', 'multiclass']:
                self.eval_metric = 'roc_auc'
            elif self.task == 'regression':
                self.eval_metric = 'r2'

            ag_auprc_scorer = make_scorer(
                name='auprc',  # TODO: move this to a separate file?
                score_func=auprc,
                optimum=1,
                greater_is_better=True,
                needs_class=True)

            # When changing extra_metrics consider where it's used and make updates accordingly
            self.extra_metrics = ['f1', ag_auprc_scorer] if self.task in ['binary', 'multiclass'] else ['root_mean_squared_error']
            self.show_leaderboard = ['model', 'score_test', 'score_val', 'score_train']

            custom_hyperparameters = get_hyperparameter_config('default')
            custom_hyperparameters[SimpleRegressionModel] = {}
            kwargs['hyperparameters'] = custom_hyperparameters

            if k_folds > 1:
                self._train_autogluon_with_cv()
            else:
                self._train_autogluon()

        self.trained = True

        self.data_dir = self.output_dir / 'data'
        self.data_dir.mkdir(parents=True, exist_ok=True)
        self.X_train.to_csv((self.data_dir / 'X_train.csv'), index=False)
        self.X_test.to_csv((self.data_dir / 'X_test.csv'), index=False)
        self.X_val.to_csv((self.data_dir / 'X_val.csv'), index=False)
        self.y_train.to_csv((self.data_dir / 'y_train.csv'), index=False)
        self.y_test.to_csv((self.data_dir / 'y_test.csv'), index=False)
        self.y_val.to_csv((self.data_dir / 'y_val.csv'), index=False)

        with (self.output_dir / 'trainer_config.yaml').open('w') as f:
            yaml.dump(self.trainer_config, f)

        if explain:
            explainer = Explainer.from_trainer(self)
            explainer.run()

    def model_names(self) -> List[str]:
        """
        Returns all trainer model names.

        This method retrieves the names of all models associated with the 
        current predictor. It requires that the predictor has been trained.

        Returns:
            list: A list of model names available in the predictor.

        Raises:
            ValueError: If the model has not been trained (`self.trained` is False).
        """
        if not self.trained:
            raise ValueError("The model must be trained before accessing model names.")

        if self.task == 'survival':
            return list(self.predictors.keys())
        else:        
            return self.predictor.model_names()

    def infer(self, data: pd.DataFrame, model: str | None = None) -> np.ndarray:
        """
        Perform inference using the trained predictor on the provided data.

        This method generates predictions based on the input data using the 
        specified model. If no model is provided, the default model is used. 
        The predictor must be trained before inference can be performed.

        Args:
            data (pd.DataFrame): The input data for which inference is to be performed.
            model (str, optional): The name of the model to use for inference. 
                If None, the default model is used.

        Returns:
            np.ndarray: The prediction results from the model.

        Raises:
            ValueError: If the model has not been trained (`self.trained` is False).
            ValueError: If the specified model name is not found in the predictor.
        """
        if not self.trained:
            raise ValueError("The model must be trained before performing inference.")
        if model is not None and model not in self.model_names():
            raise ValueError(f"Model '{model}' not in trainer. Use model_names() to list available models.")

        if self.task == 'survival':
            if model is None:
                inference = self.predictor.predict(data)
            else:
                inference = self.predictors[model].predict(data)
        else:
            if self.predictor.can_predict_proba:
                inference = self.predictor.predict_proba(data, model, as_pandas=False)[:, 1]
            else:
                inference = self.predictor.predict(data, model, as_pandas=False)

        return inference

    @classmethod
    def load_trainer(cls, project_dir: str | Path):
        """
        Load a trained TrainerSupervised from the specified directory.

        Args:
            project_dir (str or Path): The directory where the trainer was run.

        Returns:
            trainer (TrainerSupervised): The loaded Trainer.
        """
        project_dir = Path(project_dir)
        with (project_dir / 'trainer_config.yaml').open('r') as f:
            trainer_config = yaml.safe_load(f)

        trainer = cls()
        trainer.task = trainer_config['task']
        trainer.output_dir = project_dir

        if trainer.task == 'survival':
            model_dir = (project_dir / 'survival_models')

            trainer.predictors = {}
            model_info = trainer_config['survival_models_info']
            for model_name in model_info:
                if model_name == 'MTLR':
                    trainer.predictors[model_name] = LitMTLR.load_from_checkpoint(model_dir / "MTLR.ckpt")
                elif model_name == 'DeepSurv':
                    trainer.predictors[model_name] = LitDeepSurv.load_from_checkpoint(model_dir / "DeepSurv.ckpt")
                else:
                    with (model_dir / f'{model_name}.pkl').open("rb") as f:
                        trainer.predictors[model_name] = pickle.load(f)

            trainer.predictor = trainer.predictors[max(model_info, key=model_info.get)]
        else:
            model_dir = (project_dir / 'autogluon_models' / 'autogluon_models_best_fold')
            trainer.predictor = TabularPredictor.load(model_dir, verbosity=1)

        trainer.trained = True

        trainer.X_test = pd.read_csv(project_dir / 'data' / 'X_test.csv')
        trainer.X_val = pd.read_csv(project_dir / 'data' / 'X_val.csv')
        trainer.X_train = pd.read_csv(project_dir / 'data' / 'X_train.csv')
        trainer.y_test = pd.read_csv(project_dir / 'data' / 'y_test.csv').squeeze()
        trainer.y_val = pd.read_csv(project_dir / 'data' / 'y_val.csv').squeeze()
        trainer.y_train = pd.read_csv(project_dir / 'data' / 'y_train.csv').squeeze()

        return trainer
````

run(data, target_variable, test_size=0.2, exclude=None, stratify_on=None, explain=False, k_folds=5, **kwargs)

Execute the jarvAIs Trainer pipeline on the given dataset.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `data` | `DataFrame` | The input dataset containing features and target. | *required* |
| `target_variable` | `str` | The name of the target variable in the dataset. | *required* |
| `test_size` | `float` | Proportion of the dataset to include in the test split. Must be between 0 and 1. | `0.2` |
| `exclude` | `list of str` | Columns to exclude from the feature set. Defaults to an empty list. | `None` |
| `stratify_on` | `str` | Column to use for stratification, if any. Must be compatible with `target_variable`. | `None` |
| `explain` | `bool` | Whether to generate explainability reports for the model. | `False` |
| `k_folds` | `int` | Number of folds for cross-validation. If 1, uses AutoGluon's internal validation. | `5` |
| `kwargs` | `dict` | Additional arguments passed to the AutoGluon predictor's `fit` method. | `{}` |
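A usage sketch combining the optional arguments above; `df`, `patient_id`, and `site` are hypothetical names, and per the source, `stratify_on` is combined with the target for stratified splitting in classification tasks:

```python
trainer = TrainerSupervised(task="binary", output_dir="./results")
trainer.run(
    data=df,                 # hypothetical DataFrame of features + target
    target_variable="target",
    test_size=0.2,           # 80/20 train/test split
    exclude=["patient_id"],  # hypothetical ID column dropped from the feature set
    stratify_on="site",      # stratifies jointly on target + site (classification only)
    k_folds=1,               # 1 -> AutoGluon's internal validation instead of CV
)
```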

model_names()

Returns all trainer model names.

This method retrieves the names of all models associated with the current predictor. It requires that the predictor has been trained.

Returns:

| Name | Type | Description |
| --- | --- | --- |
| `list` | `List[str]` | A list of model names available in the predictor. |

Raises:

| Type | Description |
| --- | --- |
| `ValueError` | If the model has not been trained (`self.trained` is False). |
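A short usage sketch; the returned names depend on what was actually trained (AutoGluon model names, or the survival predictor keys such as 'MTLR' and 'DeepSurv' seen in `load_trainer`), so the printed output is illustrative:

```python
for name in trainer.model_names():
    print(name)  # e.g. AutoGluon models, or survival keys like 'MTLR', 'DeepSurv'
```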


infer(data, model=None)

Perform inference using the trained predictor on the provided data.

This method generates predictions based on the input data using the specified model. If no model is provided, the default model is used. The predictor must be trained before inference can be performed.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `data` | `DataFrame` | The input data for which inference is to be performed. | *required* |
| `model` | `str` | The name of the model to use for inference. If `None`, the default model is used. | `None` |

Returns:

| Type | Description |
| --- | --- |
| `np.ndarray` | The prediction results from the model. |

Raises:

| Type | Description |
| --- | --- |
| `ValueError` | If the model has not been trained (`self.trained` is False). |
| `ValueError` | If the specified model name is not found in the predictor. |
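A sketch of both calling styles, assuming a trained binary-task `trainer` and a hypothetical feature DataFrame `new_data`. Note from the source that when the predictor supports `predict_proba`, `infer` returns the positive-class probability column rather than hard labels:

```python
# Default model: positive-class probabilities for a binary task
probs = trainer.infer(new_data)

# A specific model; the name must appear in trainer.model_names()
probs_first = trainer.infer(new_data, model=trainer.model_names()[0])
```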


load_trainer(project_dir) classmethod

Load a trained TrainerSupervised from the specified directory.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `project_dir` | `str or Path` | The directory where the trainer was run. | *required* |

Returns:

| Name | Type | Description |
| --- | --- | --- |
| `trainer` | `TrainerSupervised` | The loaded Trainer. |
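A round-trip sketch: load a previously run trainer from its output directory (the same `output_dir` used at construction) and reuse it for inference. The held-out splits are restored from the `data/` subdirectory, so `trainer.X_test` is available immediately:

```python
from jarvais.trainer import TrainerSupervised

trainer = TrainerSupervised.load_trainer("./results")
predictions = trainer.infer(trainer.X_test)
```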
