Skip to content

Functional

jarvais.utils.functional

auprc(y_true, y_scores)

Calculate the Area Under the Precision-Recall Curve (AUPRC).

Parameters:

Name Type Description Default
y_true ndarray

True binary labels. Shape (n_samples,).

required
y_scores ndarray

Predicted scores or probabilities. Shape (n_samples,).

required

Returns:

Name Type Description
auprc_score float

The AUPRC value.

Source code in src/jarvais/utils/functional.py
 9
10
11
12
13
14
15
16
17
18
19
20
21
def auprc(y_true: np.ndarray, y_scores: np.ndarray) -> float:
    """
    Compute the area under the precision-recall curve (AUPRC).

    The precision-recall curve is obtained from `precision_recall_curve` and
    integrated with `auc`, using recall as the x-axis and precision as the
    y-axis.

    Args:
        y_true (np.ndarray): True binary labels. Shape (n_samples,).
        y_scores (np.ndarray): Predicted scores or probabilities. Shape (n_samples,).

    Returns:
        auprc_score (float): The AUPRC value.
    """
    pr_curve = precision_recall_curve(y_true, y_scores)
    # The curve is returned as (precision, recall, thresholds); thresholds are unused.
    precision_values, recall_values = pr_curve[0], pr_curve[1]
    return auc(recall_values, precision_values)

ci_wrapper(y_true, y_pred)

Wrapper for sksurv.metrics.concordance_index_censored to ensure compatibility with bootstrap_metric.

Parameters:

Name Type Description Default
y_true ndarray

A 2D NumPy array of shape (n_samples, 2): column 0 (y_true[:, 0]) holds the observed survival times, and column 1 (y_true[:, 1]) holds the event indicator (1 if the event occurred, 0 if censored).

required
y_pred ndarray

A 1D NumPy array of predicted risk scores or survival times. Higher scores typically indicate higher risk.

required

Returns:

Name Type Description
concordance_index float

The concordance index.

Source code in src/jarvais/utils/functional.py
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
def ci_wrapper(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """
    Adapt `sksurv.metrics.concordance_index_censored` to the
    `(y_true, y_pred) -> float` signature expected by `bootstrap_metric`.

    Args:
        y_true (np.ndarray): A 2D NumPy array of shape (n_samples, 2), where:
            - `y_true[:, 0]` holds the observed survival times.
            - `y_true[:, 1]` holds the event indicator
              (1 if the event occurred, 0 if censored).
        y_pred (np.ndarray): A 1D NumPy array of predicted risk scores or
            survival times. Higher scores typically indicate higher risk.

    Returns:
        concordance_index (float): The concordance index.
    """
    observed_time, event_flag = y_true[:, 0], y_true[:, 1]

    # The event indicator is passed as booleans; only the first element of the
    # returned tuple (the index itself) is kept.
    result = concordance_index_censored(
        event_flag.astype(bool), observed_time, y_pred
    )
    return result[0]

bootstrap_metric(y_true, y_pred, metric_func, nsamples=100)

Compute a metric using bootstrapping to estimate its variability.

Parameters:

Name Type Description Default
y_true ndarray

True labels. Shape (n_samples,).

required
y_pred ndarray

Predicted values. Shape (n_samples,).

required
metric_func Callable[[ndarray, ndarray], float]

A function that calculates the metric.

required
nsamples int

The number of bootstrap samples. Defaults to 100.

100

Returns:

Name Type Description
bootstrapped_values List[float]

A list of metric values computed on each bootstrap sample.

Source code in src/jarvais/utils/functional.py
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
def bootstrap_metric(
        y_true: np.ndarray,
        y_pred: np.ndarray,
        metric_func: Callable[[np.ndarray, np.ndarray], float],
        nsamples: int = 100
    ) -> List[float]:
    """
    Compute a metric using bootstrapping to estimate its variability.

    Each bootstrap replicate draws `len(y_true)` row indices with replacement,
    applies the same indices to `y_true` and `y_pred`, and evaluates
    `metric_func` on the resampled pair.

    The resampling is deterministic: a private random generator seeded with 0
    is used, so repeated calls return the same values and the process-wide
    NumPy random state is left untouched.

    Args:
        y_true (np.ndarray): True labels. Indexed along the first axis, so
            shape (n_samples,) or (n_samples, k) both work.
        y_pred (np.ndarray): Predicted values. Shape (n_samples,).
        metric_func (Callable[[np.ndarray, np.ndarray], float]): A function that calculates the metric.
        nsamples (int, optional): The number of bootstrap samples. Defaults to 100.

    Returns:
        bootstrapped_values (List[float]): A list of metric values computed on each bootstrap sample.
    """
    # Coerce to arrays so the fancy indexing below is always positional
    # (a pandas Series would otherwise be indexed by label).
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    n_obs = len(y_true)

    # A local legacy RandomState seeded with 0 yields the same index sequence
    # as `np.random.seed(0)` + `np.random.randint`, without reseeding the
    # global generator that callers may rely on.
    rng = np.random.RandomState(0)

    values = []
    for _ in range(nsamples):
        idx = rng.randint(n_obs, size=n_obs)
        values.append(metric_func(y_true[idx], y_pred[idx]))

    return values

undummify(df, prefix_sep='_')

Undummifies a DataFrame by collapsing dummy/one-hot encoded columns back into their original categorical column.

Found here: https://stackoverflow.com/a/62085741

Parameters:

Name Type Description Default
df DataFrame

The input DataFrame containing dummy/one-hot encoded columns.

required
prefix_sep str

The separator used to distinguish between the prefix (category) and the column name in the dummy columns. Defaults to "_".

'_'

Returns:

Name Type Description
undummified_df DataFrame

A new DataFrame with the undummified (reconstructed) categorical columns.

Source code in src/jarvais/utils/functional.py
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
def undummify(df, prefix_sep="_"):
    """
    Undummifies a DataFrame by collapsing dummy/one-hot encoded columns back into their original categorical column.

    Adapted from: https://stackoverflow.com/a/62085741

    A column is treated as a dummy column when its name contains `prefix_sep`;
    the text before the first separator is the name of the reconstructed
    column and the text after it is the category label. For each row the
    category is taken from the dummy column holding the largest value
    (`idxmax`). Columns without the separator are passed through unchanged.

    Args:
        df (pandas.DataFrame): The input DataFrame containing dummy/one-hot encoded columns.
        prefix_sep (str, optional): The separator used to distinguish between the prefix (category) and the column name in the dummy columns.
            Defaults to "_".

    Returns:
        undummified_df (pandas.DataFrame): A new DataFrame with the undummified (reconstructed) categorical columns.
    """
    # Map each prefix to whether it comes from a dummy (separator-bearing) column.
    dummy_cols = {
        item.split(prefix_sep)[0]: (prefix_sep in item) for item in df.columns
    }

    series_list = []
    for col, needs_to_collapse in dummy_cols.items():
        if needs_to_collapse:
            # Select by exact "<prefix><sep>" start rather than by substring:
            # a substring match would let the prefix "sex" also capture
            # unrelated columns such as "unisex_yes".
            stem = col + prefix_sep
            members = [name for name in df.columns if name.startswith(stem)]
            undummified = (
                df[members]
                .idxmax(axis=1)
                # Drop "<prefix><sep>"; the category label itself may contain the separator.
                .apply(lambda name, cut=len(stem): name[cut:])
                .rename(col)
            )
            series_list.append(undummified)
        else:
            series_list.append(df[col])

    if not series_list:
        # pd.concat rejects an empty list; a frame without columns maps to an
        # empty result that keeps the original index.
        return pd.DataFrame(index=df.index)

    undummified_df = pd.concat(series_list, axis=1)
    return undummified_df

process_RADCURE_clinical(df)

Processes RADCURE clinical data.

Raw data found here: https://www.cancerimagingarchive.net/collection/radcure/

Source code in src/jarvais/utils/functional.py
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
def process_RADCURE_clinical(df):
    """
    Convert a raw RADCURE clinical table into the renamed/encoded layout below.

    Raw data found here: https://www.cancerimagingarchive.net/collection/radcure/

    Most columns are copied under a new name. Three are encoded:
    `death` is 1 when `Status` equals 'Dead' (else 0), `Chemotherapy` is 1
    when `Chemo` is anything other than 'none' (else 0), and `HPV Combined`
    is 1 when `HPV` is a string containing 'positive' (case-insensitive) and
    None otherwise. `Disease Site` is `Ds Site` lower-cased.

    Args:
        df (pandas.DataFrame): Raw clinical table with the RADCURE column names.

    Returns:
        df_converted (pandas.DataFrame): The converted table.
    """
    def _death_flag(status):
        if status == 'Dead':
            return 1
        return 0

    def _chemo_flag(regimen):
        if regimen != 'none':
            return 1
        return 0

    def _hpv_flag(result):
        # Non-string entries (e.g. missing values) and non-positive results
        # both map to None.
        if not isinstance(result, str):
            return None
        return 1 if 'positive' in result.lower() else None

    # Built key by key, in the output column order.
    converted = {}
    converted['Study ID'] = df['patient_id']
    converted['survival_time'] = df['Length FU']
    converted['death'] = df['Status'].apply(_death_flag)
    converted['age at dx'] = df['Age']
    converted['Sex'] = df['Sex']
    converted['T Stage'] = df['T']
    converted['N Stage'] = df['N']
    converted['Stage'] = df['Stage']
    converted['Dose'] = df['Dose']
    converted['Chemotherapy'] = df['Chemo'].apply(_chemo_flag)
    converted['HPV Combined'] = df['HPV'].apply(_hpv_flag)
    converted['Smoking Status'] = df['Smoking Status']
    converted['Disease Site'] = df['Ds Site'].str.lower()

    df_converted = pd.DataFrame(converted)
    return df_converted