"""Cross-validation helpers for threshold optimization."""
import numpy as np
from sklearn.model_selection import KFold
from .optimizers import _metric_score, get_optimal_threshold
from .types import ArrayLike, OptimizationMethod, SampleWeightLike
def cv_threshold_optimization(
true_labs: ArrayLike,
pred_prob: ArrayLike,
metric: str = "f1",
method: OptimizationMethod = "smart_brute",
cv: int = 5,
random_state: int | None = None,
sample_weight: SampleWeightLike = None,
) -> tuple[np.ndarray, np.ndarray]:
"""Estimate an optimal threshold using cross-validation.
Parameters
----------
true_labs:
Array of true binary labels.
pred_prob:
Predicted probabilities from a classifier.
metric:
Metric name to optimize; must exist in the metric registry.
method:
Optimization strategy passed to
:func:`~optimal_cutoffs.optimizers.get_optimal_threshold`.
cv:
Number of folds for :class:`~sklearn.model_selection.KFold` cross-validation.
random_state:
Seed for the cross-validator shuffling.
sample_weight:
Optional array of sample weights for handling imbalanced datasets.
Returns
-------
tuple[np.ndarray, np.ndarray]
Arrays of per-fold thresholds and scores.
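
    Examples
    --------
    A minimal sketch on synthetic data; the probabilities below merely
    stand in for a real classifier's output:

    >>> import numpy as np
    >>> rng = np.random.default_rng(0)
    >>> y = rng.integers(0, 2, size=200)
    >>> p = np.clip(y * 0.6 + rng.normal(0.2, 0.2, size=200), 0, 1)
    >>> thresholds, scores = cv_threshold_optimization(
    ...     y, p, metric="f1", cv=5, random_state=0
    ... )
    >>> thresholds.shape, scores.shape
    ((5,), (5,))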
"""
true_labs = np.asarray(true_labs)
pred_prob = np.asarray(pred_prob)
if sample_weight is not None:
sample_weight = np.asarray(sample_weight)
kf = KFold(n_splits=cv, shuffle=True, random_state=random_state)
thresholds = []
scores = []
for train_idx, test_idx in kf.split(true_labs):
        # Slice per-fold sample weights when provided
train_weights = None if sample_weight is None else sample_weight[train_idx]
test_weights = None if sample_weight is None else sample_weight[test_idx]
thr = get_optimal_threshold(
true_labs[train_idx],
pred_prob[train_idx],
metric=metric,
method=method,
sample_weight=train_weights,
)
thresholds.append(thr)
score = _metric_score(
true_labs[test_idx], pred_prob[test_idx], thr, metric, test_weights
)
scores.append(score)
return np.array(thresholds), np.array(scores)
def nested_cv_threshold_optimization(
true_labs: ArrayLike,
pred_prob: ArrayLike,
metric: str = "f1",
method: OptimizationMethod = "smart_brute",
inner_cv: int = 5,
outer_cv: int = 5,
random_state: int | None = None,
sample_weight: SampleWeightLike = None,
) -> tuple[np.ndarray, np.ndarray]:
"""Nested cross-validation for threshold optimization.
Parameters
----------
true_labs:
Array of true binary labels.
pred_prob:
Predicted probabilities from a classifier.
metric:
Metric name to optimize.
method:
Optimization strategy passed to
:func:`~optimal_cutoffs.optimizers.get_optimal_threshold`.
inner_cv:
Number of folds in the inner loop used to estimate thresholds.
outer_cv:
Number of outer folds for unbiased performance assessment.
random_state:
Seed for the cross-validators.
sample_weight:
Optional array of sample weights for handling imbalanced datasets.
Returns
-------
tuple[np.ndarray, np.ndarray]
Arrays of outer-fold thresholds and scores.
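
    Examples
    --------
    A minimal sketch mirroring the example for
    :func:`cv_threshold_optimization`; three folds in each loop keeps the
    run cheap:

    >>> import numpy as np
    >>> rng = np.random.default_rng(0)
    >>> y = rng.integers(0, 2, size=200)
    >>> p = np.clip(y * 0.6 + rng.normal(0.2, 0.2, size=200), 0, 1)
    >>> thresholds, scores = nested_cv_threshold_optimization(
    ...     y, p, inner_cv=3, outer_cv=3, random_state=0
    ... )
    >>> thresholds.shape, scores.shape
    ((3,), (3,))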
"""
true_labs = np.asarray(true_labs)
pred_prob = np.asarray(pred_prob)
if sample_weight is not None:
sample_weight = np.asarray(sample_weight)
outer = KFold(n_splits=outer_cv, shuffle=True, random_state=random_state)
outer_thresholds = []
outer_scores = []
for train_idx, test_idx in outer.split(true_labs):
        # Slice sample weights for the outer train/test split when provided
train_weights = None if sample_weight is None else sample_weight[train_idx]
test_weights = None if sample_weight is None else sample_weight[test_idx]
inner_thresholds, _ = cv_threshold_optimization(
true_labs[train_idx],
pred_prob[train_idx],
metric=metric,
method=method,
cv=inner_cv,
random_state=random_state,
sample_weight=train_weights,
)
        # Aggregate the inner-fold estimates into one threshold per outer fold
        thr = float(np.mean(inner_thresholds))
outer_thresholds.append(thr)
score = _metric_score(
true_labs[test_idx], pred_prob[test_idx], thr, metric, test_weights
)
outer_scores.append(score)
return np.array(outer_thresholds), np.array(outer_scores)