Source code for optimal_cutoffs.cv

"""Cross-validation helpers for threshold optimization."""

import numpy as np
from sklearn.model_selection import KFold

from .optimizers import _metric_score, get_optimal_threshold
from .types import ArrayLike, OptimizationMethod, SampleWeightLike


def cv_threshold_optimization(
    true_labs: ArrayLike,
    pred_prob: ArrayLike,
    metric: str = "f1",
    method: OptimizationMethod = "smart_brute",
    cv: int = 5,
    random_state: int | None = None,
    sample_weight: SampleWeightLike = None,
) -> tuple[np.ndarray, np.ndarray]:
    """Estimate an optimal threshold using cross-validation.

    Parameters
    ----------
    true_labs:
        Array of true binary labels.
    pred_prob:
        Predicted probabilities from a classifier.
    metric:
        Metric name to optimize; must exist in the metric registry.
    method:
        Optimization strategy passed to
        :func:`~optimal_cutoffs.optimizers.get_optimal_threshold`.
    cv:
        Number of folds for :class:`~sklearn.model_selection.KFold`
        cross-validation.
    random_state:
        Seed for the cross-validator shuffling.
    sample_weight:
        Optional array of sample weights for handling imbalanced datasets.

    Returns
    -------
    tuple[np.ndarray, np.ndarray]
        Arrays of per-fold thresholds and scores.
    """
    true_labs = np.asarray(true_labs)
    pred_prob = np.asarray(pred_prob)
    if sample_weight is not None:
        sample_weight = np.asarray(sample_weight)

    kf = KFold(n_splits=cv, shuffle=True, random_state=random_state)
    thresholds = []
    scores = []
    for train_idx, test_idx in kf.split(true_labs):
        # Extract per-fold sample weights (if provided)
        train_weights = None if sample_weight is None else sample_weight[train_idx]
        test_weights = None if sample_weight is None else sample_weight[test_idx]

        # Fit the threshold on the training fold
        thr = get_optimal_threshold(
            true_labs[train_idx],
            pred_prob[train_idx],
            metric=metric,
            method=method,
            sample_weight=train_weights,
        )
        thresholds.append(thr)

        # Score the fitted threshold on the held-out fold
        score = _metric_score(
            true_labs[test_idx], pred_prob[test_idx], thr, metric, test_weights
        )
        scores.append(score)
    return np.array(thresholds), np.array(scores)
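A minimal usage sketch for this function, assuming synthetic labels and probabilities generated here purely for illustration (the variable names y_true and y_prob are not part of the library):

    import numpy as np
    from optimal_cutoffs.cv import cv_threshold_optimization

    rng = np.random.default_rng(0)
    y_true = rng.integers(0, 2, size=200)
    # Noisy probabilities loosely correlated with the labels
    y_prob = np.clip(0.6 * y_true + rng.normal(0.2, 0.25, size=200), 0.0, 1.0)

    thresholds, scores = cv_threshold_optimization(
        y_true, y_prob, metric="f1", cv=5, random_state=42
    )
    print("per-fold thresholds:", thresholds)
    print("mean held-out score:", scores.mean())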
def nested_cv_threshold_optimization(
    true_labs: ArrayLike,
    pred_prob: ArrayLike,
    metric: str = "f1",
    method: OptimizationMethod = "smart_brute",
    inner_cv: int = 5,
    outer_cv: int = 5,
    random_state: int | None = None,
    sample_weight: SampleWeightLike = None,
) -> tuple[np.ndarray, np.ndarray]:
    """Nested cross-validation for threshold optimization.

    Parameters
    ----------
    true_labs:
        Array of true binary labels.
    pred_prob:
        Predicted probabilities from a classifier.
    metric:
        Metric name to optimize.
    method:
        Optimization strategy passed to
        :func:`~optimal_cutoffs.optimizers.get_optimal_threshold`.
    inner_cv:
        Number of folds in the inner loop used to estimate thresholds.
    outer_cv:
        Number of outer folds for unbiased performance assessment.
    random_state:
        Seed for the cross-validators.
    sample_weight:
        Optional array of sample weights for handling imbalanced datasets.

    Returns
    -------
    tuple[np.ndarray, np.ndarray]
        Arrays of outer-fold thresholds and scores.
    """
    true_labs = np.asarray(true_labs)
    pred_prob = np.asarray(pred_prob)
    if sample_weight is not None:
        sample_weight = np.asarray(sample_weight)

    outer = KFold(n_splits=outer_cv, shuffle=True, random_state=random_state)
    outer_thresholds = []
    outer_scores = []
    for train_idx, test_idx in outer.split(true_labs):
        # Extract per-fold sample weights (if provided)
        train_weights = None if sample_weight is None else sample_weight[train_idx]
        test_weights = None if sample_weight is None else sample_weight[test_idx]

        # Inner loop: estimate thresholds on the outer training fold
        inner_thresholds, _ = cv_threshold_optimization(
            true_labs[train_idx],
            pred_prob[train_idx],
            metric=metric,
            method=method,
            cv=inner_cv,
            random_state=random_state,
            sample_weight=train_weights,
        )
        # Aggregate the inner-fold thresholds and evaluate on the outer test fold
        thr = float(np.mean(inner_thresholds))
        outer_thresholds.append(thr)
        score = _metric_score(
            true_labs[test_idx], pred_prob[test_idx], thr, metric, test_weights
        )
        outer_scores.append(score)
    return np.array(outer_thresholds), np.array(outer_scores)
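A corresponding sketch for the nested variant, again on assumed synthetic data; the returned scores come from the outer test folds, so they give a less biased estimate of how the averaged threshold generalizes:

    import numpy as np
    from optimal_cutoffs.cv import nested_cv_threshold_optimization

    rng = np.random.default_rng(1)
    y_true = rng.integers(0, 2, size=300)
    y_prob = np.clip(0.6 * y_true + rng.normal(0.2, 0.25, size=300), 0.0, 1.0)

    outer_thresholds, outer_scores = nested_cv_threshold_optimization(
        y_true, y_prob, metric="f1", inner_cv=3, outer_cv=5, random_state=1
    )
    print("outer-fold thresholds:", outer_thresholds)
    print("unbiased score estimate:", outer_scores.mean())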