# Source code for optimal_cutoffs.binary

"""Binary classification threshold optimization.

This module implements threshold optimization for binary classification problems
where we have a single decision threshold τ and predict positive if p ≥ τ.

Key algorithms:
- optimize_f1_binary(): Sort-and-scan O(n log n) for F-measures
- optimize_utility_binary(): Closed-form O(1) for linear utilities
- optimize_metric_binary(): General metric optimization

All functions assume calibrated probabilities: E[y|p] = p
"""

from __future__ import annotations

import numpy as np
from numpy.typing import ArrayLike

from .core import OptimizationResult, Task
from .validation import validate_binary_classification


def optimize_f1_binary(
    true_labels: ArrayLike,
    pred_proba: ArrayLike,
    *,
    beta: float = 1.0,
    sample_weight: ArrayLike | None = None,
    comparison: str = ">",
) -> OptimizationResult:
    """Optimize F-beta score for binary classification using sort-and-scan.

    Uses the O(n log n) sort-and-scan algorithm exploiting the piecewise
    structure of F-beta metrics. This finds the exact optimal threshold.

    Parameters
    ----------
    true_labels
        True binary labels in {0, 1}. Shape: (n_samples,)
    pred_proba
        Predicted probabilities for positive class in [0, 1]. Shape: (n_samples,)
    beta
        F-beta parameter. beta=1 gives F1 score
    sample_weight
        Sample weights. Shape: (n_samples,)
    comparison
        Comparison operator for threshold. Must be ">" or ">="

    Returns
    -------
    OptimizationResult
        Result with optimal threshold, F-beta score, and predict function

    Raises
    ------
    ValueError
        If ``comparison`` is not ">" or ">=".

    Examples
    --------
    >>> y_true = [0, 1, 1, 0, 1]
    >>> y_prob = [0.2, 0.8, 0.7, 0.3, 0.9]
    >>> result = optimize_f1_binary(y_true, y_prob)
    >>> result.threshold
    0.5
    >>> result.score  # F1 score at optimal threshold
    0.8
    """
    # Fail fast on an invalid operator. Previously any unrecognized string was
    # silently treated as ">" via inclusive=(comparison == ">="), contradicting
    # the documented contract.
    if comparison not in (">", ">="):
        raise ValueError(f'comparison must be ">" or ">=", got {comparison!r}')

    # Import here to avoid circular imports
    from .piecewise import optimal_threshold_sortscan

    # Validate inputs
    true_labels, pred_proba, sample_weight = validate_binary_classification(
        true_labels, pred_proba, sample_weight, require_proba=True
    )

    # Create F-beta metric function name
    if beta == 1.0:
        metric_name = "f1"
    else:
        # Register F-beta metric if not already registered
        from .metrics_core import register_metric

        def fbeta_metric(tp, tn, fp, fn):
            # Vectorized F-beta computed from confusion-matrix counts.
            tp, tn, fp, fn = (
                np.asarray(tp),
                np.asarray(tn),
                np.asarray(fp),
                np.asarray(fn),
            )
            denom = (1 + beta**2) * tp + beta**2 * fp + fn
            # Guard the 0/0 case (no positives predicted or present) -> 0.0
            return np.where(denom > 0, (1 + beta**2) * tp / denom, 0.0)

        metric_name = f"f{beta}_score"
        register_metric(metric_name, fbeta_metric, is_piecewise=True, maximize=True)

    # Use sort-and-scan optimization
    result = optimal_threshold_sortscan(
        true_labels,
        pred_proba,
        metric=metric_name,
        sample_weight=sample_weight,
        inclusive=(comparison == ">="),
        require_proba=True,
        tolerance=1e-12,
    )

    # The result already has a predict function, but we need to handle
    # different input formats: (n,), (n, 1), or (n, 2) probability arrays.
    def predict_binary(probs: ArrayLike) -> np.ndarray:
        p = np.asarray(probs)
        if p.ndim == 2 and p.shape[1] == 2:
            p = p[:, 1]  # Extract positive class probabilities
        elif p.ndim == 2 and p.shape[1] == 1:
            p = p.ravel()
        return result.predict(p)

    return OptimizationResult(
        thresholds=result.thresholds,
        scores=result.scores,
        predict=predict_binary,
        task=Task.BINARY,
        metric=f"f{beta}_score" if beta != 1.0 else "f1_score",
        n_classes=2,
    )
def optimize_utility_binary(
    true_labels: ArrayLike | None,
    pred_proba: ArrayLike,
    *,
    utility: dict[str, float],
    sample_weight: ArrayLike | None = None,
) -> OptimizationResult:
    """Optimize binary classification using utility/cost specification.

    Computes the Bayes-optimal threshold using the closed-form formula:

        τ* = (u_tn - u_fp) / [(u_tp - u_fn) + (u_tn - u_fp)]

    This is exact and runs in O(1) time.

    Parameters
    ----------
    true_labels
        True binary labels. Can be None for pure Bayes optimization.
        Shape: (n_samples,)
        NOTE(review): this argument is never read by the body; the threshold
        is derived purely from the utility spec.
    pred_proba
        Predicted probabilities for positive class in [0, 1]. Shape: (n_samples,)
    utility
        Utility specification with keys "tp", "tn", "fp", "fn"
    sample_weight
        Sample weights (affects expected utility computation). Shape: (n_samples,)
        NOTE(review): sample_weight is accepted but never passed to
        expected_utility below — confirm against BayesOptimal's API whether
        this is an omission.

    Returns
    -------
    OptimizationResult
        Result with optimal threshold, expected utility, and predict function

    Raises
    ------
    ValueError
        If probabilities are not in the range [0, 1] for utility optimization.

    Examples
    --------
    >>> # FN costs 5x more than FP
    >>> utility = {"tp": 10, "tn": 1, "fp": -1, "fn": -5}
    >>> result = optimize_utility_binary(None, y_prob, utility=utility)
    >>> result.threshold  # Closed-form optimal
    0.167

    NOTE(review): with these numbers the formula above gives
    (1 - (-1)) / [(10 - (-5)) + (1 - (-1))] = 2/17 ≈ 0.118, not 0.167 —
    the doctest value looks stale; verify against BayesOptimal.
    """
    # Local import to avoid circular imports at module load time.
    from .bayes import BayesOptimal, UtilitySpec

    # Validate probabilities; accept (n,), (n, 1), or (n, 2) input shapes.
    pred_proba = np.asarray(pred_proba, dtype=np.float64)
    if pred_proba.ndim == 2 and pred_proba.shape[1] == 2:
        pred_proba = pred_proba[:, 1]  # Extract positive class
    elif pred_proba.ndim == 2 and pred_proba.shape[1] == 1:
        pred_proba = pred_proba.ravel()
    if not np.all((pred_proba >= 0) & (pred_proba <= 1)):
        raise ValueError("Probabilities must be in [0, 1] for utility optimization")

    # Create utility specification
    utility_spec = UtilitySpec.from_dict(utility)
    optimizer = BayesOptimal(utility_spec)

    # Compute optimal threshold (closed form)
    threshold = optimizer.compute_threshold()

    # Compute expected utility on this data.
    # NOTE(review): sample_weight is not forwarded here (see docstring).
    expected_utility = optimizer.expected_utility(pred_proba)

    # Hard decision rule: always inclusive (p >= τ), regardless of any
    # comparison convention used elsewhere in this module.
    def predict_binary(probs: ArrayLike) -> np.ndarray:
        p = np.asarray(probs)
        if p.ndim == 2 and p.shape[1] == 2:
            p = p[:, 1]
        elif p.ndim == 2 and p.shape[1] == 1:
            p = p.ravel()
        return (p >= threshold).astype(np.int32)

    return OptimizationResult(
        thresholds=np.array([threshold]),
        scores=np.array([expected_utility]),
        predict=predict_binary,
        task=Task.BINARY,
        metric="expected_utility",
        n_classes=2,
    )
[docs] def optimize_metric_binary( true_labels: ArrayLike, pred_proba: ArrayLike, *, metric: str = "f1", method: str = "auto", sample_weight: ArrayLike | None = None, comparison: str = ">", tolerance: float = 1e-10, ) -> OptimizationResult: """General binary metric optimization with automatic method selection. Automatically selects the best optimization algorithm based on metric properties and data characteristics. Parameters ---------- true_labels True binary labels in {0, 1}. Shape: (n_samples,) pred_proba Predicted probabilities for positive class in [0, 1]. Shape: (n_samples,) metric Metric to optimize ("f1", "precision", "recall", "accuracy", etc.) method Optimization method: - "auto": Automatically select best method - "sort_scan": O(n log n) sort-and-scan (exact for piecewise metrics) - "minimize": Scipy optimization - "gradient": Simple gradient ascent sample_weight Sample weights. Shape: (n_samples,) comparison Comparison operator for threshold. Must be ">" or ">=" tolerance Numerical tolerance for optimization Returns ------- OptimizationResult Result with optimal threshold, metric score, and predict function Raises ------ ValueError If method is unknown or not supported. 
Examples -------- >>> result = optimize_metric_binary(y_true, y_prob, metric="precision") >>> result = optimize_metric_binary(y_true, y_prob, metric="f1", method="sort_scan") """ from .metrics_core import is_piecewise_metric from .optimize import optimize_gradient, optimize_scipy from .piecewise import optimal_threshold_sortscan # Validate inputs true_labels, pred_proba, sample_weight = validate_binary_classification( true_labels, pred_proba, sample_weight, require_proba=True ) # Method selection if method == "auto": method = "sort_scan" if is_piecewise_metric(metric) else "minimize" # Route to appropriate optimizer match method: case "sort_scan": result = optimal_threshold_sortscan( true_labels, pred_proba, metric=metric, sample_weight=sample_weight, inclusive=(comparison == ">="), require_proba=True, tolerance=tolerance, ) case "minimize": result = optimize_scipy( true_labels, pred_proba, metric, sample_weight, comparison, tol=tolerance, ) case "gradient": result = optimize_gradient( true_labels, pred_proba, metric, sample_weight, comparison, tol=tolerance, ) case _: raise ValueError(f"Unknown method: {method}") def predict_binary(probs: ArrayLike) -> np.ndarray: p = np.asarray(probs) if p.ndim == 2 and p.shape[1] == 2: p = p[:, 1] elif p.ndim == 2 and p.shape[1] == 1: p = p.ravel() if comparison == ">=": return (p >= result.thresholds[0]).astype(np.int32) else: return (p > result.thresholds[0]).astype(np.int32) return OptimizationResult( thresholds=result.thresholds, scores=result.scores, predict=predict_binary, task=Task.BINARY, metric=metric, n_classes=2, )
# Public API of this module.
__all__ = [
    "optimize_f1_binary",
    "optimize_utility_binary",
    "optimize_metric_binary",
]