"""Binary classification threshold optimization.
This module implements threshold optimization for binary classification problems
where we have a single decision threshold τ and predict positive if p ≥ τ.
Key algorithms:
- optimize_f1_binary(): Sort-and-scan O(n log n) for F-measures
- optimize_utility_binary(): Closed-form O(1) for linear utilities
- optimize_metric_binary(): General metric optimization
All functions assume calibrated probabilities: E[y|p] = p
"""
from __future__ import annotations

import numpy as np
from numpy.typing import ArrayLike

from .core import OptimizationResult, Task
from .validation import validate_binary_classification
def optimize_f1_binary(
    true_labels: ArrayLike,
    pred_proba: ArrayLike,
    *,
    beta: float = 1.0,
    sample_weight: ArrayLike | None = None,
    comparison: str = ">",
) -> OptimizationResult:
"""Optimize F-beta score for binary classification using sort-and-scan.
Uses the O(n log n) sort-and-scan algorithm exploiting the piecewise
structure of F-beta metrics. This finds the exact optimal threshold.
Parameters
----------
true_labels
True binary labels in {0, 1}. Shape: (n_samples,)
pred_proba
Predicted probabilities for positive class in [0, 1]. Shape: (n_samples,)
beta
F-beta parameter. beta=1 gives F1 score
sample_weight
Sample weights. Shape: (n_samples,)
comparison
Comparison operator for threshold. Must be ">" or ">="
Returns
-------
OptimizationResult
Result with optimal threshold, F-beta score, and predict function
Examples
--------
>>> y_true = [0, 1, 1, 0, 1]
>>> y_prob = [0.2, 0.8, 0.7, 0.3, 0.9]
>>> result = optimize_f1_binary(y_true, y_prob)
>>> result.threshold
0.5
>>> result.score # F1 score at optimal threshold
0.8
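
    With beta=2, recall is weighted more heavily than precision; on this
    perfectly separable toy data any F-beta score is still 1.0:

    >>> result2 = optimize_f1_binary(y_true, y_prob, beta=2.0)
    >>> result2.score
    1.0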
"""
    # Import here to avoid circular imports
    from .piecewise import optimal_threshold_sortscan

    # Validate inputs
    true_labels, pred_proba, sample_weight = validate_binary_classification(
        true_labels, pred_proba, sample_weight, require_proba=True
    )

    # Resolve the metric name, registering an F-beta metric when beta != 1
    if beta == 1.0:
        metric_name = "f1"
    else:
        from .metrics_core import register_metric

        def fbeta_metric(tp, tn, fp, fn):
            # Vectorized F-beta from confusion counts:
            # F_beta = (1 + beta^2) * tp / ((1 + beta^2) * tp + beta^2 * fn + fp)
            tp, tn, fp, fn = (
                np.asarray(tp),
                np.asarray(tn),
                np.asarray(fp),
                np.asarray(fn),
            )
            denom = (1 + beta**2) * tp + beta**2 * fn + fp
            return np.where(denom > 0, (1 + beta**2) * tp / denom, 0.0)

        metric_name = f"f{beta}_score"
        register_metric(metric_name, fbeta_metric, is_piecewise=True, maximize=True)

    # Use sort-and-scan optimization
    result = optimal_threshold_sortscan(
        true_labels,
        pred_proba,
        metric=metric_name,
        sample_weight=sample_weight,
        inclusive=(comparison == ">="),
        require_proba=True,
        tolerance=1e-12,
    )
    # The underlying result already has a predict function, but we also accept
    # (n_samples, 2) and (n_samples, 1) probability arrays
    def predict_binary(probs: ArrayLike) -> np.ndarray:
        p = np.asarray(probs)
        if p.ndim == 2 and p.shape[1] == 2:
            p = p[:, 1]  # extract positive-class probabilities
        elif p.ndim == 2 and p.shape[1] == 1:
            p = p.ravel()
        return result.predict(p)
    return OptimizationResult(
        thresholds=result.thresholds,
        scores=result.scores,
        predict=predict_binary,
        task=Task.BINARY,
        metric=f"f{beta}_score" if beta != 1.0 else "f1_score",
        n_classes=2,
    )
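

# A minimal reference sketch of the sort-and-scan idea that
# optimize_f1_binary() delegates to piecewise.optimal_threshold_sortscan().
# Illustrative only: the helper name is hypothetical, and ties as well as
# the "predict nothing" cut are handled more carefully in the real path.
def _f1_sortscan_sketch(
    true_labels: ArrayLike, pred_proba: ArrayLike
) -> tuple[float, float]:
    """Return (threshold, f1) by scanning the n prefix cut points in O(n log n)."""
    y = np.asarray(true_labels)
    p = np.asarray(pred_proba, dtype=np.float64)
    order = np.argsort(-p)  # sort samples by descending probability
    y_sorted, p_sorted = y[order], p[order]
    # Predicting positive for the top k+1 samples gives cumulative counts:
    tp = np.cumsum(y_sorted)
    fp = np.cumsum(1 - y_sorted)
    fn = y.sum() - tp
    # F1 is piecewise constant in the threshold, so checking each prefix is
    # exact; np.maximum guards the all-negative corner case
    f1 = 2 * tp / np.maximum(2 * tp + fp + fn, 1)
    k = int(np.argmax(f1))
    # Place the threshold midway between the last included and first excluded score
    if k + 1 < len(p_sorted):
        threshold = 0.5 * (p_sorted[k] + p_sorted[k + 1])
    else:
        threshold = 0.5 * p_sorted[-1]
    return float(threshold), float(f1[k])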
def optimize_utility_binary(
    true_labels: ArrayLike | None,
    pred_proba: ArrayLike,
    *,
    utility: dict[str, float],
    sample_weight: ArrayLike | None = None,
) -> OptimizationResult:
"""Optimize binary classification using utility/cost specification.
Computes the Bayes-optimal threshold using the closed-form formula:
Ï„* = (u_tn - u_fp) / [(u_tp - u_fn) + (u_tn - u_fp)]
This is exact and runs in O(1) time.
Parameters
----------
true_labels
True binary labels. Can be None for pure Bayes optimization. Shape: (n_samples,)
pred_proba
Predicted probabilities for positive class in [0, 1]. Shape: (n_samples,)
utility
Utility specification with keys "tp", "tn", "fp", "fn"
sample_weight
Sample weights (affects expected utility computation). Shape: (n_samples,)
Returns
-------
OptimizationResult
Result with optimal threshold, expected utility, and predict function
Raises
------
ValueError
If probabilities are not in the range [0, 1] for utility optimization.
Examples
--------
>>> # FN costs 5x more than FP
>>> utility = {"tp": 10, "tn": 1, "fp": -1, "fn": -5}
>>> result = optimize_utility_binary(None, y_prob, utility=utility)
>>> result.threshold # Closed-form optimal
0.167
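
    The closed form can be checked by hand for this cost matrix: with
    u_tp = u_tn = 0, u_fp = -1 and u_fn = -5,

        τ* = (0 - (-1)) / [(0 - (-5)) + (0 - (-1))] = 1 / 6 ≈ 0.167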
"""
    from .bayes import BayesOptimal, UtilitySpec

    # Validate probabilities; accept (n,), (n, 1) and (n, 2) inputs
    pred_proba = np.asarray(pred_proba, dtype=np.float64)
    if pred_proba.ndim == 2 and pred_proba.shape[1] == 2:
        pred_proba = pred_proba[:, 1]  # extract positive class
    elif pred_proba.ndim == 2 and pred_proba.shape[1] == 1:
        pred_proba = pred_proba.ravel()
    if not np.all((pred_proba >= 0) & (pred_proba <= 1)):
        raise ValueError("Probabilities must be in [0, 1] for utility optimization")

    # Create the utility specification and the Bayes-optimal decision rule
    utility_spec = UtilitySpec.from_dict(utility)
    optimizer = BayesOptimal(utility_spec)

    # Compute the optimal threshold (closed form) and expected utility on this data
    threshold = optimizer.compute_threshold()
    expected_utility = optimizer.expected_utility(pred_proba)
    def predict_binary(probs: ArrayLike) -> np.ndarray:
        p = np.asarray(probs)
        if p.ndim == 2 and p.shape[1] == 2:
            p = p[:, 1]
        elif p.ndim == 2 and p.shape[1] == 1:
            p = p.ravel()
        return (p >= threshold).astype(np.int32)
    return OptimizationResult(
        thresholds=np.array([threshold]),
        scores=np.array([expected_utility]),
        predict=predict_binary,
        task=Task.BINARY,
        metric="expected_utility",
        n_classes=2,
    )
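

# A hand-checkable sketch of the closed form used by optimize_utility_binary().
# The helper name is hypothetical; the production path goes through
# bayes.BayesOptimal.compute_threshold().
def _bayes_threshold_sketch(utility: dict[str, float]) -> float:
    """Return τ* = (u_tn - u_fp) / [(u_tp - u_fn) + (u_tn - u_fp)]."""
    # Predict positive when the expected utility of the positive action wins:
    #   p * u_tp + (1 - p) * u_fp >= p * u_fn + (1 - p) * u_tn
    # Solving for p gives the threshold below.
    numerator = utility["tn"] - utility["fp"]
    denominator = (utility["tp"] - utility["fn"]) + numerator
    return numerator / denominator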
def optimize_metric_binary(
    true_labels: ArrayLike,
    pred_proba: ArrayLike,
    *,
    metric: str = "f1",
    method: str = "auto",
    sample_weight: ArrayLike | None = None,
    comparison: str = ">",
    tolerance: float = 1e-10,
) -> OptimizationResult:
"""General binary metric optimization with automatic method selection.
Automatically selects the best optimization algorithm based on metric
properties and data characteristics.
Parameters
----------
true_labels
True binary labels in {0, 1}. Shape: (n_samples,)
pred_proba
Predicted probabilities for positive class in [0, 1]. Shape: (n_samples,)
metric
Metric to optimize ("f1", "precision", "recall", "accuracy", etc.)
method
Optimization method:
- "auto": Automatically select best method
- "sort_scan": O(n log n) sort-and-scan (exact for piecewise metrics)
- "minimize": Scipy optimization
- "gradient": Simple gradient ascent
sample_weight
Sample weights. Shape: (n_samples,)
comparison
Comparison operator for threshold. Must be ">" or ">="
tolerance
Numerical tolerance for optimization
Returns
-------
OptimizationResult
Result with optimal threshold, metric score, and predict function
Raises
------
ValueError
If method is unknown or not supported.
Examples
--------
>>> result = optimize_metric_binary(y_true, y_prob, metric="precision")
>>> result = optimize_metric_binary(y_true, y_prob, metric="f1", method="sort_scan")
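
    Sample weights and an inclusive comparison can be combined with any
    method; y_true and y_prob are assumed defined as above:

    >>> weights = [1.0, 2.0, 1.0, 1.0, 0.5]
    >>> result = optimize_metric_binary(
    ...     y_true, y_prob, metric="accuracy", sample_weight=weights, comparison=">="
    ... )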
"""
    from .metrics_core import is_piecewise_metric
    from .optimize import optimize_gradient, optimize_scipy
    from .piecewise import optimal_threshold_sortscan

    # Validate inputs
    true_labels, pred_proba, sample_weight = validate_binary_classification(
        true_labels, pred_proba, sample_weight, require_proba=True
    )

    # Method selection: piecewise metrics get the exact sort-and-scan path
    if method == "auto":
        method = "sort_scan" if is_piecewise_metric(metric) else "minimize"

    # Route to the appropriate optimizer
    match method:
        case "sort_scan":
            result = optimal_threshold_sortscan(
                true_labels,
                pred_proba,
                metric=metric,
                sample_weight=sample_weight,
                inclusive=(comparison == ">="),
                require_proba=True,
                tolerance=tolerance,
            )
        case "minimize":
            result = optimize_scipy(
                true_labels,
                pred_proba,
                metric,
                sample_weight,
                comparison,
                tol=tolerance,
            )
        case "gradient":
            result = optimize_gradient(
                true_labels,
                pred_proba,
                metric,
                sample_weight,
                comparison,
                tol=tolerance,
            )
        case _:
            raise ValueError(f"Unknown method: {method}")
    def predict_binary(probs: ArrayLike) -> np.ndarray:
        p = np.asarray(probs)
        if p.ndim == 2 and p.shape[1] == 2:
            p = p[:, 1]
        elif p.ndim == 2 and p.shape[1] == 1:
            p = p.ravel()
        if comparison == ">=":
            return (p >= result.thresholds[0]).astype(np.int32)
        return (p > result.thresholds[0]).astype(np.int32)
    return OptimizationResult(
        thresholds=result.thresholds,
        scores=result.scores,
        predict=predict_binary,
        task=Task.BINARY,
        metric=metric,
        n_classes=2,
    )
__all__ = [
    "optimize_f1_binary",
    "optimize_utility_binary",
    "optimize_metric_binary",
]