"""
evaluation.py
-------------
Evaluation utilities for assessing both model performance and prediction stability.
This module provides functions to:
1. Measure prediction stability across multiple models (how consistent are predictions?)
2. Evaluate predictive performance across standard metrics (accuracy, RMSE, etc.)
These functions are designed to work with collections of fitted sklearn-compatible models
and are useful for comparing different tree algorithms, ensemble methods, or parameter settings.
"""
import numpy as np
from sklearn.metrics import (
accuracy_score,
roc_auc_score,
mean_squared_error,
mean_absolute_error,
r2_score,
)
from sklearn.preprocessing import label_binarize
# -------------------------------
# Prediction Stability (OOS)
# -------------------------------
def prediction_stability(
models: dict[str, object], X_oos: np.ndarray, task: str = "categorical"
) -> dict[str, float]:
"""
Measure how consistent model predictions are across models on the same out-of-sample (OOS) data.
This metric quantifies prediction stability by measuring how much models agree
with each other on the same out-of-sample data. Lower values indicate more
stable/consistent predictions.
Parameters
----------
models : dict[str, fitted_model]
Mapping of model name -> fitted model (must have .predict() method).
Requires at least 2 models.
X_oos : np.ndarray
Out-of-sample feature matrix to evaluate on.
task : {'categorical', 'continuous'}, default='categorical'
Type of prediction task.
Returns
-------
scores : dict[str, float]
Stability score for each model.
For 'categorical':
Average pairwise DISAGREEMENT rate per model (range: 0-1).
Lower is better (more stable). 0 = perfect agreement with all other models.
For 'continuous':
RMSE of each model's predictions vs the ensemble mean.
Lower is better (more stable). 0 = identical to ensemble mean.
Raises
------
ValueError
If fewer than 2 models provided, or if task is not 'categorical' or 'continuous'.
Examples
--------
    >>> from sklearn.datasets import make_classification
    >>> from sklearn.tree import DecisionTreeClassifier
    >>> from sklearn.model_selection import train_test_split
    >>> X, y = make_classification(n_samples=100, random_state=42)
>>> X_train, X_test, y_train, y_test = train_test_split(X, y)
>>> models = {
... 'tree1': DecisionTreeClassifier(random_state=1).fit(X_train, y_train),
... 'tree2': DecisionTreeClassifier(random_state=2).fit(X_train, y_train),
... }
>>> stability = prediction_stability(models, X_test, task='categorical')
>>> print(stability) # Lower values = more stable predictions
{'tree1': 0.15, 'tree2': 0.15}
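
    A continuous-task sketch along the same lines (the regressor setup here is
    purely illustrative, not part of this module):

    >>> from sklearn.datasets import make_regression
    >>> from sklearn.tree import DecisionTreeRegressor
    >>> Xr, yr = make_regression(n_samples=100, random_state=0)
    >>> reg_models = {
    ...     'depth3': DecisionTreeRegressor(max_depth=3, random_state=0).fit(Xr, yr),
    ...     'depth5': DecisionTreeRegressor(max_depth=5, random_state=0).fit(Xr, yr),
    ... }
    >>> reg_stability = prediction_stability(reg_models, Xr, task='continuous')
    >>> sorted(reg_stability) == ['depth3', 'depth5']
    True
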
Notes
-----
    - Stability is measured relative to the other models in the collection
    - For categorical tasks, the score is the average pairwise disagreement rate
    - For continuous tasks, the score is the RMSE to the ensemble-mean prediction
    - This metric is complementary to predictive accuracy: a model can be
      accurate but unstable, or stable but inaccurate
"""
names = list(models.keys())
K = len(names)
if K < 2:
raise ValueError("Need at least 2 models to assess stability.")
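    # For model k among K models evaluated on n OOS samples:
    #   categorical: score_k = (1 / (K - 1)) * sum_{j != k} mean_i[ preds[i, k] != preds[i, j] ]
    #   continuous:  score_k = sqrt( mean_i[ (mean_pred[i] - preds[i, k])^2 ] )
    # where mean_pred[i] is the across-model average prediction for sample i.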
# --- CATEGORICAL: pairwise disagreement (1 - agreement rate) ---
if task == "categorical":
preds = np.column_stack([models[n].predict(X_oos) for n in names]) # (n, K)
# Ensure numeric label space for comparisons
if not np.issubdtype(preds.dtype, np.number):
# Map labels to integers consistently
unique, inv = np.unique(preds, return_inverse=True)
preds = inv.reshape(preds.shape)
# Compute pairwise agreement matrix A[k,j] = mean(pred_k == pred_j)
agree = np.ones((K, K), dtype=float)
for k in range(K):
for j in range(k + 1, K):
agreement_rate = float(np.mean(preds[:, k] == preds[:, j]))
agree[k, j] = agree[j, k] = agreement_rate
# Per-model disagreement = average disagreement over pairs involving the model
scores = {}
for k, name in enumerate(names):
# Exclude self-comparison
other_agreements = [agree[k, j] for j in range(K) if j != k]
avg_disagreement = float(np.mean([1.0 - a for a in other_agreements]))
scores[name] = avg_disagreement
return scores
# --- CONTINUOUS: RMSE to ensemble mean ---
elif task == "continuous":
preds = np.column_stack([models[n].predict(X_oos) for n in names]) # (n, K)
mean_pred = np.mean(preds, axis=1) # Ensemble mean per sample
scores = {}
for k, name in enumerate(names):
deviation = mean_pred - preds[:, k]
rmse = float(np.sqrt(np.mean(np.square(deviation))))
scores[name] = rmse # Lower = more stable
return scores
else:
raise ValueError("task must be 'categorical' or 'continuous'.")
# -------------------------------
# Model Performance Evaluation
# -------------------------------
def evaluate_models(
models: dict[str, object], X: np.ndarray, y: np.ndarray, task: str = "categorical"
) -> dict[str, dict[str, float]]:
"""
Evaluate predictive performance of multiple models using standard metrics.
Computes task-appropriate performance metrics for each model. For classification,
includes accuracy and AUC (if predict_proba available). For regression, includes
MAE, RMSE, and R².
Parameters
----------
models : dict[str, fitted_model]
Model name -> fitted model mapping. Models must have .predict() method.
X : np.ndarray
Feature matrix for evaluation.
y : np.ndarray
Ground-truth labels (classification) or targets (regression).
task : {'categorical', 'continuous'}, default='categorical'
Type of prediction task.
Returns
-------
metrics : dict[str, dict[str, float]]
Nested dictionary: {model_name: {metric_name: value}}
For 'categorical':
- 'acc': Classification accuracy (0-1)
- 'auc': ROC AUC score (0-1, if predict_proba available)
For binary: standard AUC
For multi-class: one-vs-rest macro AUC
For 'continuous':
- 'mae': Mean Absolute Error (lower is better)
- 'rmse': Root Mean Squared Error (lower is better)
- 'r2': R² coefficient of determination (-∞ to 1, higher is better)
Raises
------
ValueError
If task is not 'categorical' or 'continuous'.
Examples
--------
    >>> from sklearn.datasets import make_regression
    >>> from sklearn.tree import DecisionTreeRegressor
    >>> X, y = make_regression(n_samples=100, random_state=42)
>>> models = {
... 'shallow': DecisionTreeRegressor(max_depth=3, random_state=42).fit(X, y),
... 'deep': DecisionTreeRegressor(max_depth=10, random_state=42).fit(X, y),
... }
>>> performance = evaluate_models(models, X, y, task='continuous')
>>> print(performance['shallow'])
{'mae': 12.3, 'rmse': 15.7, 'r2': 0.85}
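
    A categorical sketch along the same lines (the classifier setup here is
    purely illustrative, not part of this module):

    >>> from sklearn.datasets import make_classification
    >>> from sklearn.linear_model import LogisticRegression
    >>> Xc, yc = make_classification(n_samples=100, random_state=0)
    >>> clf_models = {'logit': LogisticRegression(max_iter=1000).fit(Xc, yc)}
    >>> clf_metrics = evaluate_models(clf_models, Xc, yc, task='categorical')
    >>> set(clf_metrics['logit']) == {'acc', 'auc'}  # LogisticRegression has predict_proba
    True
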
Notes
-----
    - AUC is reported only when predict_proba is available and the computation succeeds
    - For multi-class classification, uses a one-vs-rest (macro) strategy for AUC
    - All metrics use standard sklearn implementations
    - Consider evaluating on a held-out test set to avoid optimistically biased estimates
"""
results: dict[str, dict[str, float]] = {}
if task == "categorical":
y_unique = np.unique(y)
is_binary = len(y_unique) == 2
for name, mdl in models.items():
y_hat = mdl.predict(X)
try:
acc = float(accuracy_score(y, y_hat))
except ValueError:
# Handle cases where predictions contain NaN or invalid values
acc = np.nan
entry = {"acc": acc}
# Compute AUC if model supports probability predictions
if hasattr(mdl, "predict_proba"):
try:
proba = mdl.predict_proba(X)
if is_binary:
auc = float(roc_auc_score(y, proba[:, 1]))
else:
# One-vs-rest macro AUC for multi-class
Yb = label_binarize(y, classes=y_unique)
auc = float(roc_auc_score(Yb, proba, average="macro", multi_class="ovr"))
entry["auc"] = auc
except Exception:
# Silently skip AUC if computation fails (e.g., single class in y)
pass
results[name] = entry
elif task == "continuous":
for name, mdl in models.items():
y_pred = mdl.predict(X)
try:
mae = float(mean_absolute_error(y, y_pred))
rmse = float(np.sqrt(mean_squared_error(y, y_pred)))
r2 = float(r2_score(y, y_pred))
except ValueError:
# Handle cases where predictions contain NaN or invalid values
mae = np.nan
rmse = np.nan
r2 = np.nan
results[name] = {"mae": mae, "rmse": rmse, "r2": r2}
else:
raise ValueError("task must be 'categorical' or 'continuous'.")
return results
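

# -------------------------------
# Example usage (smoke test)
# -------------------------------
# A minimal, illustrative driver, assuming scikit-learn is installed. The model
# names and synthetic data below are arbitrary choices, not part of this module's API.
if __name__ == "__main__":
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split
    from sklearn.tree import DecisionTreeClassifier

    # Synthetic binary classification problem with a held-out OOS split.
    X, y = make_classification(n_samples=500, n_features=20, random_state=0)
    X_train, X_oos, y_train, y_oos = train_test_split(X, y, random_state=0)

    demo_models = {
        "tree": DecisionTreeClassifier(random_state=0).fit(X_train, y_train),
        "forest": RandomForestClassifier(n_estimators=50, random_state=0).fit(X_train, y_train),
    }

    # How much the two models disagree with each other on the OOS data.
    print("stability:", prediction_stability(demo_models, X_oos, task="categorical"))
    # Standard accuracy/AUC for each model on the same OOS data.
    print("performance:", evaluate_models(demo_models, X_oos, y_oos, task="categorical"))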