Text Classification: News Category Adaptation with Rank Preservation

Problem: A news classification model trained on BBC editorial content needs deployment across different news platforms (e.g., social media, aggregators, international outlets) where article category distributions vary significantly.

Unique Value Proposition

This example demonstrates why rank-preserving calibration is essential for content management systems:

  • 📰 Content routing depends on relative topic confidence between articles

  • 🌍 Platform adaptation needs accurate category distributions

  • ⚠️ Standard calibration methods can scramble article rankings

  • ✅ Our method preserves rankings while adjusting category rates

We'll use real article text mapped onto BBC-style categories (the loader below uses 20 Newsgroups as a proxy, with a simulated fallback) and adapt the model to a social-media target distribution defined later in the notebook.

import re
import warnings
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import spearmanr
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    log_loss,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split
from sklearn.calibration import CalibratedClassifierCV
from sklearn.isotonic import IsotonicRegression

# Import our calibration package
from rank_preserving_calibration import calibrate_dykstra

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# Set style
plt.style.use('seaborn-v0_8')
sns.set_palette(["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd"])
np.random.seed(42)

print("πŸ“° NEWS CLASSIFICATION CALIBRATION WITH REAL DATA")
print("Focus: Cross-platform deployment with rank preservation")
πŸ“° NEWS CLASSIFICATION CALIBRATION WITH REAL DATA
Focus: Cross-platform deployment with rank preservation

Load BBC News Dataset

The loader below maps 20 Newsgroups articles onto BBC-style categories (world, tech, sport, health, politics); if that download fails, it falls back to a simulated dataset with the same category structure.

def load_bbc_news_data():
    """Load and preprocess BBC News dataset."""
    try:
        # Try to load from common sources
        from sklearn.datasets import fetch_20newsgroups
        
        # Use 20newsgroups as a proxy for BBC News with realistic categories
        categories = [
            'alt.atheism',           # World/Religion -> renamed as 'world'
            'comp.graphics',         # Technology
            'rec.sport.baseball',    # Sport
            'sci.med',              # Health
            'talk.politics.misc'     # Politics
        ]
        
        newsgroups = fetch_20newsgroups(
            subset='all',
            categories=categories,
            shuffle=True,
            random_state=42,
            remove=('headers', 'footers', 'quotes')
        )
        
        # Map to BBC-style categories
        category_mapping = {
            'alt.atheism': 'world',
            'comp.graphics': 'tech', 
            'rec.sport.baseball': 'sport',
            'sci.med': 'health',
            'talk.politics.misc': 'politics'
        }
        
        # Create dataframe
        df = pd.DataFrame({
            'text': newsgroups.data,
            'category_num': newsgroups.target,
            'category_name': [newsgroups.target_names[i] for i in newsgroups.target]
        })
        
        # Map to BBC categories
        df['category'] = df['category_name'].map(category_mapping)
        
        # Clean text data
        def clean_text(text):
            if pd.isna(text) or len(text.strip()) < 50:  # Remove very short texts
                return None
            text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
            return text.strip()
        
        df['cleaned_text'] = df['text'].apply(clean_text)
        df = df.dropna(subset=['cleaned_text'])
        
        # Create numeric category labels
        categories_list = ['world', 'tech', 'sport', 'health', 'politics']
        df['category_id'] = df['category'].map({cat: i for i, cat in enumerate(categories_list)})
        
        return df, categories_list
        
    except Exception as e:
        print(f"Fallback: Creating simulated BBC News dataset... ({e})")
        
        # Create realistic simulation
        from sklearn.datasets import make_classification
        
        X, y = make_classification(
            n_samples=2000,
            n_features=100,
            n_informative=50,
            n_redundant=20,
            n_classes=5,
            n_clusters_per_class=1,
            class_sep=1.2,
            random_state=42
        )
        
        categories_list = ['world', 'tech', 'sport', 'health', 'politics']
        
        # Create synthetic text features (simulating TF-IDF)
        synthetic_texts = []
        for i in range(len(y)):
            category = categories_list[y[i]]
            # Create category-specific "text" based on features
            text = ("News article about " + category + " with features " + 
                   ", ".join([f"term_{j}_{X[i,j]:.2f}" for j in range(min(10, X.shape[1]))]))
            synthetic_texts.append(text)
        
        df = pd.DataFrame({
            'cleaned_text': synthetic_texts,
            'category': [categories_list[i] for i in y],
            'category_id': y
        })
        
        return df, categories_list

# Load the data
print("πŸ“Š LOADING BBC NEWS DATASET")
print("="*40)

df, categories = load_bbc_news_data()

print(f"Dataset shape: {df.shape}")
print(f"Categories: {categories}")
print(f"Average text length: {df['cleaned_text'].str.len().mean():.0f} characters")

# Show class distribution
class_counts = df['category'].value_counts()

print("\nBBC EDITORIAL DISTRIBUTION (original training):")
for category in categories:
    count = class_counts.get(category, 0)
    pct = count / len(df) * 100
    print(f"  {category.capitalize()}: {count} articles ({pct:.1f}%)")
📊 LOADING BBC NEWS DATASET
========================================
Dataset shape: (4299, 6)
Categories: ['world', 'tech', 'sport', 'health', 'politics']
Average text length: 1230 characters

BBC EDITORIAL DISTRIBUTION (original training):
  World: 761 articles (17.7%)
  Tech: 930 articles (21.6%)
  Sport: 919 articles (21.4%)
  Health: 948 articles (22.1%)
  Politics: 741 articles (17.2%)

Text Feature Extraction & Model Training

We'll extract TF-IDF features and train a news classification model.

# Text preprocessing and feature extraction
print("πŸ”§ FEATURE EXTRACTION & MODEL TRAINING")
print("="*45)

# Create TF-IDF features
vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95,
    lowercase=True
)

X = vectorizer.fit_transform(df['cleaned_text'])
y = df['category_id'].values

print(f"Feature matrix shape: {X.shape}")
print(f"Vocabulary size: {len(vectorizer.vocabulary_)}")
print(f"Sparsity: {(1 - X.nnz / X.size) * 100:.1f}%")

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

print(f"Training samples: {X_train.shape[0]}")
print(f"Test samples: {X_test.shape[0]}")

# Train logistic regression model
# Note: multi_class='multinomial' is now default for multiclass problems
model = LogisticRegression(
    random_state=42,
    max_iter=1000,
    solver='lbfgs',
    C=1.0
)

model.fit(X_train, y_train)

# Get predictions
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)

print("\nMODEL PERFORMANCE:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.3f}")
print(f"F1 Score (macro): {f1_score(y_test, y_pred, average='macro'):.3f}")

# Per-class AUC
auc_scores = []
print("\nPer-category AUC:")
for i, category in enumerate(categories):
    y_binary = (y_test == i).astype(int)
    if len(np.unique(y_binary)) > 1:
        auc = roc_auc_score(y_binary, y_proba[:, i])
        auc_scores.append(auc)
        print(f"  {category.capitalize()}: {auc:.3f}")

print(f"Mean AUC: {np.mean(auc_scores):.3f}")

# Current editorial distribution
editorial_marginals = np.mean(y_proba, axis=0)
print("\nBBC EDITORIAL PREDICTIONS (original training):")
for i, category in enumerate(categories):
    print(f"  {category.capitalize()}: {editorial_marginals[i]:.3f} ({editorial_marginals[i]*100:.1f}%)")
🔧 FEATURE EXTRACTION & MODEL TRAINING
=============================================
Feature matrix shape: (4299, 5000)
Vocabulary size: 5000
Sparsity: 0.0%
Training samples: 3009
Test samples: 1290
MODEL PERFORMANCE:
Accuracy: 0.874
F1 Score (macro): 0.870

Per-category AUC:
  World: 0.970
  Tech: 0.990
  Sport: 0.994
  Health: 0.979
  Politics: 0.975
Mean AUC: 0.982

BBC EDITORIAL PREDICTIONS (original training):
  World: 0.175 (17.5%)
  Tech: 0.214 (21.4%)
  Sport: 0.213 (21.3%)
  Health: 0.222 (22.2%)
  Politics: 0.175 (17.5%)

Target Platform Distribution

For social media deployment, we need different category distributions that reflect user engagement patterns.

print("🌍 SOCIAL MEDIA PLATFORM TARGET DISTRIBUTION")
print("="*50)

# Social media platform distribution (reflects higher engagement with certain topics)
platform_distribution = np.array([
    0.15,   # World: Lower (less viral)
    0.25,   # Tech: Higher (very shareable)
    0.35,   # Sport: Much higher (high engagement)
    0.15,   # Health: Moderate (niche but engaged)
    0.10    # Politics: Lower (often filtered/suppressed)
])

print("TARGET PLATFORM DISTRIBUTION (Social Media):")
for i, (category, target_pct) in enumerate(zip(categories, platform_distribution)):
    editorial_pct = editorial_marginals[i]
    change = target_pct - editorial_pct
    direction = "↑" if change > 0 else "↓" if change < 0 else "β†’"
    print(f"  {category.capitalize()}: {target_pct:.1%} (editorial: {editorial_pct:.1%}, change: {change:+.1%} {direction})")

# Calculate target marginals for calibration
n_test_samples = len(y_test)
target_marginals = platform_distribution * n_test_samples

print(f"\n🎯 CALIBRATION TARGETS:")
print(f"   Test samples: {n_test_samples}")
print(f"   Target marginals: {target_marginals.astype(int)}")
print(f"   Sum check: {np.sum(target_marginals):.1f} (should equal {n_test_samples})")

print("\n⚠️ WHY RANK PRESERVATION IS CRITICAL FOR NEWS:")
critical_reasons = [
    "Content routing: Which articles get homepage priority?",
    "Push notifications: Ranking by reader interest within category", 
    "Recommendation engines: Maintaining relative article quality",
    "Editorial workflow: Content editor assignment by expertise",
    "A/B testing: Fair comparison requires preserved rankings"
]

for reason in critical_reasons:
    print(f"   β€’ {reason}")
🌍 SOCIAL MEDIA PLATFORM TARGET DISTRIBUTION
==================================================
TARGET PLATFORM DISTRIBUTION (Social Media):
  World: 15.0% (editorial: 17.5%, change: -2.5% ↓)
  Tech: 25.0% (editorial: 21.4%, change: +3.6% ↑)
  Sport: 35.0% (editorial: 21.3%, change: +13.7% ↑)
  Health: 15.0% (editorial: 22.2%, change: -7.2% ↓)
  Politics: 10.0% (editorial: 17.5%, change: -7.5% ↓)

🎯 CALIBRATION TARGETS:
   Test samples: 1290
   Target marginals: [193 322 451 193 129]
   Sum check: 1290.0 (should equal 1290)

⚠️ WHY RANK PRESERVATION IS CRITICAL FOR NEWS:
   • Content routing: Which articles get homepage priority?
   • Push notifications: Ranking by reader interest within category
   • Recommendation engines: Maintaining relative article quality
   • Editorial workflow: Content editor assignment by expertise
   • A/B testing: Fair comparison requires preserved rankings

Baseline Calibration Methods

Let's compare rank-preserving calibration against standard methods.
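The baseline probabilities referenced below (y_proba_temp, y_proba_platt, y_proba_hist) come from notebook cells 1️⃣–3️⃣ that are not reproduced in this section. A minimal sketch of how such baselines can be produced is shown here; the temperature value, the one-vs-rest isotonic fit, and the 10-bin histogram scheme are illustrative assumptions, and for brevity they are fit on the test split where a production setup would use a held-out calibration split.

# Sketch of standard calibration baselines (assumed implementations for illustration).
# Caution: fit on the test split here only to keep the example self-contained.

# 1) Temperature scaling: divide log-probabilities by a scalar T and re-normalize.
T = 1.5  # illustrative; normally tuned on a validation set
logits = np.log(np.clip(y_proba, 1e-12, 1.0))
y_proba_temp = np.exp(logits / T)
y_proba_temp /= y_proba_temp.sum(axis=1, keepdims=True)

# 2) One-vs-rest isotonic regression per class, then row re-normalization.
y_proba_platt = np.zeros_like(y_proba)
for k in range(y_proba.shape[1]):
    iso = IsotonicRegression(out_of_bounds='clip')
    y_proba_platt[:, k] = iso.fit_transform(y_proba[:, k], (y_test == k).astype(float))
y_proba_platt = np.clip(y_proba_platt, 1e-12, 1.0)
y_proba_platt /= y_proba_platt.sum(axis=1, keepdims=True)

# 3) Histogram binning: replace each class probability by its bin's empirical accuracy.
def _histogram_bin(probs, labels, n_bins=10):
    calibrated = np.zeros_like(probs)
    edges = np.linspace(0.0, 1.0, n_bins + 1)
    for k in range(probs.shape[1]):
        bin_ids = np.digitize(probs[:, k], edges[1:-1])
        for b in range(n_bins):
            mask = bin_ids == b
            if mask.any():
                calibrated[mask, k] = (labels[mask] == k).mean()
    calibrated = np.clip(calibrated, 1e-12, 1.0)
    return calibrated / calibrated.sum(axis=1, keepdims=True)

y_proba_hist = _histogram_bin(y_proba, y_test)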

print("4️⃣ Rank-Preserving (Ours):")
result_ours = calibrate_dykstra(
    P=y_proba,
    M=target_marginals,
    max_iters=500,
    tol=1e-6,
    verbose=False
)
y_proba_ours = result_ours.Q

# Check if algorithm produced valid probabilities
has_negative = np.any(y_proba_ours < 0)
has_over_one = np.any(y_proba_ours > 1)
row_sums = np.sum(y_proba_ours, axis=1)
row_sums_ok = np.allclose(row_sums, 1.0, atol=1e-10)

# Only apply minimal fixes if absolutely necessary
if has_negative or has_over_one or not row_sums_ok:
    print(f"\n⚠️  WARNING: Algorithm produced invalid probabilities!")
    if has_negative:
        print(f"   β€’ Min probability: {np.min(y_proba_ours):.6f}")
    if has_over_one:
        print(f"   β€’ Max probability: {np.max(y_proba_ours):.6f}")
    if not row_sums_ok:
        print(f"   β€’ Row sum range: [{np.min(row_sums):.6f}, {np.max(row_sums):.6f}]")
    
    y_proba_ours_original = y_proba_ours.copy()
    y_proba_ours = np.clip(y_proba_ours, 1e-12, 1.0)
    y_proba_ours = y_proba_ours / np.sum(y_proba_ours, axis=1, keepdims=True)
    print("   β€’ Applied clipping and renormalization fix")

print(f"\nπŸ“Š ALGORITHM STATUS:")
print(f"   Converged: {result_ours.converged}")
print(f"   Iterations: {result_ours.iterations}")
print(f"   Max marginal error: {result_ours.max_col_error:.2e}")
if not result_ours.converged:
    print(f"   ⚠️  Algorithm failed to converge after {result_ours.iterations} iterations")

print(f"   Mean probability shift: {np.mean(np.abs(y_proba_ours - y_proba)):.3f}")
print(f"   Valid probabilities: {np.all(y_proba_ours >= 0) and np.all(y_proba_ours <= 1)}")
4️⃣ Rank-Preserving (Ours):
---------------------------------------------------------------------------
CalibrationError                          Traceback (most recent call last)
Cell In[5], line 2
      1 print("4️⃣ Rank-Preserving (Ours):")
----> 2 result_ours = calibrate_dykstra(
      3     P=y_proba,
      4     M=target_marginals,
      5     max_iters=500,
      6     tol=1e-6,
      7     verbose=False
      8 )
      9 y_proba_ours = result_ours.Q
     11 # Check if algorithm produced valid probabilities

File ~/work/rank-preserving-calibration/rank-preserving-calibration/rank_preserving_calibration/calibration.py:705, in calibrate_dykstra(P, M, max_iters, tol, rtol, feasibility_tol, verbose, callback, detect_cycles, cycle_window, nearly, ties, use_jit)
    703 # Fail fast on non-convergence instead of returning unreliable results
    704 if not converged:
--> 705     raise CalibrationError(
    706         f"Calibration failed to converge after {iteration} iterations. "
    707         f"Final change: {final_change:.2e} (tolerance: {tol:.2e}). "
    708         f"Max row error: {max_row_error:.2e}, max col error: {max_col_error:.2e}. "
    709         f"Try: increasing max_iters, relaxing tol, using nearly-isotonic constraints "
    710         f"(nearly={{'mode': 'epsilon', 'eps': 0.01}}), or consider temperature scaling."
    711     )
    713 return CalibrationResult(
    714     Q=Q,
    715     converged=converged,
   (...)    720     final_change=final_change,
    721 )

CalibrationError: Calibration failed to converge after 500 iterations. Final change: 5.52e-05 (tolerance: 1.00e-06). Max row error: 2.32e-02, max col error: 1.02e-12. Try: increasing max_iters, relaxing tol, using nearly-isotonic constraints (nearly={'mode': 'epsilon', 'eps': 0.01}), or consider temperature scaling.
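The error message above lists concrete remedies (more iterations, a looser tolerance, or the nearly-isotonic relaxation). The retry below is a sketch that follows those hints; the exact settings are assumptions, convergence still depends on the data, and the fallback simply keeps the original probabilities so the remaining cells can run and report the failure.

from types import SimpleNamespace

# CalibrationError lives in the module shown in the traceback above.
from rank_preserving_calibration.calibration import CalibrationError

try:
    result_ours = calibrate_dykstra(
        P=y_proba,
        M=target_marginals,
        max_iters=5000,                            # more iterations than the failed run
        tol=1e-4,                                  # looser tolerance (the run stalled near 5.5e-05)
        nearly={'mode': 'epsilon', 'eps': 0.01},   # relaxation suggested by the error message
        verbose=False
    )
    y_proba_ours = result_ours.Q
except CalibrationError as err:
    print(f"Calibration still failed: {err}")
    # Fallback so downstream cells run: keep the original probabilities and
    # record a non-converged result with the fields the later cells read.
    y_proba_ours = y_proba.copy()
    result_ours = SimpleNamespace(Q=y_proba_ours, converged=False,
                                  iterations=5000, max_col_error=float('nan'))

# Recompute the validity checks from the cell above so later cells have them defined.
has_negative = np.any(y_proba_ours < 0)
has_over_one = np.any(y_proba_ours > 1)
row_sums_ok = np.allclose(np.sum(y_proba_ours, axis=1), 1.0, atol=1e-10)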

Comprehensive Metrics Comparison

Let's evaluate all methods across multiple performance dimensions.
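The comparison cell below calls calculate_comprehensive_metrics, a helper defined earlier in the full notebook but not shown in this section. The sketch below is an assumed implementation that produces the columns the table expects (accuracy, auc_macro, ece, rank_corr, scrambled_articles, marginal_error); the 10-bin ECE, the 0.999 "scrambled" threshold, and measuring marginal error against platform_distribution are illustrative choices.

# Assumed implementation of the metrics helper used below; the notebook's original
# version may differ in binning and thresholds.
def calculate_comprehensive_metrics(y_true, proba_orig, proba_cal, method_name, n_bins=10):
    preds = np.argmax(proba_cal, axis=1)

    # Macro one-vs-rest AUC over classes present in y_true.
    aucs = []
    for k in range(proba_cal.shape[1]):
        y_bin = (y_true == k).astype(int)
        if len(np.unique(y_bin)) > 1:
            aucs.append(roc_auc_score(y_bin, proba_cal[:, k]))

    # Expected calibration error on the top-class confidence.
    conf = np.max(proba_cal, axis=1)
    correct = (preds == y_true).astype(float)
    edges = np.linspace(0.0, 1.0, n_bins + 1)
    ece = 0.0
    for b in range(n_bins):
        mask = (conf > edges[b]) & (conf <= edges[b + 1])
        if mask.any():
            ece += mask.mean() * abs(correct[mask].mean() - conf[mask].mean())

    # Per-article rank preservation between original and calibrated class scores.
    corrs = []
    for i in range(len(proba_orig)):
        corr, _ = spearmanr(proba_orig[i], proba_cal[i])
        if not np.isnan(corr):
            corrs.append(corr)
    corrs = np.array(corrs)
    rank_corr = float(np.mean(corrs)) if len(corrs) else 1.0
    scrambled_articles = int(np.sum(corrs < 0.999))  # assumed "scrambled" threshold

    # Max gap between achieved category marginals and the platform target.
    marginal_error = float(np.max(np.abs(proba_cal.mean(axis=0) - platform_distribution)))

    return {
        'method': method_name,
        'accuracy': accuracy_score(y_true, preds),
        'auc_macro': float(np.mean(aucs)),
        'ece': float(ece),
        'rank_corr': rank_corr,
        'scrambled_articles': scrambled_articles,
        'marginal_error': marginal_error,
    }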

print(\"πŸ“Š COMPREHENSIVE METHODS COMPARISON\")\nprint(\"=\"*60)\n\n# Calculate metrics for all methods\nresults = [\n    calculate_comprehensive_metrics(y_test, y_proba, y_proba, \"Original\"),\n    calculate_comprehensive_metrics(y_test, y_proba, y_proba_temp, \"Temperature Scale\"),\n    calculate_comprehensive_metrics(y_test, y_proba, y_proba_platt, \"Platt/Isotonic\"),\n    calculate_comprehensive_metrics(y_test, y_proba, y_proba_hist, \"Histogram Bin\"),\n    calculate_comprehensive_metrics(y_test, y_proba, y_proba_ours, \"Rank-Preserving\")\n]\n\n# Create comparison DataFrame\ndf_results = pd.DataFrame(results)\n\nprint(f\"{'Method':<16} {'Accuracy':<8} {'AUC':<6} {'ECE':<6} {'RankCorr':<8} {'Scrambled':<9} {'MargErr':<8}\")\nprint(\"-\" * 75)\n\nfor _, row in df_results.iterrows():\n    print(f\"{row['method']:<16} {row['accuracy']:<8.3f} {row['auc_macro']:<6.3f} {row['ece']:<6.3f} \"\n          f\"{row['rank_corr']:<8.4f} {row['scrambled_articles']:<9} {row['marginal_error']:<8.3f}\")\n\n# Performance assessment\nrank_preserving_row = df_results[df_results['method'] == 'Rank-Preserving'].iloc[0]\ntemperature_row = df_results[df_results['method'] == 'Temperature Scale'].iloc[0]\n\nprint(\"\\n🎯 PERFORMANCE ANALYSIS:\")\nif rank_preserving_row['rank_corr'] < 0.95:\n    print(f\"   ❌ Poor rank preservation: {rank_preserving_row['rank_corr']:.3f} (target: >0.95)\")\nelif rank_preserving_row['rank_corr'] < 0.99:\n    print(f\"   ⚠️  Moderate rank preservation: {rank_preserving_row['rank_corr']:.3f}\")\nelse:\n    print(f\"   βœ… Good rank preservation: {rank_preserving_row['rank_corr']:.3f}\")\n\naccuracy_change = rank_preserving_row['accuracy'] - df_results[df_results['method'] == 'Original'].iloc[0]['accuracy']\nif accuracy_change < -0.02:\n    print(f\"   ❌ Significant accuracy degradation: {accuracy_change:+.3f}\")\nelif accuracy_change < -0.005:\n    print(f\"   ⚠️  Minor accuracy decrease: {accuracy_change:+.3f}\")\nelse:\n    print(f\"   βœ… Accuracy maintained: {accuracy_change:+.3f}\")\n\nif not result_ours.converged:\n    print(f\"   ❌ Algorithm failed to converge after {result_ours.iterations} iterations\")\nelse:\n    print(f\"   βœ… Algorithm converged in {result_ours.iterations} iterations\")\n\nif rank_preserving_row['marginal_error'] < 0.005:\n    print(f\"   βœ… Excellent marginal accuracy: {rank_preserving_row['marginal_error']:.3f}\")\nelse:\n    print(f\"   ⚠️  Marginal error: {rank_preserving_row['marginal_error']:.3f}\")\n\nprint(\"\\nπŸ” COMPARISON WITH STANDARD METHODS:\")\nprint(f\"β€’ Rank-preserving vs Temperature Scaling:\")\nprint(f\"  - Rank correlation: {rank_preserving_row['rank_corr']:.3f} vs {temperature_row['rank_corr']:.3f}\")\nprint(f\"  - Accuracy: {rank_preserving_row['accuracy']:.3f} vs {temperature_row['accuracy']:.3f}\")\nprint(f\"  - Scrambled articles: {rank_preserving_row['scrambled_articles']} vs {temperature_row['scrambled_articles']}\")\nprint(f\"β€’ Target distribution achieved: Max error {rank_preserving_row['marginal_error']:.4f}\")"

Content Routing Impact Analysis

Let's analyze how ranking changes affect real content management decisions.

def analyze_content_routing_impact(y_proba_orig, y_proba_cal, method_name, confidence_threshold=0.7):
    """Analyze impact on high-confidence content routing decisions."""
    
    # Find articles with high confidence for any category
    orig_max_conf = np.max(y_proba_orig, axis=1)
    cal_max_conf = np.max(y_proba_cal, axis=1)
    
    # High confidence articles
    orig_high_conf = orig_max_conf > confidence_threshold
    cal_high_conf = cal_max_conf > confidence_threshold
    
    # Category assignments for high confidence articles
    orig_categories = np.argmax(y_proba_orig, axis=1)
    cal_categories = np.argmax(y_proba_cal, axis=1)
    
    # Routing changes
    confidence_changes = np.sum(orig_high_conf != cal_high_conf)
    category_changes = np.sum((orig_categories != cal_categories) & (orig_high_conf | cal_high_conf))
    
    # Ranking stability among high-confidence articles
    high_conf_mask = orig_high_conf | cal_high_conf
    if np.sum(high_conf_mask) > 1:
        # Spearman correlation between original and calibrated probability vectors for each high-confidence article
        rank_correlations = []
        for i in np.where(high_conf_mask)[0]:
            corr, _ = spearmanr(y_proba_orig[i], y_proba_cal[i])
            if not np.isnan(corr):
                rank_correlations.append(corr)
        
        mean_rank_corr = np.mean(rank_correlations) if rank_correlations else 1.0
    else:
        mean_rank_corr = 1.0
    
    return {
        'method': method_name,
        'orig_high_conf': np.sum(orig_high_conf),
        'cal_high_conf': np.sum(cal_high_conf),
        'confidence_changes': confidence_changes,
        'category_changes': category_changes,
        'ranking_corr': mean_rank_corr,
        'total_articles': len(y_proba_orig)
    }

print("πŸ“° CONTENT ROUTING IMPACT ANALYSIS")
print("="*45)
print("Scenario: High-confidence articles for homepage and push notifications")
print(f"Confidence threshold: >70% probability for any category")

# Analyze routing impact for each method
routing_results = [
    analyze_content_routing_impact(y_proba, y_proba, "Original"),
    analyze_content_routing_impact(y_proba, y_proba_temp, "Temperature Scale"),
    analyze_content_routing_impact(y_proba, y_proba_platt, "Platt/Isotonic"),
    analyze_content_routing_impact(y_proba, y_proba_hist, "Histogram Bin"),
    analyze_content_routing_impact(y_proba, y_proba_ours, "Rank-Preserving")
]

df_routing = pd.DataFrame(routing_results)

print(f"\n{'Method':<16} {'HighConf':<8} {'ConfChg':<7} {'CatChg':<6} {'RankCorr':<8}")
print("-" * 50)

for _, row in df_routing.iterrows():
    print(f"{row['method']:<16} {row['cal_high_conf']:<8} {row['confidence_changes']:<7} "
          f"{row['category_changes']:<6} {row['ranking_corr']:<8.3f}")

print("\nπŸ’‘ CONTENT MANAGEMENT IMPLICATIONS:")

# Highlight key differences
temp_cat_changes = df_routing.loc[1, 'category_changes']
ours_cat_changes = df_routing.loc[4, 'category_changes']

print(f"β€’ Temperature Scaling changed category assignments for {temp_cat_changes} high-confidence articles")
print(f"β€’ Rank-Preserving changed category assignments for {ours_cat_changes} high-confidence articles")
print(f"β€’ Ranking correlation for high-confidence content: Ours={df_routing.loc[4, 'ranking_corr']:.3f} vs Temp={df_routing.loc[1, 'ranking_corr']:.3f}")

print("\n⚠️ BUSINESS RISKS OF POOR RANK PRESERVATION:")
risks = [
    "Article A is more newsworthy than B, but B gets homepage placement",
    "Push notification priority based on scrambled relevance scores",
    "Editorial desk assignment using unreliable category confidence",
    "A/B testing with biased article rankings",
    "Recommendation system serving lower-quality content first"
]

for risk in risks:
    print(f"   β€’ {risk}")

# Show target distribution achievement
print("\nπŸ“Š TARGET DISTRIBUTION ACCURACY:")
achieved_dist = np.mean(y_proba_ours, axis=0)
for i, category in enumerate(categories):
    target_pct = platform_distribution[i]
    achieved_pct = achieved_dist[i]
    error = abs(target_pct - achieved_pct)
    print(f"  {category.capitalize()}: Target={target_pct:.1%}, Achieved={achieved_pct:.1%}, Error={error:.3%}")

Visualization: Platform Adaptation Impact

# Create comprehensive visualization
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('News Classification: Cross-Platform Adaptation Analysis', fontsize=16, y=0.98)

category_colors = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd"]

# 1. Category distribution comparison
x_pos = np.arange(len(categories))
width = 0.2

orig_dist = np.mean(y_proba, axis=0)
temp_dist = np.mean(y_proba_temp, axis=0)
ours_dist = np.mean(y_proba_ours, axis=0)

axes[0, 0].bar(x_pos - width, orig_dist, width, label='Original BBC', alpha=0.8)
axes[0, 0].bar(x_pos, temp_dist, width, label='Temperature Scale', alpha=0.8)
axes[0, 0].bar(x_pos + width, ours_dist, width, label='Rank-Preserving', alpha=0.8)

# Add target line
axes[0, 0].scatter(x_pos, platform_distribution, color='red', s=80, marker='*', 
                  label='Social Media Target', zorder=5)

axes[0, 0].set_xlabel('News Category')
axes[0, 0].set_ylabel('Probability')
axes[0, 0].set_title('Category Distribution Adaptation')
axes[0, 0].set_xticks(x_pos)
axes[0, 0].set_xticklabels([cat.title() for cat in categories], rotation=45)
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# 2. Rank preservation quality
methods = ['Temp Scale', 'Platt/Iso', 'Histogram', 'Rank-Preserving']
method_probas = [y_proba_temp, y_proba_platt, y_proba_hist, y_proba_ours]
colors = ['orange', 'green', 'blue', 'red']

for method, proba, color in zip(methods, method_probas, colors):
    rank_corrs = []
    for i in range(len(y_proba)):
        corr, _ = spearmanr(y_proba[i], proba[i])
        if not np.isnan(corr):
            rank_corrs.append(corr)
    
    axes[0, 1].hist(rank_corrs, bins=20, alpha=0.6, label=method, color=color, density=True)

axes[0, 1].axvline(1.0, color='black', linestyle='--', alpha=0.7, label='Perfect Preservation')
axes[0, 1].set_xlabel('Spearman Rank Correlation')
axes[0, 1].set_ylabel('Density')
axes[0, 1].set_title('Article Rank Preservation Distribution')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# 3. Performance metrics radar-style comparison
metrics_names = ['Accuracy', 'AUC', 'Rank Corr', 'Cal Quality']
temp_metrics = [df_results.loc[1, 'accuracy'], df_results.loc[1, 'auc_macro'], 
               df_results.loc[1, 'rank_corr'], 1-df_results.loc[1, 'ece']]  # 1-ECE for "quality"
ours_metrics = [df_results.loc[4, 'accuracy'], df_results.loc[4, 'auc_macro'],
               df_results.loc[4, 'rank_corr'], 1-df_results.loc[4, 'ece']]

x_met = np.arange(len(metrics_names))
axes[1, 0].bar(x_met - 0.2, temp_metrics, 0.4, label='Temperature Scale', alpha=0.8, color='orange')
axes[1, 0].bar(x_met + 0.2, ours_metrics, 0.4, label='Rank-Preserving', alpha=0.8, color='red')
axes[1, 0].set_ylabel('Score')
axes[1, 0].set_title('Performance Metrics Comparison')
axes[1, 0].set_xticks(x_met)
axes[1, 0].set_xticklabels(metrics_names, rotation=45)
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)

# 4. Content routing impact
routing_methods = df_routing['method'].values
category_changes = df_routing['category_changes'].values

bars = axes[1, 1].bar(range(len(routing_methods)), category_changes, 
                     alpha=0.8, color=['gray', 'orange', 'green', 'blue', 'red'])
axes[1, 1].set_ylabel('High-Confidence Articles\nwith Category Changes')
axes[1, 1].set_title('Impact on Content Routing Decisions')
axes[1, 1].set_xticks(range(len(routing_methods)))
axes[1, 1].set_xticklabels([m.split()[0] if len(m.split()) > 1 else m for m in routing_methods], rotation=45)
axes[1, 1].grid(True, alpha=0.3)

# Highlight the best method
bars[-1].set_edgecolor('black')
bars[-1].set_linewidth(2)

plt.tight_layout()
plt.show()

print(f"\nπŸ† SUMMARY: RANK-PRESERVING NEWS CALIBRATION")
print("="*55)
print(f"βœ… Rank Correlation: {df_results.loc[4, 'rank_corr']:.4f} (vs {df_results.loc[1, 'rank_corr']:.4f} for Temperature Scaling)")
print(f"βœ… Articles with Scrambled Rankings: {df_results.loc[4, 'scrambled_articles']} (vs {df_results.loc[1, 'scrambled_articles']} for Temperature Scaling)")
print(f"βœ… Target Distribution Error: {df_results.loc[4, 'marginal_error']:.4f} (lower is better)")
print(f"βœ… AUC Preservation: {df_results.loc[4, 'auc_macro']:.3f} (vs original {df_results.loc[0, 'auc_macro']:.3f})")
print(f"βœ… Content Routing Stability: {df_routing.loc[4, 'category_changes']} changed (vs {df_routing.loc[1, 'category_changes']} for Temperature)")

Business Impact Summary

print(\"BUSINESS IMPACT SUMMARY: News Platform Adaptation\")\nprint(\"=\"*65)\n\nprint(\"\\n🎯 DEPLOYMENT SCENARIO:\")\nprint(\"   BBC editorial model adapted for social media platform\")\nprint(f\"   Target: {platform_distribution[2]:.0%} Sport, {platform_distribution[1]:.0%} Tech (vs editorial)\")\nprint(\"   Critical: Maintain article quality rankings within categories\")\n\nprint(\"\\nπŸ“Š CALIBRATION ACHIEVEMENT:\")\nachieved_dist = np.mean(y_proba_ours, axis=0)\neditorial_dist = np.mean(y_proba, axis=0)\nprint(\"   Target distribution performance:\")\nfor i, category in enumerate(categories):\n    target = platform_distribution[i]\n    achieved = achieved_dist[i]\n    error = abs(target - achieved)\n    status = \"βœ…\" if error < 0.01 else \"⚠️\" if error < 0.05 else \"❌\"\n    print(f\"     {category.title()}: {editorial_dist[i]:.3f} β†’ {achieved:.3f} (target: {target:.3f}) {status}\")\n\n# Overall assessment\nrank_preserving_row = df_results[df_results['method'] == 'Rank-Preserving'].iloc[0]\naccuracy_change = rank_preserving_row['accuracy'] - df_results[df_results['method'] == 'Original'].iloc[0]['accuracy']\n\nprint(f\"\\n⚠️  ALGORITHM PERFORMANCE CONCERNS:\")\nif rank_preserving_row['rank_corr'] < 0.95:\n    print(f\"   ❌ Poor rank preservation: {rank_preserving_row['rank_corr']:.6f} (target: >0.95)\")\nif accuracy_change < -0.02:\n    print(f\"   ❌ Significant accuracy degradation: {accuracy_change:+.3f}\")\nif not result_ours.converged:\n    print(f\"   ❌ Algorithm convergence failure after {result_ours.iterations} iterations\")\nif has_negative or has_over_one or not row_sums_ok:\n    print(f\"   ⚠️  Required probability corrections due to algorithm issues\")\n\n# Business recommendation based on actual performance\nprint(\"\\nπŸš€ DEPLOYMENT RECOMMENDATION:\")\nif (rank_preserving_row['rank_corr'] < 0.95 or \n    accuracy_change < -0.02 or \n    not result_ours.converged):\n    print(\"   ❌ NOT RECOMMENDED for production deployment\")\n    print(\"   πŸ“ Significant performance issues detected\")\n    \n    print(\"\\nπŸ“‹ ISSUES IDENTIFIED:\")\n    if rank_preserving_row['rank_corr'] < 0.95:\n        print(f\"   β€’ Rank preservation severely compromised: {rank_preserving_row['rank_corr']:.3f}\")\n        print(f\"   β€’ Risk: Article quality rankings scrambled\")\n    if accuracy_change < -0.02:\n        print(f\"   β€’ Classification accuracy degraded by {-accuracy_change:.1%}\")\n        print(f\"   β€’ Risk: More misclassified content\")\n    if not result_ours.converged:\n        print(f\"   β€’ Algorithm instability: Failed to converge\")\n        print(f\"   β€’ Risk: Unpredictable behavior in production\")\n        \n    print(\"\\nπŸ’‘ ALTERNATIVE APPROACHES:\")\n    print(\"   1. Use Temperature Scaling for basic calibration\")\n    print(\"   2. Consider histogram binning for simple distribution adjustment\")\n    print(\"   3. Investigate algorithm parameter tuning\")\n    print(\"   4. Evaluate with different target distributions\")\n    print(\"   5. 
Consider ensemble methods for improved stability\")\nelse:\n    print(\"   βœ… Consider for production with careful monitoring\")\n    print(\"   πŸ“ Performance appears acceptable but validate thoroughly\")\n\n# Calculate engagement impact estimate\nsport_boost = (achieved_dist[2] - editorial_dist[2]) * 100  # Sport increase\ntech_boost = (achieved_dist[1] - editorial_dist[1]) * 100   # Tech increase\n\nprint(\"\\nπŸ“ˆ CALIBRATION IMPACT ANALYSIS:\")\nprint(f\"   β€’ Sport content adjusted by {sport_boost:+.1f}pp\")\nprint(f\"   β€’ Tech content adjusted by {tech_boost:+.1f}pp\")\nprint(f\"   β€’ Article ranking disruption: {rank_preserving_row['scrambled_articles']} articles significantly scrambled\")\nprint(f\"   β€’ Content routing changes: {df_routing.loc[4, 'category_changes']} high-confidence assignments modified\")\n\nprint(\"\\nπŸ“Š KEY PERFORMANCE METRICS:\")\nprint(f\"   β€’ Target distribution error: {rank_preserving_row['marginal_error']:.4f}\")\nprint(f\"   β€’ Rank preservation: {rank_preserving_row['rank_corr']:.6f}\")\nprint(f\"   β€’ Accuracy impact: {accuracy_change:+.4f}\")\nprint(f\"   β€’ Algorithm stability: {'Converged' if result_ours.converged else 'Failed to converge'}\")\n\nprint(\"\\nβœ… WHEN RANK-PRESERVING WORKS BEST:\")\nuse_cases = [\n    \"Algorithms that converge reliably with your data\",\n    \"Target distributions not too different from training\",\n    \"Applications where marginal constraints are the primary goal\",\n    \"Scenarios where some rank scrambling is acceptable\",\n    \"Use cases with extensive validation and fallback options\"\n]\n\nfor use_case in use_cases:\n    print(f\"   β€’ {use_case}\")\n\nprint(\"\\n⚠️  CRITICAL PRODUCTION CONSIDERATIONS:\")\nconsiderations = [\n    f\"Validate algorithm convergence on your specific data\",\n    f\"Monitor rank preservation quality in production\", \n    f\"Implement fallback to standard calibration methods\",\n    f\"A/B testing with careful content quality metrics\",\n    f\"Editorial team training on potential ranking changes\"\n]\n\nfor consideration in considerations:\n    print(f\"   β€’ {consideration}\")\n\n# Final assessment\nif (rank_preserving_row['rank_corr'] >= 0.95 and \n    accuracy_change >= -0.01 and \n    result_ours.converged):\n    print(\"\\nβœ… FINAL ASSESSMENT: SUITABLE FOR PRODUCTION TRIAL\")\nelse:\n    print(\"\\n❌ FINAL ASSESSMENT: NOT READY FOR PRODUCTION\")\n    print(\"   πŸ“ Requires significant improvements or alternative approaches\")"