Examples
========

This section provides comprehensive examples of using pyppur for different scenarios.

Basic Usage Examples
--------------------

Distance Distortion Example
~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. code-block:: python

    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.datasets import load_digits
    from sklearn.preprocessing import StandardScaler

    from pyppur import ProjectionPursuit, Objective

    # Load and prepare data
    digits = load_digits()
    X, y = digits.data, digits.target
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Distance distortion with nonlinearity (default)
    pp_nonlinear = ProjectionPursuit(
        n_components=2,
        objective=Objective.DISTANCE_DISTORTION,
        alpha=1.5,
        use_nonlinearity_in_distance=True,
        n_init=5,
        verbose=True
    )
    X_nl = pp_nonlinear.fit_transform(X_scaled)

    # Distance distortion without nonlinearity (linear)
    pp_linear = ProjectionPursuit(
        n_components=2,
        objective=Objective.DISTANCE_DISTORTION,
        alpha=1.5,
        use_nonlinearity_in_distance=False,
        n_init=5,
        verbose=True
    )
    X_linear = pp_linear.fit_transform(X_scaled)

    # Compare results
    print("Nonlinear distance distortion:", pp_nonlinear.distance_distortion(X_scaled))
    print("Linear distance distortion:", pp_linear.distance_distortion(X_scaled))

    # Plot results
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

    scatter1 = ax1.scatter(X_nl[:, 0], X_nl[:, 1], c=y, cmap='tab10', alpha=0.7)
    ax1.set_title('Distance Distortion (Nonlinear)')
    ax1.set_xlabel('Component 1')
    ax1.set_ylabel('Component 2')

    scatter2 = ax2.scatter(X_linear[:, 0], X_linear[:, 1], c=y, cmap='tab10', alpha=0.7)
    ax2.set_title('Distance Distortion (Linear)')
    ax2.set_xlabel('Component 1')
    ax2.set_ylabel('Component 2')

    plt.tight_layout()
    plt.show()

Reconstruction Examples
~~~~~~~~~~~~~~~~~~~~~~~

.. code-block:: python

    # Reconstruction with tied weights (default)
    pp_tied = ProjectionPursuit(
        n_components=3,
        objective=Objective.RECONSTRUCTION,
        alpha=1.0,
        tied_weights=True,
        n_init=3,
        verbose=True
    )
    X_tied = pp_tied.fit_transform(X_scaled)
    tied_error = pp_tied.reconstruction_error(X_scaled)

    # Reconstruction with untied weights
    pp_untied = ProjectionPursuit(
        n_components=3,
        objective=Objective.RECONSTRUCTION,
        alpha=1.0,
        tied_weights=False,
        l2_reg=0.01,
        n_init=3,
        verbose=True
    )
    X_untied = pp_untied.fit_transform(X_scaled)
    untied_error = pp_untied.reconstruction_error(X_scaled)

    print(f"Tied weights reconstruction error: {tied_error:.6f}")
    print(f"Untied weights reconstruction error: {untied_error:.6f}")
    print(f"Improvement: {((tied_error - untied_error) / tied_error * 100):.1f}%")

    # Access decoder weights
    print("Tied decoder weights:", pp_tied.decoder_weights_)  # None
    print("Untied decoder shape:", pp_untied.decoder_weights_.shape)

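The ``alpha`` argument used in both examples above sets the steepness of the ridge
function that pyppur applies to the projections. The sketch below is an illustration
only: it assumes a tanh-shaped ridge ``g(z) = tanh(alpha * z)``, which may not match
pyppur's exact ridge function. Plotting the curve for a few steepness values shows why
small ``alpha`` stays nearly linear while large ``alpha`` approaches a hard threshold.

.. code-block:: python

    import numpy as np
    import matplotlib.pyplot as plt

    # Illustration only: a tanh-style ridge g(z) = tanh(alpha * z).
    # pyppur's actual ridge function may differ; check the API reference.
    z = np.linspace(-3, 3, 200)
    plt.figure(figsize=(6, 4))
    for alpha in [0.5, 1.0, 1.5, 3.0]:
        plt.plot(z, np.tanh(alpha * z), label=f"alpha = {alpha}")
    plt.xlabel("Projection value z")
    plt.ylabel("g(z)")
    plt.title("Effect of alpha on a tanh-style ridge function")
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()
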
Advanced Examples
-----------------

Parameter Sensitivity Analysis
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. code-block:: python

    import pandas as pd
    from sklearn.datasets import make_swiss_roll

    # Generate Swiss roll data
    X_swiss, color = make_swiss_roll(n_samples=1000, noise=0.1, random_state=42)
    scaler = StandardScaler()
    X_swiss_scaled = scaler.fit_transform(X_swiss)

    # Test different alpha values
    alphas = [0.1, 0.5, 1.0, 2.0, 5.0]
    results = []

    for alpha in alphas:
        pp = ProjectionPursuit(
            n_components=2,
            objective=Objective.RECONSTRUCTION,
            alpha=alpha,
            tied_weights=False,
            l2_reg=0.01,
            max_iter=100,
            random_state=42
        )
        X_proj = pp.fit_transform(X_swiss_scaled)
        recon_error = pp.reconstruction_error(X_swiss_scaled)
        results.append({
            'alpha': alpha,
            'reconstruction_error': recon_error,
            'fit_time': pp.fit_time_
        })

    results_df = pd.DataFrame(results)
    print(results_df)

    # Plot reconstruction error vs alpha
    plt.figure(figsize=(8, 6))
    plt.plot(results_df['alpha'], results_df['reconstruction_error'], 'bo-')
    plt.xlabel('Alpha (Ridge Function Steepness)')
    plt.ylabel('Reconstruction Error')
    plt.title('Reconstruction Error vs Alpha Parameter')
    plt.xscale('log')
    plt.grid(True, alpha=0.3)
    plt.show()

Comparison with Other Methods
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. code-block:: python

    import time

    from sklearn.decomposition import PCA
    from sklearn.manifold import TSNE

    # Prepare data
    X_sample = X_scaled[:500]  # Use a subset for t-SNE speed
    y_sample = y[:500]

    methods = {}
    times = {}

    # PCA
    start_time = time.time()
    pca = PCA(n_components=2, random_state=42)
    X_pca = pca.fit_transform(X_sample)
    times['PCA'] = time.time() - start_time
    methods['PCA'] = X_pca

    # t-SNE
    start_time = time.time()
    tsne = TSNE(n_components=2, random_state=42, perplexity=30)
    X_tsne = tsne.fit_transform(X_sample)
    times['t-SNE'] = time.time() - start_time
    methods['t-SNE'] = X_tsne

    # pyppur (Distance Distortion)
    start_time = time.time()
    pp_dist = ProjectionPursuit(
        n_components=2,
        objective=Objective.DISTANCE_DISTORTION,
        alpha=1.5,
        n_init=3,
        random_state=42
    )
    X_pp_dist = pp_dist.fit_transform(X_sample)
    times['pyppur (Distance)'] = time.time() - start_time
    methods['pyppur (Distance)'] = X_pp_dist

    # pyppur (Reconstruction)
    start_time = time.time()
    pp_recon = ProjectionPursuit(
        n_components=2,
        objective=Objective.RECONSTRUCTION,
        tied_weights=False,
        alpha=1.0,
        n_init=3,
        random_state=42
    )
    X_pp_recon = pp_recon.fit_transform(X_sample)
    times['pyppur (Reconstruction)'] = time.time() - start_time
    methods['pyppur (Reconstruction)'] = X_pp_recon

    # Plot comparison
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    axes = axes.ravel()

    for i, (method_name, X_proj) in enumerate(methods.items()):
        scatter = axes[i].scatter(X_proj[:, 0], X_proj[:, 1],
                                  c=y_sample, cmap='tab10', alpha=0.7, s=20)
        axes[i].set_title(f'{method_name} (Time: {times[method_name]:.2f}s)')
        axes[i].set_xlabel('Component 1')
        axes[i].set_ylabel('Component 2')

    plt.tight_layout()
    plt.show()

    # Print timing comparison
    print("\nTiming Comparison:")
    for method, time_taken in times.items():
        print(f"{method}: {time_taken:.3f} seconds")

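The scatter plots and timings above are qualitative. As a quick quantitative follow-up,
scikit-learn's ``trustworthiness`` score (part of scikit-learn, not pyppur) measures how
well each two-dimensional embedding preserves the local neighborhoods of the original
data; the snippet below applies it to the ``methods`` dictionary built above.

.. code-block:: python

    from sklearn.manifold import trustworthiness

    # Trustworthiness lies in [0, 1]; higher means neighbors in the original
    # space tend to remain neighbors in the 2-D embedding.
    print("\nTrustworthiness (k=10):")
    for method_name, X_proj in methods.items():
        score = trustworthiness(X_sample, X_proj, n_neighbors=10)
        print(f"{method_name}: {score:.4f}")
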
Evaluation and Metrics
~~~~~~~~~~~~~~~~~~~~~~

.. code-block:: python

    from pyppur.utils.metrics import evaluate_embedding

    # Comprehensive evaluation
    pp = ProjectionPursuit(
        n_components=2,
        objective=Objective.DISTANCE_DISTORTION,
        alpha=1.5,
        n_init=5,
        random_state=42
    )
    X_proj = pp.fit_transform(X_scaled)

    # Built-in evaluation
    metrics = pp.evaluate(X_scaled, y, n_neighbors=10)
    print("Built-in evaluation:")
    for metric, value in metrics.items():
        print(f"  {metric}: {value:.4f}")

    # Manual evaluation using the utils module
    manual_metrics = evaluate_embedding(X_scaled, X_proj, y, n_neighbors=10)
    print("\nManual evaluation:")
    for metric, value in manual_metrics.items():
        print(f"  {metric}: {value:.4f}")

    # Additional metrics
    print("\nAdditional metrics:")
    print(f"Distance distortion: {pp.distance_distortion(X_scaled):.6f}")
    print(f"Reconstruction error: {pp.reconstruction_error(X_scaled):.6f}")
    print(f"Trustworthiness (k=5): {pp.compute_trustworthiness(X_scaled, 5):.4f}")
    print(f"Trustworthiness (k=15): {pp.compute_trustworthiness(X_scaled, 15):.4f}")

Working with Large Datasets
~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. code-block:: python

    # For large datasets, consider these strategies:

    # 1. Reduce the number of initializations and iterations
    pp_fast = ProjectionPursuit(
        n_components=2,
        objective=Objective.RECONSTRUCTION,
        n_init=1,     # Fewer initializations
        max_iter=50,  # Fewer iterations
        alpha=1.0
    )

    # 2. Prefer the reconstruction objective for large n: distance distortion
    #    needs O(n^2) memory for the pairwise distance matrices, whereas
    #    reconstruction needs O(n * k).

    # 3. For distance distortion with large n, consider subsampling
    #    (see the subsampling sketch at the end of this page).
    if X_scaled.shape[0] > 5000:
        print("Large dataset detected, using reconstruction objective")
        pp_large = ProjectionPursuit(
            n_components=2,
            objective=Objective.RECONSTRUCTION,
            tied_weights=True,  # Faster than untied
            alpha=1.0,
            n_init=1,
            max_iter=100
        )
    else:
        pp_large = ProjectionPursuit(
            n_components=2,
            objective=Objective.DISTANCE_DISTORTION,
            alpha=1.5,
            n_init=3
        )

    X_large_proj = pp_large.fit_transform(X_scaled)
    print(f"Processed {X_scaled.shape[0]} samples in {pp_large.fit_time_:.2f} seconds")

Custom Workflows
~~~~~~~~~~~~~~~~

.. code-block:: python

    # Custom preprocessing pipeline
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import RobustScaler

    pipeline = Pipeline([
        ('robust_scaler', RobustScaler()),  # More robust to outliers than StandardScaler
        ('projection_pursuit', ProjectionPursuit(
            n_components=2,
            objective=Objective.RECONSTRUCTION,
            tied_weights=False,
            l2_reg=0.05,
            alpha=1.2,
            n_init=5,
            verbose=True
        ))
    ])

    # Fit and transform
    X_pipeline = pipeline.fit_transform(X)

    # Access the fitted pyppur model
    pp_model = pipeline.named_steps['projection_pursuit']
    print(f"Final loss: {pp_model.best_loss_:.6f}")
    print(f"Optimization info: {pp_model.optimizer_info_}")

    # Visualize results
    plt.figure(figsize=(8, 6))
    scatter = plt.scatter(X_pipeline[:, 0], X_pipeline[:, 1], c=y, cmap='tab10', alpha=0.7)
    plt.colorbar(scatter)
    plt.title('pyppur with Robust Scaling Pipeline')
    plt.xlabel('Component 1')
    plt.ylabel('Component 2')
    plt.show()

Together, these examples cover both pyppur objectives, tied and untied decoder weights,
parameter tuning, comparison with PCA and t-SNE, evaluation metrics, and integration with
scikit-learn pipelines. The final sketch below illustrates the subsampling idea mentioned
above for very large datasets.

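Subsampling for Distance Distortion
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The distance distortion objective builds pairwise distance matrices, so memory grows
quadratically with the number of samples. One workaround is to fit on a random subsample
and then project the remaining points. The sketch below is an illustration rather than a
documented pyppur recipe: it assumes the fitted estimator exposes a scikit-learn-style
``transform`` method in addition to the ``fit_transform`` used throughout this page.

.. code-block:: python

    import numpy as np

    rng = np.random.default_rng(42)
    n_sub = min(2000, X_scaled.shape[0])
    subset_idx = rng.choice(X_scaled.shape[0], size=n_sub, replace=False)

    pp_sub = ProjectionPursuit(
        n_components=2,
        objective=Objective.DISTANCE_DISTORTION,
        alpha=1.5,
        n_init=2,
        random_state=42
    )

    # Fit on the subsample: distance matrices are n_sub x n_sub instead of n x n
    X_sub_proj = pp_sub.fit_transform(X_scaled[subset_idx])

    # Project the full dataset with the learned directions.
    # Assumption: a scikit-learn-style transform() is available; if your
    # pyppur version does not provide it, work with X_sub_proj only.
    X_all_proj = pp_sub.transform(X_scaled)
    print(X_all_proj.shape)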