Examples

This section provides comprehensive examples of using pyppur for different scenarios.

Basic Usage Examples

Distance Distortion Example

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.preprocessing import StandardScaler
from pyppur import ProjectionPursuit, Objective

# Load the digits dataset and standardize every feature to zero mean / unit variance.
digits = load_digits()
X, y = digits.data, digits.target
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Distance distortion with the ridge nonlinearity enabled (the default behavior).
pp_nonlinear = ProjectionPursuit(
    n_components=2,
    objective=Objective.DISTANCE_DISTORTION,
    alpha=1.5,
    use_nonlinearity_in_distance=True,
    n_init=5,
    verbose=True,
)
X_nl = pp_nonlinear.fit_transform(X_scaled)

# Same objective, but purely linear, for a side-by-side comparison.
pp_linear = ProjectionPursuit(
    n_components=2,
    objective=Objective.DISTANCE_DISTORTION,
    alpha=1.5,
    use_nonlinearity_in_distance=False,
    n_init=5,
    verbose=True,
)
X_linear = pp_linear.fit_transform(X_scaled)

# Report the distortion achieved by each variant.
print("Nonlinear distance distortion:", pp_nonlinear.distance_distortion(X_scaled))
print("Linear distance distortion:", pp_linear.distance_distortion(X_scaled))

# Plot both embeddings side by side, colored by digit label.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
for axis, embedding, title in (
    (ax1, X_nl, 'Distance Distortion (Nonlinear)'),
    (ax2, X_linear, 'Distance Distortion (Linear)'),
):
    axis.scatter(embedding[:, 0], embedding[:, 1], c=y, cmap='tab10', alpha=0.7)
    axis.set_title(title)
    axis.set_xlabel('Component 1')
    axis.set_ylabel('Component 2')

plt.tight_layout()
plt.show()

Reconstruction Examples

# Reconstruction objective with tied weights (the default configuration):
# the decoder reuses the encoder's projection matrix.
model_tied = ProjectionPursuit(
    n_components=3,
    objective=Objective.RECONSTRUCTION,
    alpha=1.0,
    tied_weights=True,
    n_init=3,
    verbose=True,
)
embedding_tied = model_tied.fit_transform(X_scaled)
err_tied = model_tied.reconstruction_error(X_scaled)

# Reconstruction with an independent (untied) decoder, lightly L2-regularized.
model_untied = ProjectionPursuit(
    n_components=3,
    objective=Objective.RECONSTRUCTION,
    alpha=1.0,
    tied_weights=False,
    l2_reg=0.01,
    n_init=3,
    verbose=True,
)
embedding_untied = model_untied.fit_transform(X_scaled)
err_untied = model_untied.reconstruction_error(X_scaled)

print(f"Tied weights reconstruction error: {err_tied:.6f}")
print(f"Untied weights reconstruction error: {err_untied:.6f}")
print(f"Improvement: {((err_tied - err_untied) / err_tied * 100):.1f}%")

# Tied models expose no separate decoder; untied models do.
print("Tied decoder weights:", model_tied.decoder_weights_)  # None
print("Untied decoder shape:", model_untied.decoder_weights_.shape)

Advanced Examples

Parameter Sensitivity Analysis

from sklearn.datasets import make_swiss_roll
import pandas as pd

# Generate a Swiss roll manifold and standardize it.
X_swiss, color = make_swiss_roll(n_samples=1000, noise=0.1, random_state=42)
scaler = StandardScaler()
X_swiss_scaled = scaler.fit_transform(X_swiss)

# Sweep the ridge-function steepness and record error plus fit time per setting.
records = []
for steepness in [0.1, 0.5, 1.0, 2.0, 5.0]:
    model = ProjectionPursuit(
        n_components=2,
        objective=Objective.RECONSTRUCTION,
        alpha=steepness,
        tied_weights=False,
        l2_reg=0.01,
        max_iter=100,
        random_state=42,
    )
    model.fit_transform(X_swiss_scaled)
    records.append({
        'alpha': steepness,
        'reconstruction_error': model.reconstruction_error(X_swiss_scaled),
        'fit_time': model.fit_time_,
    })

results_df = pd.DataFrame(records)
print(results_df)

# Reconstruction error as a function of alpha, on a log-scaled x axis.
plt.figure(figsize=(8, 6))
plt.plot(results_df['alpha'], results_df['reconstruction_error'], 'bo-')
plt.xlabel('Alpha (Ridge Function Steepness)')
plt.ylabel('Reconstruction Error')
plt.title('Reconstruction Error vs Alpha Parameter')
plt.xscale('log')
plt.grid(True, alpha=0.3)
plt.show()

Comparison with Other Methods

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import time

# Work on a 500-sample subset so that t-SNE remains fast.
X_sample = X_scaled[:500]
y_sample = y[:500]

methods = {}
times = {}

# PCA baseline.
tic = time.time()
methods['PCA'] = PCA(n_components=2, random_state=42).fit_transform(X_sample)
times['PCA'] = time.time() - tic

# t-SNE baseline.
tic = time.time()
methods['t-SNE'] = TSNE(
    n_components=2, random_state=42, perplexity=30
).fit_transform(X_sample)
times['t-SNE'] = time.time() - tic

# pyppur with the distance-distortion objective.
tic = time.time()
methods['pyppur (Distance)'] = ProjectionPursuit(
    n_components=2,
    objective=Objective.DISTANCE_DISTORTION,
    alpha=1.5,
    n_init=3,
    random_state=42,
).fit_transform(X_sample)
times['pyppur (Distance)'] = time.time() - tic

# pyppur with the reconstruction objective.
tic = time.time()
methods['pyppur (Reconstruction)'] = ProjectionPursuit(
    n_components=2,
    objective=Objective.RECONSTRUCTION,
    tied_weights=False,
    alpha=1.0,
    n_init=3,
    random_state=42,
).fit_transform(X_sample)
times['pyppur (Reconstruction)'] = time.time() - tic

# One panel per method in a 2x2 grid, colored by digit label.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
for ax, (label, embedding) in zip(axes.ravel(), methods.items()):
    ax.scatter(embedding[:, 0], embedding[:, 1], c=y_sample,
               cmap='tab10', alpha=0.7, s=20)
    ax.set_title(f'{label} (Time: {times[label]:.2f}s)')
    ax.set_xlabel('Component 1')
    ax.set_ylabel('Component 2')

plt.tight_layout()
plt.show()

# Summarize wall-clock cost per method.
print("\nTiming Comparison:")
for label, elapsed in times.items():
    print(f"{label}: {elapsed:.3f} seconds")

Evaluation and Metrics

from pyppur.utils.metrics import evaluate_embedding

# Fit a distance-distortion model for a full evaluation pass.
pp = ProjectionPursuit(
    n_components=2,
    objective=Objective.DISTANCE_DISTORTION,
    alpha=1.5,
    n_init=5,
    random_state=42
)

X_proj = pp.fit_transform(X_scaled)

# Built-in evaluation: the estimator computes its own metric suite.
metrics = pp.evaluate(X_scaled, y, n_neighbors=10)
print("Built-in evaluation:")
for metric, value in metrics.items():
    print(f"  {metric}: {value:.4f}")

# Manual evaluation via the utils module, from the raw data and embedding.
manual_metrics = evaluate_embedding(X_scaled, X_proj, y, n_neighbors=10)
print("\nManual evaluation:")
for metric, value in manual_metrics.items():
    print(f"  {metric}: {value:.4f}")

# Individual metrics are also exposed directly on the fitted estimator.
print("\nAdditional metrics:")
print(f"Distance distortion: {pp.distance_distortion(X_scaled):.6f}")
print(f"Reconstruction error: {pp.reconstruction_error(X_scaled):.6f}")
print(f"Trustworthiness (k=5): {pp.compute_trustworthiness(X_scaled, 5):.4f}")
print(f"Trustworthiness (k=15): {pp.compute_trustworthiness(X_scaled, 15):.4f}")

Working with Large Datasets

# Strategies for applying pyppur to large datasets.

# 1. Cheaper optimization: a single restart and a tighter iteration cap.
pp_fast = ProjectionPursuit(
    n_components=2,
    objective=Objective.RECONSTRUCTION,
    n_init=1,  # Fewer initializations
    max_iter=50,  # Fewer iterations
    alpha=1.0
)

# 2. Prefer the reconstruction objective when n is large:
#    distance distortion needs O(n²) memory for the pairwise distance matrices,
#    while reconstruction only needs O(nk).

# 3. Choose the objective from the dataset size (subsample for distance
#    distortion if you must use it on large n).
if X_scaled.shape[0] > 5000:
    print("Large dataset detected, using reconstruction objective")
    config = dict(
        objective=Objective.RECONSTRUCTION,
        tied_weights=True,  # Tied decoder is faster than untied
        alpha=1.0,
        n_init=1,
        max_iter=100,
    )
else:
    config = dict(
        objective=Objective.DISTANCE_DISTORTION,
        alpha=1.5,
        n_init=3,
    )

pp_large = ProjectionPursuit(n_components=2, **config)
X_large_proj = pp_large.fit_transform(X_scaled)
print(f"Processed {X_scaled.shape[0]} samples in {pp_large.fit_time_:.2f} seconds")

Custom Workflows

# End-to-end scikit-learn pipeline: robust scaling followed by pyppur.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler

pipeline = Pipeline([
    ('robust_scaler', RobustScaler()),  # Less sensitive to outliers than StandardScaler
    ('projection_pursuit', ProjectionPursuit(
        n_components=2,
        objective=Objective.RECONSTRUCTION,
        tied_weights=False,
        l2_reg=0.05,
        alpha=1.2,
        n_init=5,
        verbose=True
    ))
])

# Fit the whole pipeline on the raw (unscaled) digits data.
X_pipeline = pipeline.fit_transform(X)

# The fitted pyppur step can be pulled back out of the pipeline by name.
pp_model = pipeline.named_steps['projection_pursuit']
print(f"Final loss: {pp_model.best_loss_:.6f}")
print(f"Optimization info: {pp_model.optimizer_info_}")

# Scatter the pipeline embedding, colored by digit label.
plt.figure(figsize=(8, 6))
points = plt.scatter(X_pipeline[:, 0], X_pipeline[:, 1], c=y, cmap='tab10', alpha=0.7)
plt.colorbar(points)
plt.title('pyppur with Robust Scaling Pipeline')
plt.xlabel('Component 1')
plt.ylabel('Component 2')
plt.show()

These examples demonstrate the flexibility and power of pyppur for various dimensionality reduction tasks.