Tutorial: ASCICat vs. Pareto Analysis

This tutorial compares ASCICat ranking with Pareto frontier analysis.

Overview

Both methods address multi-objective optimization, but in different ways:

| Aspect | Pareto | ASCICat |
|---|---|---|
| Output | Set of non-dominated solutions | Ranked list |
| Preferences | Not required | Explicit weights |
| Comparability | Within dataset | Across studies |
| Interpretation | Trade-off set | Prioritized ranking |

Implementation

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from ascicat import ASCICalculator
from pathlib import Path

# Destination for every figure and table this tutorial writes;
# created up front (including parents) so later saves cannot fail on a
# missing directory.
output_dir = Path('results/pareto_comparison')
output_dir.mkdir(exist_ok=True, parents=True)

# ASCI weights passed to the calculator; the three values sum to 1.
asci_weights = {'w_a': 0.33, 'w_s': 0.33, 'w_c': 0.34}

# Load the HER dataset and score it
calc = ASCICalculator(reaction='HER')
calc.load_data('data/HER_clean.csv')
results = calc.calculate_asci(**asci_weights)

print(f"Dataset: {len(results):,} catalysts")

Pareto Frontier Calculation

def is_pareto_optimal(costs):
    """
    Find Pareto-optimal (non-dominated) points.

    A point is dropped when some other point is less than or equal to it
    in every objective and differs from it in at least one. Exact
    duplicates of a Pareto-optimal point are all kept.

    Parameters
    ----------
    costs : array-like, shape (n_samples, n_objectives)
        Objective values to minimize (lower is better). Anything
        ``np.asarray`` can turn into a 2-D array is accepted (nested
        lists, ndarray, DataFrame).

    Returns
    -------
    is_optimal : ndarray of bool, shape (n_samples,)
        ``True`` for rows that are Pareto-optimal.
    """
    # Coerce first: the code below needs .shape and boolean-mask indexing,
    # which plain Python sequences do not provide.
    costs = np.asarray(costs)

    is_optimal = np.ones(costs.shape[0], dtype=bool)
    for i, c in enumerate(costs):
        # Rows already eliminated cannot eliminate anything new that a
        # surviving dominator would not also eliminate, so skip them.
        if is_optimal[i]:
            survivors = costs[is_optimal]
            # A survivor stays if it beats c in at least one objective
            # (so c does not dominate it) or is an exact duplicate of c.
            is_optimal[is_optimal] = (
                np.any(survivors < c, axis=1)
                | np.all(survivors == c, axis=1)
            )
            # Redundant with the duplicate clause for ordinary values, but
            # keeps row i flagged even when c == c is False (NaN entries).
            is_optimal[i] = True
    return is_optimal


# The Pareto routine minimizes, whereas the activity, stability and cost
# scores are all "higher is better". Each score s is therefore flipped to
# 1 - s before the search.
score_columns = ('activity_score', 'stability_score', 'cost_score')
objectives = np.column_stack(
    [1 - results[name].values for name in score_columns]
)

# Boolean mask with one entry per row of `results`
pareto_mask = is_pareto_optimal(objectives)
n_pareto = pareto_mask.sum()

print(f"\nPareto-optimal catalysts: {n_pareto}")
print(f"Percentage of dataset: {100*n_pareto/len(results):.1f}%")

Compare with ASCI Ranking

# Symbols found in the first `top_n` rows of `results`
# NOTE(review): membership is tracked by symbol, so rows sharing a symbol
# collapse into one entry — confirm symbols are unique in the dataset.
top_n = 100
asci_top = set(results['symbol'].head(top_n))

# Symbols of the rows flagged as Pareto-optimal
pareto_catalysts = set(results.loc[pareto_mask, 'symbol'])

# Split the two selections into shared and exclusive parts
overlap = asci_top.intersection(pareto_catalysts)
asci_only = asci_top.difference(pareto_catalysts)
pareto_only = pareto_catalysts.difference(asci_top)

print(f"\n{'='*60}")
print(f"COMPARISON: Top {top_n} ASCI vs Pareto Frontier")
print(f"{'='*60}")
print(f"\nOverlap: {len(overlap)} catalysts")
print(f"  ({100*len(overlap)/top_n:.1f}% of top ASCI are Pareto-optimal)")
print(f"\nASCI top {top_n} only: {len(asci_only)} catalysts")
print(f"Pareto-optimal only: {len(pareto_only)} catalysts")

Visualize Comparison

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# Row-level membership masks. "Top ASCI" is positional (the first top_n
# rows), i.e. the same selection results.head(top_n) makes, so it does not
# depend on what the DataFrame index labels happen to be.
in_top = np.arange(len(results)) < top_n
in_both = in_top & pareto_mask

# The highlighted subsets are identical for every panel: slice them once.
pareto_data = results[pareto_mask]
asci_data = results.head(top_n)
both_data = results[in_both]

# 2D projections of the three-score space: (x column, y column, title)
score_pairs = [
    ('activity_score', 'cost_score', 'Activity vs Cost'),
    ('activity_score', 'stability_score', 'Activity vs Stability'),
    ('stability_score', 'cost_score', 'Stability vs Cost')
]

for ax, (x_col, y_col, title) in zip(axes, score_pairs):
    # All points (gray background, kept out of the legend)
    ax.scatter(results[x_col], results[y_col],
               c='lightgray', s=10, alpha=0.3, label='_nolegend_')

    # Pareto points (red)
    ax.scatter(pareto_data[x_col], pareto_data[y_col],
               c='red', s=30, alpha=0.7, label='Pareto-optimal')

    # Top ASCI (blue)
    ax.scatter(asci_data[x_col], asci_data[y_col],
               c='blue', s=30, alpha=0.7, label=f'Top {top_n} ASCI')

    # Overlap (purple), drawn last so it is not hidden under the blue layer
    ax.scatter(both_data[x_col], both_data[y_col],
               c='purple', s=30, alpha=0.7, label='Both')

    ax.set_xlabel(x_col.replace('_', ' ').title())
    ax.set_ylabel(y_col.replace('_', ' ').title())
    ax.set_title(title)
    ax.legend(loc='lower left', fontsize=9)
    # Axes start at 0 with a little headroom above 1
    ax.set_xlim(0, 1.05)
    ax.set_ylim(0, 1.05)

fig.tight_layout()
fig.savefig(output_dir / 'pareto_vs_asci_2d.png', dpi=600)
# Close this specific figure so it does not stay registered with pyplot
plt.close(fig)

print(f"\nFigure saved: {output_dir}/pareto_vs_asci_2d.png")

Rank Distribution of Pareto Points

# Where do Pareto-optimal points rank in ASCI?
# Rank is the 1-based row position in `results` (row 0 is rank 1), the same
# positional convention results.head(top_n) relies on. Positions, unlike
# index labels, stay correct if `results` does not carry a default 0..n-1
# index.
pareto_ranks = np.flatnonzero(pareto_mask) + 1

fig, ax = plt.subplots(figsize=(10, 5))

ax.hist(pareto_ranks, bins=50, edgecolor='black', alpha=0.7)
# Mark the cut-off used for the "top N" comparison
ax.axvline(top_n, color='red', linestyle='--',
           label=f'Top {top_n} threshold')
ax.set_xlabel('ASCI Rank')
ax.set_ylabel('Count of Pareto-Optimal Catalysts')
ax.set_title('Distribution of Pareto-Optimal Catalysts by ASCI Rank')
ax.legend()

# Summary statistics in a text box anchored to the upper-right corner
# (axes-fraction coordinates)
stats_text = f'n = {n_pareto} Pareto-optimal\n'
stats_text += f'Median rank: {np.median(pareto_ranks):.0f}\n'
stats_text += f'Mean rank: {np.mean(pareto_ranks):.0f}'
ax.text(0.95, 0.95, stats_text, transform=ax.transAxes,
        fontsize=10, va='top', ha='right',
        bbox=dict(boxstyle='round', facecolor='white'))

# Same layout pass as the 2D projection figure
fig.tight_layout()
fig.savefig(output_dir / 'pareto_rank_distribution.png', dpi=600)
plt.close(fig)

print(f"Figure saved: {output_dir}/pareto_rank_distribution.png")

Detailed Comparison Table

# Number of leading rows of `results` to tabulate. Defined once so the
# heading, the table and the summary cannot drift apart.
n_table = 20
top_rows = results.head(n_table)

# Create comparison DataFrame column-wise. `.to_numpy()` drops the source
# index so the frame gets a fresh 0..k-1 index.
# Pareto status is looked up by symbol in `pareto_catalysts`.
comparison_df = pd.DataFrame({
    'ASCI_Rank': range(1, len(top_rows) + 1),
    'Symbol': top_rows['symbol'].to_numpy(),
    'ASCI': top_rows['ASCI'].to_numpy(),
    'Activity': top_rows['activity_score'].to_numpy(),
    'Stability': top_rows['stability_score'].to_numpy(),
    'Cost': top_rows['cost_score'].to_numpy(),
    'Pareto_Optimal': top_rows['symbol'].isin(pareto_catalysts).to_numpy(),
})

print("\n" + "="*80)
print(f"TOP {n_table} ASCI-RANKED CATALYSTS: Pareto Status")
print("="*80)
print(f"\n{'Rank':<6} {'Symbol':<14} {'ASCI':<8} {'S_a':<8} "
      f"{'S_s':<8} {'S_c':<8} {'Pareto?':<10}")
print("-"*80)

# itertuples keeps each column's own type; no per-row Series is built
for row in comparison_df.itertuples(index=False):
    pareto_str = "Yes" if row.Pareto_Optimal else "No"
    print(f"{row.ASCI_Rank:<6} {row.Symbol:<14} {row.ASCI:.4f}   "
          f"{row.Activity:.4f}   {row.Stability:.4f}   "
          f"{row.Cost:.4f}   {pareto_str:<10}")

# Summary: use the number of rows actually shown as the denominator, so a
# dataset with fewer than n_table rows still reports a correct share.
n_shown = len(comparison_df)
n_pareto_in_top20 = int(comparison_df['Pareto_Optimal'].sum())
pareto_share = 100 * n_pareto_in_top20 / n_shown if n_shown else 0.0
print(f"\n→ {n_pareto_in_top20}/{n_shown} top ASCI catalysts are Pareto-optimal "
      f"({pareto_share:.0f}%)")

When to Use Each Method

# Section banner: blank line, rule, title, rule — one item per output line
print("\n" + "="*70, "WHEN TO USE EACH METHOD", "="*70, sep="\n")

# Static guidance text, printed verbatim
print("""
PARETO FRONTIER
---------------
✓ Initial exploration without preference bias
✓ Identifying the full trade-off space
✓ When stakeholders disagree on priorities
✓ Generating options for discussion

ASCI SCORING
------------
✓ When priorities can be quantified
✓ For reproducible, documented rankings
✓ Cross-study comparisons
✓ Final prioritization for experiments

COMPLEMENTARY USE
-----------------
1. First: Use Pareto to identify non-dominated set
2. Then: Apply ASCI within Pareto set for ranking
3. Verify: Top ASCI candidates should be Pareto-optimal
""")

Combined Workflow

# Keep only the Pareto-optimal rows, renumber them from 0 in their existing
# order, and number that order 1..k as the within-set rank.
# NOTE(review): no sorting happens here, so the rank is only an ASCI rank
# if `results` is already ordered by ASCI — confirm against the calculator.
pareto_results = (
    results.loc[pareto_mask]
    .reset_index(drop=True)
    .assign(Pareto_Rank=lambda frame: range(1, len(frame) + 1))
)

print(f"\nFiltered to {len(pareto_results)} Pareto-optimal catalysts")
print("\nTop 10 within Pareto set:")
shown_columns = ['symbol', 'ASCI', 'activity_score',
                 'stability_score', 'cost_score']
print(pareto_results.head(10)[shown_columns])

Key Findings

# Section banner: blank line, rule, title, rule — one item per output line
print("\n" + "="*70, "KEY FINDINGS", "="*70, sep="\n")

# Narrative summary, filled in from the overlap and Pareto-set counts
# computed earlier in the tutorial
findings = f"""
1. OVERLAP VALIDATION
   - {100*len(overlap)/top_n:.0f}% of top {top_n} ASCI catalysts are Pareto-optimal
   - This validates ASCI's focus on high-performers

2. PARETO SET SIZE
   - {n_pareto} Pareto-optimal catalysts ({100*n_pareto/len(results):.1f}% of dataset)
   - ASCI reduces this to a single prioritized list

3. COMPLEMENTARITY
   - Pareto shows the trade-off space
   - ASCI provides actionable priorities

4. RECOMMENDATION
   - Use Pareto for understanding trade-offs
   - Use ASCI for final experimental prioritization
"""

print(findings)

# Persist the comparison table next to the figures (row index omitted)
comparison_csv = output_dir / 'asci_pareto_comparison.csv'
comparison_df.to_csv(comparison_csv, index=False)
print(f"\nResults saved to: {output_dir}/")