"""
SUPPLEMENTARY CODE
Empirical Constraints on Photoevaporation from the Stellar Flux Dependence 
of the Exoplanet Radius Valley

Ryan Shamim, Anima Core Inc.

This code implements the valley detection, flux binning, power-law fitting,
and statistical analysis described in the main text.
"""

import pandas as pd
import numpy as np
from scipy import stats
from scipy.optimize import minimize
from sklearn.utils import resample
import matplotlib.pyplot as plt

# ==============================================================================
# DATA LOADING
# ==============================================================================

def load_exoplanet_data(filepath='PSCompPars_2026.01.01.csv'):
    """
    Load NASA Exoplanet Archive data with quality filters.
    
    Returns:
        DataFrame with filtered planets (5,987 in final sample)
    """
    # Load data
    df = pd.read_csv(filepath, comment='#')
    
    # Quality filters
    mask = (
        (df['pl_rade'] > 1.0) &           # Minimum radius
        (df['pl_rade'] < 6.0) &           # Maximum radius  
        (df['pl_insol'].notna()) &        # Has flux measurement
        (df['pl_rade'].notna())           # Has radius measurement
    )
    
    clean_df = df[mask].copy()
    print(f"Loaded {len(clean_df)} planets")
    
    return clean_df


# ==============================================================================
# VALLEY DETECTION
# ==============================================================================

def find_valley_kde(radii, bandwidth=0.2, valley_range=(1.5, 2.5)):
    """
    Detect radius valley using kernel density estimation.
    
    Args:
        radii: Array of planet radii (Earth radii)
        bandwidth: KDE bandwidth parameter
        valley_range: Search range for valley minimum
        
    Returns:
        Valley location (R_Earth) or np.nan if insufficient data
    """
    if len(radii) < 50:
        return np.nan
    
    # Kernel density estimation
    kde = stats.gaussian_kde(radii, bw_method=bandwidth)
    
    # Evaluate density on grid
    x_grid = np.linspace(1.0, 4.0, 500)
    density = kde(x_grid)
    
    # Find minimum in valley range
    mask = (x_grid >= valley_range[0]) & (x_grid <= valley_range[1])
    valley_density = density[mask]
    valley_grid = x_grid[mask]
    
    if len(valley_density) == 0:
        return np.nan
    
    min_idx = np.argmin(valley_density)
    valley_location = valley_grid[min_idx]
    
    return valley_location


# ==============================================================================
# FLUX BINNING ANALYSIS
# ==============================================================================

def measure_valley_vs_flux(df, flux_bins=None):
    """
    Measure valley location across stellar flux bins.
    
    Args:
        df: DataFrame with 'pl_rade' and 'pl_insol' columns
        flux_bins: List of (flux_min, flux_max, label) tuples
        
    Returns:
        DataFrame with valley locations per bin
    """
    if flux_bins is None:
        # Default 3-bin analysis
        flux_bins = [
            (0, 20, "Low"),
            (20, 200, "Medium"),  
            (200, np.inf, "High")
        ]
    
    results = []
    
    for flux_min, flux_max, label in flux_bins:
        # Select planets in flux range
        mask = (df['pl_insol'] >= flux_min) & (df['pl_insol'] < flux_max)
        subset = df[mask]
        
        # Find valley
        valley = find_valley_kde(subset['pl_rade'].values)
        
        results.append({
            'label': label,
            'flux_min': flux_min,
            'flux_max': flux_max,
            'flux_center': np.sqrt(flux_min * flux_max) if flux_max != np.inf else flux_min * 2,
            'valley_radius': valley,
            'n_planets': len(subset)
        })
        
        print(f"{label:10s}: Valley = {valley:.3f} Re, N = {len(subset)}")
    
    return pd.DataFrame(results)


# ==============================================================================
# POWER-LAW FITTING
# ==============================================================================

def fit_power_law(flux_values, valley_values):
    """
    Fit power law: R_valley = a * S^b + c
    
    Args:
        flux_values: Array of flux bin centers (S_Earth)
        valley_values: Array of valley locations (R_Earth)
        
    Returns:
        (a, b, c): Best-fit parameters
    """
    # Remove NaN values
    mask = ~np.isnan(valley_values)
    S = flux_values[mask]
    R = valley_values[mask]
    
    # Objective function (least squares)
    def objective(params):
        a, b, c = params
        predicted = a * S**b + c
        residuals = R - predicted
        return np.sum(residuals**2)
    
    # Initial guess
    x0 = [0.1, 0.33, 1.5]
    
    # Bounds to ensure physical solutions
    bounds = [
        (0.001, 1.0),   # a > 0
        (0.0, 1.0),     # 0 < b < 1  
        (0.5, 2.0)      # c > 0
    ]
    
    # Optimize
    result = minimize(objective, x0, bounds=bounds, method='L-BFGS-B')
    
    a, b, c = result.x
    print(f"Best fit: R = {a:.3f} * S^{b:.2f} + {c:.2f}")
    
    return a, b, c


# ==============================================================================
# BOOTSTRAP UNCERTAINTY ESTIMATION  
# ==============================================================================

def bootstrap_valley_shift(df, flux_low=20, flux_high=200, n_iterations=10000):
    """
    Calculate confidence intervals for valley shift using bootstrap.
    
    Args:
        df: Planet DataFrame
        flux_low: Low flux threshold
        flux_high: High flux threshold
        n_iterations: Number of bootstrap samples
        
    Returns:
        Dictionary with statistics
    """
    shifts = []
    
    print(f"Running bootstrap ({n_iterations} iterations)...")
    
    for i in range(n_iterations):
        # Resample with replacement
        sample = resample(df)
        
        # Find valleys in low and high flux
        low = sample[sample['pl_insol'] < flux_low]
        high = sample[sample['pl_insol'] > flux_high]
        
        valley_low = find_valley_kde(low['pl_rade'].values)
        valley_high = find_valley_kde(high['pl_rade'].values)
        
        if not np.isnan(valley_low) and not np.isnan(valley_high):
            shifts.append(valley_high - valley_low)
        
        if (i + 1) % 1000 == 0:
            print(f"  {i+1}/{n_iterations}")
    
    shifts = np.array(shifts)
    
    # Calculate statistics
    results = {
        'mean_shift': np.mean(shifts),
        'std_shift': np.std(shifts),
        'ci_95_low': np.percentile(shifts, 2.5),
        'ci_95_high': np.percentile(shifts, 97.5),
        'p_value': (shifts <= 0).mean(),
        'significance_sigma': np.abs(np.mean(shifts) / np.std(shifts))
    }
    
    print(f"\nResults:")
    print(f"  Mean shift: {results['mean_shift']:.3f} ± {results['std_shift']:.3f} Re")
    print(f"  95% CI: [{results['ci_95_low']:.3f}, {results['ci_95_high']:.3f}]")
    print(f"  Significance: {results['significance_sigma']:.1f} sigma")
    
    return results


# ==============================================================================
# MAIN ANALYSIS PIPELINE
# ==============================================================================

def main():
    """
    Execute complete analysis pipeline.
    """
    print("="*70)
    print("EXOPLANET RADIUS VALLEY FLUX DEPENDENCE ANALYSIS")
    print("="*70)
    
    # Load data
    df = load_exoplanet_data('PSCompPars_2026.01.01.csv')
    
    # Measure valley vs flux (3-bin analysis)
    print("\n--- Three-Bin Analysis ---")
    results_3bin = measure_valley_vs_flux(df)
    
    # Measure valley vs flux (10-bin analysis for fitting)
    print("\n--- Ten-Bin Analysis ---")
    flux_edges = np.logspace(0, 3, 11)
    flux_bins_10 = [(flux_edges[i], flux_edges[i+1], f"Bin{i+1}") 
                     for i in range(len(flux_edges)-1)]
    results_10bin = measure_valley_vs_flux(df, flux_bins_10)
    
    # Fit power law
    print("\n--- Power-Law Fitting ---")
    a, b, c = fit_power_law(
        results_10bin['flux_center'].values,
        results_10bin['valley_radius'].values
    )
    
    # Bootstrap analysis
    print("\n--- Bootstrap Significance Test ---")
    bootstrap_stats = bootstrap_valley_shift(df, flux_low=20, flux_high=200)
    
    print("\n" + "="*70)
    print("ANALYSIS COMPLETE")
    print("="*70)


if __name__ == "__main__":
    main()


# ==============================================================================
# NOTES
# ==============================================================================
"""
This code reproduces the key results reported in the main text:

1. Valley shift: 1.51 → 2.25 Re (49% increase)
2. Power-law scaling: R ∝ S^0.37
3. Statistical significance: 6.8σ

Dependencies:
- pandas >= 1.5.0
- numpy >= 1.23.0  
- scipy >= 1.9.0
- scikit-learn >= 1.1.0
- matplotlib >= 3.6.0

Data:
- Download PSCompPars table from NASA Exoplanet Archive
- https://exoplanetarchive.ipcc.caltech.edu/
"""