"""
redundancy_floor_ablation.py

Computational implementation of the ablation and sensitivity analyses for
"An explanatory audit of constraints on the genetic code's redundancy ratio:
five candidate pathways" (Reich).

Each of the five constraint paths P1-P5 is encoded as a (lower, upper) bound
function on r = 64/k. The script:
  (1) reports the intersection window under each ablation (single-path removal),
  (2) sweeps over plausible parameter ranges for P4 and P5,
  (3) classifies each path as binding (load-bearing) or non-binding witness.

No external dependencies beyond the Python standard library.

Author: Jonathan Reich
"""

from dataclasses import dataclass
from typing import Callable, Dict, List, Optional, Tuple
from itertools import product

CODONS = 64
R_SGC = 64 / 23  # ~2.7826


# ----------------------------------------------------------------------
# Path definitions
# ----------------------------------------------------------------------
#
# Each path returns (lower_bound_on_r, upper_bound_on_r). Use -inf / +inf
# to indicate a non-binding direction. We use float("-inf") and float("inf").

NEG_INF = float("-inf")
POS_INF = float("inf")


@dataclass
class PathBounds:
    name: str
    lower: float
    upper: float
    source: str

    def width(self) -> float:
        if self.lower == NEG_INF or self.upper == POS_INF:
            return POS_INF
        return self.upper - self.lower


def path_P1_comparative_genomics() -> PathBounds:
    """Empirical window across 24 naturally-occurring codes (Table 1 of paper,
    NCBI translation tables 1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 13, 14, 15, 16,
    21, 22, 23, 24, 25, 26, 29, 30, 32, 33, with ambiguous-stop codes 27, 28,
    31 discussed separately).

    Signal counts cluster as: k in {21, 22, 23, 24}, giving
        r in {64/24, 64/23, 64/22, 64/21} = {2.667, 2.783, 2.909, 3.048}.
    Empirical window: r in [2.667, 3.048].
    """
    return PathBounds(
        name="P1 (comparative genomics)",
        lower=2.667,
        upper=3.048,
        source="24 NCBI natural codes",
    )


def path_P2_shannon(p: float = 1e-4) -> PathBounds:
    """Shannon channel capacity at per-base mutation rate p (translation-level).
    At biological noise the bound is essentially trivial: k <= 64, r >= 1.
    """
    from math import log2

    # 4-ary symmetric channel capacity per base
    if p <= 0 or p >= 1:
        C = 2.0
    else:
        # H_4(p) for symmetric 4-ary error: H = -((1-p)log(1-p) + p log(p/3))
        # Standard formula yields C = 2 - H_4(p)
        H = -((1 - p) * log2(1 - p) + p * log2(p / 3))
        C = 2.0 - H
    k_max = 2 ** (3 * C)
    r_min = CODONS / k_max
    return PathBounds(
        name="P2 (Shannon capacity)",
        lower=max(r_min, 1.0),
        upper=POS_INF,
        source=f"Shannon at p={p:g}",
    )


def path_P3_mutation_load(
    N_codons: int = 10**6,
    mu_replication: float = 3e-9,
    epsilon_translation: float = 1e-3,
    L_protein: int = 300,
    c_per_misincorp: float = 0.05,
    viability_threshold: float = 0.1,
) -> PathBounds:
    """Mutation-load floor on r.

    At biological replication mutation rates, the Eigen threshold
        N * mu * (1 - sigma) < 1
    is satisfied for ANY sigma >= 0 (since N * mu ~ 3e-3 << 1), so this
    imposes no lower bound on sigma hence none on r.

    At translation level, the expected cost per copy
        epsilon * L * (1 - sigma) * c
    is well below viability for any plausible sigma. Non-binding.

    The function returns a non-binding pair to make the non-bindingness
    explicit, with diagnostic output available via the dataclass fields.
    """
    eigen_load = N_codons * mu_replication  # ~3e-3 at defaults
    translation_load = epsilon_translation * L_protein * c_per_misincorp  # ~0.015

    binding = (eigen_load >= 1.0) or (translation_load >= viability_threshold)

    if not binding:
        return PathBounds(
            name="P3 (mutation load)",
            lower=NEG_INF,
            upper=POS_INF,
            source=(
                f"non-binding: Eigen load={eigen_load:.3g}<1, "
                f"translation cost={translation_load:.3g}<{viability_threshold}"
            ),
        )

    # If parameters pushed into binding regime, a coarse floor:
    # need sigma >= 1 - 1/(N*mu) which translates to r ~ 1/(1 - sigma_min)
    sigma_min = max(0.0, 1.0 - 1.0 / eigen_load)
    r_min = 1.0 / (1.0 - sigma_min + 1e-12)
    return PathBounds(
        name="P3 (mutation load)",
        lower=r_min,
        upper=POS_INF,
        source=f"Eigen-binding at N*mu={eigen_load:.3g}",
    )


def path_P4_trna_discrimination(
    anticodon_classes_max: int = 32, n_stops_max: int = 3
) -> PathBounds:
    """tRNA codon-anticodon discrimination ceiling on signal count k.

    Under Crick's wobble rules (1966), the minimum cytoplasmic tRNA repertoire
    to read all 61 sense codons is ~32 distinct anticodons. Each amino acid
    requires at least one cognate anticodon class, so #aa <= 32; combined with
    at most 3 stop codons (UAA/UAG/UGA), this gives k <= 35.

    Floor: r >= 64 / (anticodon_classes_max + n_stops_max).

    Default 32 + 3 = 35 -> r >= 1.829 (loose theoretical bound).

    A tighter empirical figure (anticodon_classes_max ~= 22, from aaRS
    saturation) gives r >= 2.56, but is partially circular with P1/P5 since
    aaRS count tracks the canonical 20-aa alphabet.
    """
    k_max = anticodon_classes_max + n_stops_max
    return PathBounds(
        name="P4 (tRNA discrimination)",
        lower=CODONS / k_max,
        upper=POS_INF,
        source=f"anticodon_classes_max={anticodon_classes_max}, stops_max={n_stops_max}",
    )


def path_P5_proteome_viability(
    n_aa_min: int = 13, n_stops: int = 3
) -> PathBounds:
    """Proteome viability ceiling.

    Reduced-alphabet engineering (Riddle 1997, Akanuma 2002, Doi 2005,
    Frenkel-Pinter 2023) establishes that individual proteins can be built
    with 5-12 amino acids. Proteome-wide viability requires more; the
    chemistry-class enumeration (3 hydrophobic + 2 polar + 2 positive +
    2 negative + 1 aromatic + 3 special-purpose) gives a stipulative floor
    of ~13. The integers in that decomposition are stipulations, not
    derivations, and the resulting figure is a working choice rather than
    a sharp bound. Default n_aa_min=13 corresponds to that decomposition.

    Chemistry-space coverage (Philip & Freeland 2011, Ilardo et al. 2015)
    shows the encoded 20-aa set is highly tuned, but does not directly
    bound the minimum alphabet size for proteome viability.

    Ceiling: r <= 64 / (n_aa_min + n_stops).
    """
    k_min = n_aa_min + n_stops
    return PathBounds(
        name="P5 (proteome viability)",
        lower=NEG_INF,
        upper=CODONS / k_min,
        source=f"#aa_min={n_aa_min}, stops={n_stops}",
    )


# ----------------------------------------------------------------------
# Intersection and ablation
# ----------------------------------------------------------------------


def intersect(paths: List[PathBounds]) -> Tuple[float, float]:
    """Intersection of bound intervals across paths. Returns (lo, hi)."""
    lo = max((p.lower for p in paths), default=NEG_INF)
    hi = min((p.upper for p in paths), default=POS_INF)
    return lo, hi


def format_interval(lo: float, hi: float) -> str:
    lo_s = "-inf" if lo == NEG_INF else f"{lo:.3f}"
    hi_s = "+inf" if hi == POS_INF else f"{hi:.3f}"
    width = (hi - lo) if (lo != NEG_INF and hi != POS_INF) else POS_INF
    width_s = "inf" if width == POS_INF else f"{width:.3f}"
    return f"[{lo_s}, {hi_s}] (width {width_s})"


def run_ablation(
    paths: Dict[str, PathBounds],
) -> List[Tuple[str, float, float]]:
    """Compute the intersection window for the full set and for each
    single-path ablation. Returns list of (configuration_label, lo, hi).
    """
    names = list(paths.keys())
    results = []

    full = list(paths.values())
    lo, hi = intersect(full)
    results.append(("All paths (full)", lo, hi))

    for ablated in names:
        kept = [paths[n] for n in names if n != ablated]
        lo, hi = intersect(kept)
        results.append((f"\\ {ablated}", lo, hi))

    # Theory-only and empirical-only views
    theory_only = [paths[n] for n in names if n != "P1"]
    lo, hi = intersect(theory_only)
    results.append(("Theory only (P2 cap P3 cap P4 cap P5)", lo, hi))

    if "P1" in paths:
        results.append(("P1 only (empirical)", paths["P1"].lower, paths["P1"].upper))

    # The tight theoretical bracket
    if "P4" in paths and "P5" in paths:
        lo, hi = intersect([paths["P4"], paths["P5"]])
        results.append(("P4 cap P5 (binding theory only)", lo, hi))

    return results


def classify_binding(paths: Dict[str, PathBounds]) -> Dict[str, str]:
    """Two-level classification reflecting the paper's framing:

    (i) "binding in full intersection" if removing the path from the full set
        widens the final intersection.
    (ii) "binding theoretically" if removing the path widens the *theoretical*
        bracket (i.e., the intersection of P2, P3, P4, P5 excluding P1).

    P1 is the empirical anchor and dominates the full intersection; P4 and P5
    are theoretically binding (active edges of P2 cap P3 cap P4 cap P5) but
    shadowed by P1. P2 and P3 are non-binding in both views.
    """
    full_lo, full_hi = intersect(list(paths.values()))
    theory_lo, theory_hi = intersect([paths[n] for n in paths if n != "P1"])
    out = {}
    for name in paths:
        kept_full = [paths[n] for n in paths if n != name]
        lo_f, hi_f = intersect(kept_full)
        in_full = (lo_f < full_lo - 1e-9) or (hi_f > full_hi + 1e-9)

        if name == "P1":
            kept_theory = [paths[n] for n in paths if n != "P1"]  # unchanged
            in_theory = False
        else:
            kept_theory = [paths[n] for n in paths if n not in (name, "P1")]
            lo_t, hi_t = intersect(kept_theory)
            in_theory = (lo_t < theory_lo - 1e-9) or (hi_t > theory_hi + 1e-9)

        if in_full and in_theory:
            label = "binding (full + theoretical)"
        elif in_full:
            label = "binding (full intersection)"
        elif in_theory:
            label = "binding (theoretical bracket only, shadowed by P1)"
        else:
            label = "non-binding witness"
        out[name] = label
    return out


# ----------------------------------------------------------------------
# Sensitivity sweep
# ----------------------------------------------------------------------


def sensitivity_sweep(
    anticodon_max_values: List[int] = None,
    n_aa_min_values: List[int] = None,
) -> List[Dict]:
    """Sweep over P4 and P5 parameters, return theoretical-bracket bounds
    for each combination and a flag indicating whether the SGC is contained.

    anticodon_max_values: 32 = cytoplasmic wobble minimum;
                          38 = typical cytoplasmic;
                          46 = upper end of empirical tRNA-gene counts.
    n_aa_min_values: 10, 13, 15, 17 spans the chemistry-class enumeration
                     range from minimalist to within-class diversity.
    """
    if anticodon_max_values is None:
        anticodon_max_values = [32, 38, 46]
    if n_aa_min_values is None:
        n_aa_min_values = [10, 13, 15, 17]

    rows = []
    for ac_max, n_aa in product(anticodon_max_values, n_aa_min_values):
        p4 = path_P4_trna_discrimination(anticodon_classes_max=ac_max)
        p5 = path_P5_proteome_viability(n_aa_min=n_aa)
        lo, hi = intersect([p4, p5])
        contains_sgc = (lo <= R_SGC <= hi)
        rows.append(
            {
                "anticodon_max": ac_max,
                "n_aa_min": n_aa,
                "r_lower": lo,
                "r_upper": hi,
                "width": hi - lo,
                "contains_sgc": contains_sgc,
            }
        )
    return rows


# ----------------------------------------------------------------------
# Main report
# ----------------------------------------------------------------------


def main() -> None:
    paths = {
        "P1": path_P1_comparative_genomics(),
        "P2": path_P2_shannon(),
        "P3": path_P3_mutation_load(),
        "P4": path_P4_trna_discrimination(),
        "P5": path_P5_proteome_viability(),
    }

    print("=" * 70)
    print("REDUNDANCY FLOOR ABLATION ANALYSIS")
    print("=" * 70)
    print(f"SGC redundancy: r = 64/23 = {R_SGC:.4f}")
    print()

    print("--- Individual path bounds ---")
    for name, p in paths.items():
        lo = "-inf" if p.lower == NEG_INF else f"{p.lower:.3f}"
        hi = "+inf" if p.upper == POS_INF else f"{p.upper:.3f}"
        print(f"  {name}: r in [{lo}, {hi}]    ({p.source})")
    print()

    print("--- Ablation table ---")
    results = run_ablation(paths)
    for label, lo, hi in results:
        print(f"  {label:<48s}  r in {format_interval(lo, hi)}")
    print()

    print("--- Binding classification ---")
    classification = classify_binding(paths)
    for name, status in classification.items():
        print(f"  {paths[name].name:<35s}  {status}")
    print()

    print("--- Sensitivity sweep (theoretical bracket P4 cap P5) ---")
    print(f"  {'A_max':>6s}  {'#aa_min':>8s}  {'r_lower':>8s}  "
          f"{'r_upper':>8s}  {'width':>6s}  contains SGC?")
    rows = sensitivity_sweep()
    for row in rows:
        print(
            f"  {row['anticodon_max']:>6d}  {row['n_aa_min']:>8d}  "
            f"{row['r_lower']:>8.3f}  {row['r_upper']:>8.3f}  "
            f"{row['width']:>6.3f}  {row['contains_sgc']}"
        )
    print()

    print("--- Summary ---")
    full_lo, full_hi = intersect(list(paths.values()))
    print(f"  Full intersection:        r in {format_interval(full_lo, full_hi)}")
    th_lo, th_hi = intersect([paths["P4"], paths["P5"]])
    print(f"  Theoretical bracket only: r in {format_interval(th_lo, th_hi)}")
    print(f"  SGC r = {R_SGC:.4f} contained in full intersection: "
          f"{full_lo <= R_SGC <= full_hi}")
    print(f"  SGC r = {R_SGC:.4f} contained in theoretical bracket: "
          f"{th_lo <= R_SGC <= th_hi}")


if __name__ == "__main__":
    main()