import sys
import pandas as pd
import numpy as np
from tqdm import tqdm

def compare_alignments(alignment1: np.ndarray, alignment2: np.ndarray) -> list[float]:
    """
    Compare two 1D alignments boundary by boundary.

    Both alignments must contain the same number of boundaries. Absolute
    differences are normalised by the total signal span, i.e. the largest
    final boundary minus the smallest first boundary across both alignments.

    Returns, in order:
    1. norm_mean_diff: mean absolute boundary difference / total signal span
    2. normalized_max_diff: largest absolute boundary difference / total signal span
    3. pct_identical_boundaries: fraction (0.0-1.0) of boundaries that are
       exactly equal in both alignments

    NaN boundaries are ignored by the mean and the max, and never count as
    identical. Two empty alignments give NaN for all three values, since
    none of the metrics is defined. A zero span follows NumPy division
    semantics (NaN or inf).

    Raises:
        ValueError: if the alignments differ in length.
    """
    first = np.asarray(alignment1, dtype=np.float64)
    second = np.asarray(alignment2, dtype=np.float64)

    if first.shape != second.shape:
        raise ValueError("Alignment length mismatch")

    if first.size == 0:
        # Nothing to compare: no span and no boundaries to average over.
        return [np.nan, np.nan, np.nan]

    # fmax/fmin skip a NaN endpoint when the other one is valid, so the span
    # does not depend on the order in which the alignments are passed.
    total_signal_span = np.fmax(first[-1], second[-1]) - np.fmin(first[0], second[0])

    boundary_diffs = np.abs(first - second)

    return [
        np.nanmean(boundary_diffs) / total_signal_span,
        # nanmax, to match nanmean: the builtin max() gives NaN or a number
        # depending on where the NaN sits in the sequence.
        np.nanmax(boundary_diffs) / total_signal_span,
        np.count_nonzero(boundary_diffs == 0) / boundary_diffs.size,
    ]


# Tools whose alignments are compared, mapped to the number of boundaries
# dropped from each end of that tool's alignment before comparison.
# NOTE(review): the 4-boundary trim for fishnet/remora is carried over
# unchanged; presumably it lines these tools up with f5c/uncalled4 - confirm.
_TOOL_TRIMS = {"fishnet": 4, "remora": 4, "f5c": 0, "uncalled4": 0}


def _load_alignment(row, tool: str, trim: int):
    """Return ``row.alignment_<tool>`` as a trimmed float64 array, or None if it is missing."""
    raw = getattr(row, f"alignment_{tool}")
    if raw is None:
        return None
    if trim:
        raw = raw[trim:-trim]
    return raw.astype(np.float64)


def main():
    """
    Compare every pair of tool alignments for each read and save the metrics.

    Command line: ``<in_parquet> <out_file>``.

    The input parquet is read for a ``read_id`` column and one
    ``alignment_<tool>`` column per tool in ``_TOOL_TRIMS``. For every read
    and every unordered tool pair, one output row is written with the three
    metrics from ``compare_alignments``; the metrics are None when either
    alignment of the pair is missing. The result is written to ``out_file``
    as parquet.
    """
    # Local import so the block does not depend on a new file-level import.
    from itertools import combinations

    if len(sys.argv) < 3:
        sys.exit(f"usage: {sys.argv[0]} <in_parquet> <out_file>")
    in_parquet, out_file = sys.argv[1], sys.argv[2]

    df = pd.read_parquet(in_parquet)

    # Loop-invariant; yields the pairs in the tools' declaration order:
    # fishnet-remora, fishnet-f5c, fishnet-uncalled4, remora-f5c, ...
    tool_pairs = list(combinations(_TOOL_TRIMS, 2))

    rows = []

    # itertuples() is a generator, so tqdm needs the total passed explicitly
    # to display a percentage and an ETA.
    for row in tqdm(df.itertuples(), total=len(df)):
        alignments = {
            tool: _load_alignment(row, tool, trim)
            for tool, trim in _TOOL_TRIMS.items()
        }

        for a, b in tool_pairs:
            first, second = alignments[a], alignments[b]
            if first is None or second is None:
                metrics = [None, None, None]
            else:
                metrics = compare_alignments(first, second)
            rows.append([row.read_id, a, b, *metrics])

    data = pd.DataFrame(rows, columns=[
        "read_id",
        "tool1",
        "tool2",
        "norm_mean_diff",
        "normalized_max_diff",
        "pct_identical_boundaries",
    ])

    data.to_parquet(out_file)


# Run the comparison only when this file is executed as a script, so that
# importing the module does not trigger any file I/O.
if __name__=="__main__":
    main()

