import os
from Bio import SeqIO
from collections import Counter, defaultdict
import pandas as pd

# Codon table: three-letter amino-acid name -> list of its synonymous DNA codons.
# The 'Stop' entry groups the three termination codons. Each row below is
# written as a space-separated string and split into a list of codons.
synonymous_codons = {
    amino_acid: codon_row.split()
    for amino_acid, codon_row in (
        ('Ala', 'GCT GCC GCA GCG'),
        ('Arg', 'CGT CGC CGA CGG AGA AGG'),
        ('Asn', 'AAT AAC'),
        ('Asp', 'GAT GAC'),
        ('Cys', 'TGT TGC'),
        ('Gln', 'CAA CAG'),
        ('Glu', 'GAA GAG'),
        ('Gly', 'GGT GGC GGA GGG'),
        ('His', 'CAT CAC'),
        ('Ile', 'ATT ATC ATA'),
        ('Leu', 'TTA TTG CTT CTC CTA CTG'),
        ('Lys', 'AAA AAG'),
        ('Met', 'ATG'),
        ('Phe', 'TTT TTC'),
        ('Pro', 'CCT CCC CCA CCG'),
        ('Ser', 'TCT TCC TCA TCG AGT AGC'),
        ('Thr', 'ACT ACC ACA ACG'),
        ('Trp', 'TGG'),
        ('Tyr', 'TAT TAC'),
        ('Val', 'GTT GTC GTA GTG'),
        ('Stop', 'TAA TAG TGA'),
    )
}

# Helper: read the reading-frame offset declared on a CDS feature
def _cds_codon_start(feature):
    """Return the 1-based ``codon_start`` of a CDS feature (1, 2 or 3).

    Falls back to 1 when the qualifier is absent, empty, non-numeric or
    outside the range 1-3.
    """
    raw_values = feature.qualifiers.get("codon_start", ["1"])
    try:
        codon_start = int(raw_values[0])
    except (IndexError, TypeError, ValueError):
        return 1
    return codon_start if codon_start in (1, 2, 3) else 1


# Helper: split one coding sequence into its complete, in-frame codons
def _iter_cds_codons(cds_sequence, codon_start=1):
    """Yield the complete codons of a single CDS, read in its own frame.

    Args:
        cds_sequence: Nucleotide sequence of one CDS, as a string.
        codon_start: 1-based position of the first base of the first
            complete codon within ``cds_sequence``.

    Yields:
        Upper-case three-letter strings. A trailing partial codon (one or
        two leftover bases) is dropped.
    """
    # The codon table is upper-case, so normalise before slicing.
    sequence = cds_sequence.upper()
    for start in range(codon_start - 1, len(sequence) - 2, 3):
        yield sequence[start:start + 3]


# Function to calculate RSCU and save results to an Excel file
def calculate_rscu_for_genbank(file_path):
    """Compute RSCU for every codon in a GenBank file and save it as Excel.

    Codons are counted over all CDS features of all records in the file.
    For each amino-acid group in ``synonymous_codons`` the RSCU of a codon is
    its observed count divided by the count expected if all synonymous codons
    of that group were used equally (group total / number of codons). Groups
    that never occur get an RSCU of 0 for each of their codons.

    The result table (columns "Amino Acid", "Codon", "RSCU") is written to
    ``<input base name>_RSCU.xlsx``; the name is relative, so the file lands
    in the current working directory.

    Args:
        file_path: Path to the GenBank file to analyse.

    Raises:
        ValueError: If the file contains no GenBank record.
    """
    codon_counts = Counter()
    record_count = 0

    # SeqIO.parse (rather than SeqIO.read) so multi-record files are accepted.
    for genome_record in SeqIO.parse(file_path, "genbank"):
        record_count += 1
        for feature in genome_record.features:
            if feature.type != "CDS":  # Only select coding sequences
                continue
            # feature.extract already returns the spliced sequence for
            # multi-segment (joined) locations.
            coding_seq = str(feature.extract(genome_record.seq))
            # Count codons per CDS: concatenating all CDS first would let a
            # single CDS whose length is not a multiple of 3 shift the reading
            # frame of every CDS that follows it.
            codon_counts.update(
                _iter_cds_codons(coding_seq, _cds_codon_start(feature))
            )

    if record_count == 0:
        raise ValueError(f"No GenBank records found in: {file_path}")

    # Calculate RSCU values
    rscu_values = []
    for aa, codons in synonymous_codons.items():
        aa_total = sum(codon_counts[codon] for codon in codons)
        # Expected count per codon under equal usage of all synonyms.
        expected_count = aa_total / len(codons)
        for codon in codons:
            observed_count = codon_counts[codon]
            rscu = observed_count / expected_count if expected_count > 0 else 0
            rscu_values.append([aa, codon, rscu])

    # Create a DataFrame for the RSCU results
    df_rscu = pd.DataFrame(rscu_values, columns=["Amino Acid", "Codon", "RSCU"])

    # Extract the name of the GenBank file (without extension) for the Excel file name
    base_name = os.path.splitext(os.path.basename(file_path))[0]

    # Save the DataFrame to an Excel file
    output_file = f"{base_name}_RSCU.xlsx"
    df_rscu.to_excel(output_file, index=False)

    print(f"RSCU results saved to: {output_file}")


# Function to process all GenBank files in a folder
def process_genbank_files_in_folder(folder_path):
    """Run the RSCU calculation on every GenBank file directly inside a folder.

    Only regular files whose names end in ``.gb`` or ``.gbf`` are processed;
    sub-folders are not searched. Files are handled in sorted name order so
    repeated runs behave the same way.

    Args:
        folder_path: Directory containing the GenBank files.
    """
    for entry_name in sorted(os.listdir(folder_path)):
        if not entry_name.endswith(('.gb', '.gbf')):
            continue
        file_path = os.path.join(folder_path, entry_name)
        # A directory named like "something.gb" would otherwise be handed to
        # the parser and abort the whole batch.
        if not os.path.isfile(file_path):
            continue
        calculate_rscu_for_genbank(file_path)


# Specify the folder containing the GenBank files
folder_path = "/Path"  # Replace with the actual folder path

# Process all GenBank files in the folder. Guarded so that importing this
# module (e.g. to reuse the functions above) does not start a batch run.
if __name__ == "__main__":
    process_genbank_files_in_folder(folder_path)