"""Compute Relative Synonymous Codon Usage (RSCU) tables from GenBank files.

For every GenBank file in a folder, the codons of all annotated CDS features
are counted and an ``<name>_RSCU.xlsx`` table is written to the current
working directory.
"""

import os
from collections import Counter, defaultdict

import pandas as pd
from Bio import SeqIO

# Synonymous codon families, keyed by three-letter amino-acid code. The
# assignments below are those of the standard genetic code, written in the
# DNA alphabet (T, not U). CDS features annotated with a different
# translation table are still evaluated against this table.
synonymous_codons = {
    'Ala': ['GCT', 'GCC', 'GCA', 'GCG'],
    'Arg': ['CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'],
    'Asn': ['AAT', 'AAC'],
    'Asp': ['GAT', 'GAC'],
    'Cys': ['TGT', 'TGC'],
    'Gln': ['CAA', 'CAG'],
    'Glu': ['GAA', 'GAG'],
    'Gly': ['GGT', 'GGC', 'GGA', 'GGG'],
    'His': ['CAT', 'CAC'],
    'Ile': ['ATT', 'ATC', 'ATA'],
    'Leu': ['TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG'],
    'Lys': ['AAA', 'AAG'],
    'Met': ['ATG'],
    'Phe': ['TTT', 'TTC'],
    'Pro': ['CCT', 'CCC', 'CCA', 'CCG'],
    'Ser': ['TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC'],
    'Thr': ['ACT', 'ACC', 'ACA', 'ACG'],
    'Trp': ['TGG'],
    'Tyr': ['TAT', 'TAC'],
    'Val': ['GTT', 'GTC', 'GTA', 'GTG'],
    'Stop': ['TAA', 'TAG', 'TGA'],
}


def _codon_start_offset(feature):
    """Return the 0-based offset of the first complete codon in a CDS.

    GenBank stores this as the 1-based ``codon_start`` qualifier (1, 2 or 3).
    A missing or malformed qualifier is treated as frame 1 (offset 0).
    """
    raw = feature.qualifiers.get("codon_start", ["1"])
    value = raw[0] if isinstance(raw, (list, tuple)) else raw
    try:
        offset = int(value) - 1
    except (TypeError, ValueError):
        return 0
    return offset if offset in (0, 1, 2) else 0


def _count_cds_codons(record):
    """Count in-frame codons over every CDS feature of one sequence record.

    Each CDS is split into triplets on its own, so a CDS whose length is not
    a multiple of three cannot shift the reading frame of the ones after it.
    """
    counts = Counter()
    for feature in record.features:
        if feature.type != "CDS":
            continue
        # extract() returns the feature's own sequence (multi-part locations
        # are spliced together). Upper-casing makes lowercase / soft-masked
        # sequence match the upper-case codon table.
        cds = str(feature.extract(record.seq)).upper()
        first = _codon_start_offset(feature)
        # Stopping at len - 2 drops a trailing incomplete codon.
        counts.update(cds[i:i + 3] for i in range(first, len(cds) - 2, 3))
    return counts


def _rscu_rows(codon_counts):
    """Build ``[amino_acid, codon, rscu]`` rows from a codon Counter.

    RSCU is the observed codon count divided by the count expected if all
    synonymous codons of that amino acid were used equally. Amino acids that
    never occur get an RSCU of 0 for each of their codons.
    """
    rows = []
    for aa, codons in synonymous_codons.items():
        aa_total = sum(codon_counts[codon] for codon in codons)
        expected = aa_total / len(codons)
        for codon in codons:
            rscu = codon_counts[codon] / expected if expected > 0 else 0
            rows.append([aa, codon, rscu])
    return rows


def calculate_rscu_for_genbank(file_path):
    """Calculate RSCU for one GenBank file and save the table as Excel.

    Codon counts are pooled over all CDS features of all records in the
    file. The result is written to ``<base name>_RSCU.xlsx`` in the current
    working directory, where the base name is the input file name without
    its extension.

    Args:
        file_path: Path to a GenBank file containing one or more records.

    Raises:
        ValueError: If the file contains no GenBank records.
    """
    codon_counts = Counter()
    record_total = 0
    # parse() rather than read(): read() rejects files holding more than one
    # record (e.g. multi-chromosome or multi-contig genomes).
    for record in SeqIO.parse(file_path, "genbank"):
        record_total += 1
        codon_counts.update(_count_cds_codons(record))
    if record_total == 0:
        raise ValueError(f"No GenBank records found in: {file_path}")

    df_rscu = pd.DataFrame(
        _rscu_rows(codon_counts),
        columns=["Amino Acid", "Codon", "RSCU"],
    )

    # Name the output after the input file, minus directory and extension.
    base_name = os.path.splitext(os.path.basename(file_path))[0]
    output_file = f"{base_name}_RSCU.xlsx"
    df_rscu.to_excel(output_file, index=False)

    print(f"RSCU results saved to: {output_file}")


def process_genbank_files_in_folder(folder_path):
    """Run :func:`calculate_rscu_for_genbank` on every GenBank file in a folder.

    Files are selected by extension (``.gb`` or ``.gbf``, case-insensitive)
    and processed in sorted name order. Only the folder itself is scanned,
    not its sub-folders.

    Args:
        folder_path: Directory containing the GenBank files.
    """
    # NOTE: two inputs sharing a base name (a.gb and a.gbf) map to the same
    # output file; the later one overwrites the earlier.
    genbank_files = sorted(
        name
        for name in os.listdir(folder_path)
        if name.lower().endswith(('.gb', '.gbf'))
    )

    for genbank_file in genbank_files:
        calculate_rscu_for_genbank(os.path.join(folder_path, genbank_file))


# Folder containing the GenBank files.
folder_path = "/Path"  # Replace with the actual folder path

# Only run the batch job when executed as a script, not when imported.
if __name__ == "__main__":
    process_genbank_files_in_folder(folder_path)