"""Compute overall amino-acid composition for the CDS features of GenBank files.

For every GenBank file in a folder, all CDS features are translated, the
resulting protein sequences are concatenated, and the percentage of each
amino acid is written to an Excel workbook named after the input file.
"""

import os

import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqUtils import ProtParam


def _organism_label(record):
    """Return the record's organism annotation with spaces replaced by underscores.

    Falls back to ``"Unknown"`` when the record carries no organism annotation.
    """
    return record.annotations.get("organism", "Unknown").replace(" ", "_")


def _translate_cds(feature, parent_seq):
    """Translate one CDS feature, honouring its reading-frame and codon-table qualifiers.

    Args:
        feature: A feature whose location and qualifiers describe the CDS.
        parent_seq: The sequence of the record the feature belongs to.

    Returns:
        The translated protein sequence, truncated at the first stop codon.

    Raises:
        ValueError: If ``codon_start`` or ``transl_table`` is not a valid
            integer, or ``codon_start`` is outside 1..3.
    """
    # GenBank stores qualifier values as lists of strings; a missing qualifier
    # means "frame 1" / "standard genetic code" respectively.
    codon_start = int(feature.qualifiers.get("codon_start", ["1"])[0])
    if codon_start not in (1, 2, 3):
        raise ValueError(f"codon_start must be 1, 2 or 3, got {codon_start}")
    table_id = int(feature.qualifiers.get("transl_table", ["1"])[0])

    # codon_start is 1-based: skip the leading bases of a 5'-partial CDS so the
    # translation starts in the annotated reading frame.
    cds_seq = feature.extract(parent_seq)[codon_start - 1:]
    return cds_seq.translate(table=table_id, to_stop=True)


# Function to get organism name from GenBank record
def get_organism_name(genebank_file):
    """Return the organism name of the first record in a GenBank file.

    Args:
        genebank_file: Path to the GenBank file.

    Returns:
        The organism annotation with spaces replaced by underscores, or
        ``"Unknown"`` if the annotation is missing or the file cannot be read
        or parsed (the error is printed, not raised).
    """
    try:
        with open(genebank_file, "r") as gb_file:
            record = next(SeqIO.parse(gb_file, "genbank"))
        return _organism_label(record)
    except Exception as e:
        # Deliberately best-effort: a missing name must not abort the analysis.
        print(f"Error reading organism name from {genebank_file}: {e}")
        return "Unknown"


# Function to analyze amino acid frequency from a GenBank file
def analyze_amino_acid_frequency(genebank_file):
    """Compute the pooled amino-acid composition of all CDS features in a file.

    Every CDS feature of every record is translated (up to its first stop
    codon) and the proteins are concatenated before the composition is
    computed, so the result is weighted by protein length.

    Args:
        genebank_file: Path to the GenBank file.

    Returns:
        A DataFrame with an ``'Amino Acid'`` column and one column named after
        the organism of the first record, holding percentages (0-100). An
        empty DataFrame is returned when no CDS could be translated.
    """
    protein_sequences = []
    organism_name = None

    with open(genebank_file, "r") as gb_file:
        for record in SeqIO.parse(gb_file, "genbank"):
            # Take the column label from the first record while it is already
            # in hand, instead of re-opening and re-parsing the file later.
            if organism_name is None:
                organism_name = _organism_label(record)

            for feature in record.features:
                if feature.type != "CDS":
                    continue
                try:
                    protein = _translate_cds(feature, record.seq)
                    protein_sequences.append(str(protein))
                except Exception as e:
                    # One bad feature should not discard the rest of the file.
                    print(f"Error translating CDS in {genebank_file}: {e}")

    combined_sequence = "".join(protein_sequences)
    if not combined_sequence:
        print(f"No amino acid sequences found in {genebank_file}. Skipping.")
        return pd.DataFrame()

    # Analyze amino acid composition
    # NOTE(review): newer Biopython releases appear to deprecate this method in
    # favour of an `amino_acids_percent` attribute - confirm the value scale
    # against the installed version before switching.
    protein_analysis = ProtParam.ProteinAnalysis(combined_sequence)
    composition = protein_analysis.get_amino_acids_percent()

    # Convert to DataFrame
    df = pd.DataFrame(
        list(composition.items()),
        columns=["Amino Acid", organism_name],
    )
    df["Amino Acid"] = df["Amino Acid"].str.capitalize()
    df[organism_name] = df[organism_name] * 100  # Convert to percentage
    return df


# Process all GenBank files in a folder
def process_folder(folder_path, extensions=(".gb", ".gbf")):
    """Analyze every GenBank file in a folder and save each result as .xlsx.

    Args:
        folder_path: Directory to scan (not recursive).
        extensions: File-name suffixes that identify GenBank files.

    Each non-empty result is written to ``<input stem>.xlsx`` in the current
    working directory; the composition table is also printed.
    """
    suffixes = tuple(extensions)
    # Sorted so that processing order (and console output) is reproducible.
    genebank_files = sorted(
        name for name in os.listdir(folder_path) if name.endswith(suffixes)
    )

    for genebank_file in genebank_files:
        file_path = os.path.join(folder_path, genebank_file)
        df = analyze_amino_acid_frequency(file_path)

        print(f"\nOverall Amino Acid Composition for {genebank_file}:")
        print(df)

        if df.empty:
            continue

        output_file = os.path.splitext(genebank_file)[0] + ".xlsx"
        df.to_excel(output_file, index=False)
        print(f"Results saved to {output_file}")


# Set your folder path
folder_path = "/path/"

if __name__ == "__main__":
    # Guarded so that importing this module does not trigger a folder scan.
    process_folder(folder_path)