# Supplemental Data 6: The Python script used in extracting the coding
# sequences from the genome.
"""Extract protein-coding genes from a GenBank file and concatenate them.

Two files are produced:

* ``all_genes.fasta`` -- one record per uniquely named CDS feature.
* ``concatenated.fasta`` -- all of those sequences joined into one record,
  with in-frame stop codons removed from every gene.
"""

from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq  # noqa: F401  (retained from the original script; unused here)

# Stop codons of the standard genetic code (NCBI translation table 1).
# NOTE(review): organellar codes differ (e.g. TGA is not a stop codon in the
# vertebrate mitochondrial code, table 2) -- confirm this set suits the input.
STANDARD_STOP_CODONS = ("TAA", "TAG", "TGA")


def remove_stop_codons(sequence, stop_codons=STANDARD_STOP_CODONS):
    """Return *sequence* with every in-frame stop codon removed.

    The sequence is read as consecutive, non-overlapping codons starting at
    its first base, and only whole codons that equal a stop codon are
    dropped.  A plain ``str.replace`` must not be used for this: it deletes
    the triplet wherever the three letters occur, including across codon
    boundaries (``"ATAGCC"`` contains ``"TAG"`` out of frame), which removes
    real coding bases and shifts the reading frame of everything downstream.

    Args:
        sequence: Nucleotide sequence; anything ``str()`` turns into the
            bases (a plain string or a Biopython ``Seq``).
        stop_codons: Iterable of three-letter stop codons.  Matching is
            case-insensitive.  Defaults to the standard genetic code.

    Returns:
        The sequence as a ``str`` without the in-frame stop codons.  A
        trailing incomplete codon (one or two bases) is kept unchanged.
    """
    sequence = str(sequence)
    stops = {codon.upper() for codon in stop_codons}
    codons = (sequence[i:i + 3] for i in range(0, len(sequence), 3))
    return "".join(codon for codon in codons if codon.upper() not in stops)


def extract_and_save_protein_coding_genes(genbank_file, output_fasta):
    """Collect uniquely named CDS sequences and write them as FASTA.

    Every record in *genbank_file* is scanned for ``CDS`` features.  Features
    without a ``gene`` qualifier are skipped, and when a gene name occurs
    more than once only its first occurrence is kept.

    Args:
        genbank_file: Path (or handle) of the GenBank input.
        output_fasta: Path (or handle) the FASTA records are written to.

    Returns:
        The list of ``SeqRecord`` objects that were written, in the order
        the genes were first encountered.
    """
    seen_genes = set()
    gene_records = []
    for record in SeqIO.parse(genbank_file, "genbank"):
        for feature in record.features:
            if feature.type != "CDS" or "gene" not in feature.qualifiers:
                continue
            gene_name = feature.qualifiers["gene"][0]
            if gene_name in seen_genes:
                continue
            seen_genes.add(gene_name)
            gene_seq = feature.location.extract(record).seq
            gene_records.append(SeqRecord(gene_seq, id=gene_name, description=""))
    SeqIO.write(gene_records, output_fasta, "fasta")
    return gene_records


def main():
    """Run the extraction and concatenation with the script's fixed paths."""
    # Provide the path to your GenBank file
    genbank_file = "14.gb"

    # Output file for all extracted genes
    all_genes_output_fasta = "all_genes.fasta"

    # Extract and save protein-coding genes to all_genes.fasta
    gene_records = extract_and_save_protein_coding_genes(
        genbank_file, all_genes_output_fasta
    )

    # Strip stop codons gene by gene *before* joining: each gene is then read
    # in its own frame, so a gene whose length is not a multiple of three
    # cannot shift the frame of the genes that follow it, and no triplet
    # spanning the junction of two genes can be mistaken for a stop codon.
    concatenated_sequence_no_stop = "".join(
        remove_stop_codons(record.seq) for record in gene_records
    )

    # Write the concatenated sequence without stop codons to a file
    concatenated_output_fasta = "concatenated.fasta"
    with open(concatenated_output_fasta, "w") as output_handle:
        output_handle.write(">Concatenated_Protein_Coding_Sequences\n")
        output_handle.write(concatenated_sequence_no_stop)
        # Terminate the final line so the result is a well-formed text file.
        output_handle.write("\n")

    print("All extracted protein-coding genes have been written to",
          all_genes_output_fasta)
    print("Concatenated nucleotide sequences of protein-coding genes with stop codons removed have been written to", concatenated_output_fasta)


if __name__ == "__main__":
    main()