import openpyxl import os def count_substitutions(fasta_file): # Load the FASTA sequences sequences = [] with open(fasta_file) as file: sequence = '' for line in file: line = line.strip() if line.startswith('>'): if sequence: sequences.append(sequence) sequence = '' else: sequence += line sequences.append(sequence) # Initialize substitution counters and position list substitutions = { 'A_to_G': 0, 'G_to_A': 0, 'T_to_G': 0, 'G_to_T': 0, 'A_to_C': 0, 'C_to_A': 0, 'A_to_G': 0, 'G_to_A': 0, 'C_to_T': 0, 'T_to_C': 0, 'G_to_C': 0, 'C_to_G': 0, 'A_to_T': 0, 'T_to_A': 0 } substitution_positions = { 'A_to_G': [], 'G_to_A': [], 'T_to_G': [], 'G_to_T': [], 'A_to_C': [], 'C_to_A': [], 'A_to_G': [], 'G_to_A': [], 'C_to_T': [], 'T_to_C': [], 'G_to_C': [], 'C_to_G': [], 'A_to_T': [], 'T_to_A': [] } # Iterate through the sequences and count substitutions for i in range(len(sequences[0])): ref_base = sequences[0][i] alt_base = sequences[1][i] substitution = f"{ref_base}_to_{alt_base}" if substitution in substitutions: substitutions[substitution] += 1 substitution_positions[substitution].append(i + 1) # Calculate the sum of each type of substitution substitution_sums = {substitution: sum(substitution_positions[substitution]) for substitution in substitutions} # Calculate the sum of Ts and Tv substitutions ts_substitutions = ['A_to_G', 'G_to_A', 'C_to_T', 'T_to_C'] tv_substitutions = ['T_to_G', 'G_to_T', 'A_to_C', 'C_to_A', 'G_to_C', 'C_to_G', 'A_to_T', 'T_to_A'] ts_count = sum(substitutions[substitution] for substitution in ts_substitutions) tv_count = sum(substitutions[substitution] for substitution in tv_substitutions) # Calculate the Ts/Tv ratio ts_tv_ratio = ts_count / tv_count return substitutions, substitution_sums, substitution_positions, ts_count, tv_count, ts_tv_ratio # Specify the folder path where your .fasta files are located folder_path = '/path/to/your/folder' # Iterate through all .fasta files in the folder for filename in os.listdir(folder_path): if filename.endswith(".fasta"): fasta_file = os.path.join(folder_path, filename) substitutions, substitution_sums, substitution_positions, ts_count, tv_count, ts_tv_ratio = count_substitutions(fasta_file) # Create a new Excel workbook and sheet workbook = openpyxl.Workbook() sheet = workbook.active # Set column headers headers = ['Substitution', 'Count', 'Sum', 'Positions'] sheet.append(headers) # Add data rows to the sheet for substitution in substitutions: count = substitutions[substitution] sum_value = substitution_sums[substitution] positions = substitution_positions[substitution] positions_str = ', '.join(map(str, positions)) # Convert position list to a string data_row = [substitution, count, sum_value, positions_str] sheet.append(data_row) # Update the 'Substitution' and 'Count' columns for Ts, Tv, and the ratio sheet['A14'] = 'Ts' sheet['B14'] = ts_count sheet['A15'] = 'Tv' sheet['B15'] = tv_count sheet['A16'] = 'Ts/Tv' sheet['B16'] = ts_tv_ratio # Save the workbook as an Excel file with the same name as the .fasta file output_file = os.path.splitext(filename)[0] + '.xlsx' workbook.save(output_file) print("Substitutions file generated:", output_file)