#!/usr/bin/env __author__ = 'Gordon Sun' import sys, time import format_tools as ft import seq_tools as sts import input_tools as it ''' This script reads in a file containing sequences for either protein optimization or gibson assembly primer design. Commands are input into the program in the following format: python seq_analyzer.py -if sequence_file -m (g)ibson/(p)rotoptimization -c organism_codon_file output_file speed Ex: python seq_analyzer.py -if sequence_file -m g -c ecoli -o output -t 1 An output file name is not required, as the program can also autogenerate an outputfile name. A speed is not required, the default speed is 0 which is the fastest setting. If no organism codon_file is selected, the program will automatically pick the codon frequency table for E. coli K12 The only required arguments are the input filename, the mode the program should run in (gibson assembly or protein optimization. This file cannot be run without: input_tools.py seq_tools.py format_tools.py These files contain the function libraries the main program calls. This file set also cannot be run without Codon_Lib NT_Lib restriction_enzymes These files are libraries for the program to read from and contain codon pairing maps, nucleotide pairing maps, and restriction site sequences Included with the program are the following files: ecoli ecoliK12 yeast These files are the codon frequency usage tables for the respective organisms. They are referenced from the following sources respectively: http://www.sci.sdsu.edu/~smaloy/MicrobialGenetics/topics/in-vitro-genetics/codon-usage.html http://openwetware.org/wiki/Escherichia_coli/Codon_usage http://www.kazusa.or.jp/codon/cgi-bin/showcodon.cgi?species=4932 If there are any questions and/or comments concerning the program and its usaage, the author can be reached at gsun3301@gmail.com. Thanks! ''' # run as def main(args): options = it.get_args(args) duration = options['speed'] print "> ------------------------------------------" print "> Program starting\n" time.sleep(duration) output_filename = options['output'] outputfile = open(output_filename, "w") try: print "> Reading sequence data file..." time.sleep(duration) seq_data = ft.read_table(options['input_file']) sequence_file = [] for x1 in xrange(len(seq_data)): sequence_file.append(seq_data[x1][0]) it.write_n_print("> Sequence data read successfully\n", outputfile) time.sleep(duration) except IndexERROR: it.write_n_print("> ERROR: Sequence data file name not provided, process exiting\n", outputfile) outputfile.close() it.quit_flush(outputfile) try: print "> Reading organism codon frequency data file..." time.sleep(duration) codon_frequencies = ft.process_org_codon_table(options['organism_codon_filename']) it.write_n_print("> Organism codon frequency file read successfully\n", outputfile) time.sleep(duration) except IndexERROR: it.write_n_print("> ERROR: Organism codon frequency file not provided, process exiting\n", outputfile) outputfile.close() it.quit_flush(outputfile) if not sts.find_all_instances(options['mode'], "gp"): print "> ERROR: Mode not recognized: argument should be one of the following:" \ "\n> (g)ibson" \ "\n> (p)rotein optimization" sys.exit() else: mode = options['mode'] if mode == 'i': it.write_n_print("> Mode selected: (i)nfusion\n", outputfile) elif mode == 'p': it.write_n_print("> Mode selected: (p)rotein optimization\n", outputfile) else: it.write_n_print("> Mode selected: (g)ibson assembly\n", outputfile) time.sleep(duration) it.write_n_print("> Files & libraries read successfully\n", outputfile) it.write_n_print("> ------------------------------------------\n", outputfile) time.sleep(duration) it.write_n_print("> Organism used is " + str(options['organism_codon_filename']) + "\n", outputfile) time.sleep(duration) if_OK = True for seq, item in enumerate(sequence_file): if sts.DNAorProt(sequence_file[seq]) == 0: it.write_n_print("> ERROR: Sequence " + str(seq) + " is invalid, not recognized as prot or DNA seq\n", outputfile) if_OK = False if not if_OK: it.write_n_print("> ERROR: Sequences not recognized, process terminating\n***END", outputfile) it.quit_flush(outputfile) else: it.write_n_print("> All sequences are OK (nt/AA)\n", outputfile) it.write_n_print("> ====================================================================================\n", outputfile) time.sleep(duration) if options['mode'] == 'p': valid = True it.write_n_print("> Protein sequence optimization starting...\n", outputfile) it.write_n_print( "> NOTE: For protein optimization, the sequences file can contain either nucleotide or single letter\n" "\tamino acid sequences. Each sequence should be on a separate line, blank newline entries in the \n" "\tfile will result in a read error. The file should contain no other characters than the ones for a\n" "\tsequence. You can have the protein sequences in either amino acid or nucleotide sequence, and \n" "\tsequences in the file are not limited to only one format. The program will automatically translate\n" "\tDNA and/or Protein sequences into a preliminary optimized DNA sequence, which will subsequently be\n" "\tfurther optimized. If your file is not formatted as such, please press [Ctrl]-C to stop the program \n" "\tand reformat your file.\n", outputfile) time.sleep(duration * 5) seq_data_dna = [] for d, item in enumerate(seq_data): sequence = ft.array2str(item).lower() seq_type = sts.DNAorProt(sequence) dnatranslation = '' if seq_type != 0 and seq_type < 3: if seq_type == 1: prottranslation = sts.translator(sequence) dnatranslation = ''.join([row[0] for row in sts.rev_translator(prottranslation, codon_frequencies)]) else: dnatranslation = ''.join([row[0] for row in sts.rev_translator(sequence, codon_frequencies)]) seq_data_dna.append(dnatranslation) else: valid = False if not valid: it.write_n_print("> ------------------------------------------\n", outputfile) it.write_n_print("> ERROR: Protein DNA sequences not of proper length, process terminating\n***END", outputfile) it.quit_flush(outputfile) else: for s, item in enumerate(seq_data_dna): seq_item = item it.write_n_print("> ==========================================\n", outputfile) it.write_n_print("> Sequence " + str(s) + " is (DNA form):\n", outputfile) it.write_n_print("\t" + seq_item + "\n", outputfile) time.sleep(duration) it.write_n_print(" Translation (initial): \n", outputfile) it.write_n_print("\t" + sts.translator(seq_item) + "\n", outputfile) time.sleep(duration) rx_sites = sts.find_restriction_sites(seq_item) rx_sites = [sublist for sublist in rx_sites if sublist[2]] if len(rx_sites) == 0: it.write_n_print(" No restriction sites found, only codon optimization needed\n", outputfile) else: it.write_n_print(" Restriction Sites found at following indices to be removed: \n", outputfile) for f, item in enumerate(rx_sites): it.write_n_print("\t", outputfile) it.write_n_print(item, outputfile) it.write_n_print("\n", outputfile) time.sleep(duration) it.write_n_print(" Codon optimized and and (-) restriction site sequence:\n", outputfile) TAADAAA = sts.prot_optimization(seq_item, codon_frequencies) it.write_n_print("\t" + TAADAAA + "\n", outputfile) time.sleep(duration) if sts.translator(seq_item) == sts.translator(TAADAAA): it.write_n_print(" Translation (verification): -good optimization\n", outputfile) it.write_n_print("\t" + sts.translator(TAADAAA) + "\n", outputfile) else: it.write_n_print(" Translation (verification): -bad optimization, mismatch\n", outputfile) it.write_n_print("\t" + sts.translator(TAADAAA) + "\n", outputfile) it.write_n_print(" Optimization complete\n", outputfile) it.write_n_print("> ==========================================\n", outputfile) it.write_n_print("> QuitFlush completed, Process Completed.\n", outputfile) it.quit_flush(outputfile) elif options['mode'] == 'g': it.write_n_print("> Gibson Assembly operation starting...\n", outputfile) it.write_n_print( "> NOTE: For Gibson assembly, the sequence file should contain ONLY DNA sequences. The first sequence\n" "\tin the file should contain the 3' end of the backbone that the rest of the assembly is to connect\n" "\tto in 5'-3' order. The last sequence in the file should contain the 5' end of the backbone that the\n" "\tbackend of the assembly should connect to in 5'-3' order. Note the first and last sequence should\n" "\tbe at least 100 nucleotides long so the algorithm can calculate the primer extensions.\n\n" "\tThe middle sequences should include the fragments you want to assemble together.Each fragment should\n" "\talso be at minimum 50nt long\n\n" "> IMPORTANT NOTE: DO NOT INCLUDE sequence homologies and overlaps in the assembly fragments, \n" "\tthese are auto-generated to optimize for uniformity of melting temperature. Keep in mind multiple\n" "\tfragment assemblies have a lower likelihood of success with more fragments.\n\n" "\tEach sequence should be on a separate line, blank newline entries in the file will result in a \n" "\tread error. The file should contain no other characters than the ones for a DNA sequence. \n" "\tIf your file is not formatted as such, please press [Ctrl]-C to stop the program and reformat your \n" "\tfile.\n", outputfile) it.write_n_print("> ==========================================\n", outputfile) time.sleep(duration * 5) seq_strings = [] for x, item in enumerate(seq_data): seq_strings.append(ft.array2str(item)) junctions_array = [] for y in xrange(len(seq_strings) - 1): junctions_array.append(sts.designate_overlap(seq_strings[y], seq_strings[y + 1])) while True: Melting_temperature = raw_input("What is the desired Melting temperature for Gibson primers(40-100C)?\n" "Recommended Gibson Temperature is 48C from NEB\n\n>") try: Melting_temperature = int(Melting_temperature) except ValueError: print "Please enter a reasonable temperature (integer).\n" time.sleep(duration) continue if 40 < Melting_temperature < 100: break else: print "Please enter a melting temperature between 40 and 100C.\n" time.sleep(duration) while True: salt_conc = raw_input("What is the salt concentration for the Gibson reaction in M?\n" "Common concentration is 0.05M, or 50mM\n\n>") try: salt_conc = float(salt_conc) break except ValueError: print "Please enter a reasonable concentration (float).\n" time.sleep(duration) continue it.write_n_print("> Melting Temperature: " + str(Melting_temperature) + "C\n", outputfile) it.write_n_print("> Salt Concentration : " + str(salt_conc) + "M == " + str(salt_conc * 1000) + "mM\n", outputfile) it.write_n_print("> ==========================================\n", outputfile) it.write_n_print("> Front Backbone sequence 5'->3' (3' accessible): " + str(len(seq_strings[0])) + " nt long\n", outputfile) it.write_n_print("\t" + seq_strings[0] + "\n", outputfile) for item3 in seq_strings[1:len(seq_strings) - 1]: it.write_n_print("> Fragment " + str(seq_strings.index(item3)) + ": " + str(len(item3)) + " nt long\n", outputfile) it.write_n_print("\t" + seq_strings[seq_strings.index(item3)] + "\n", outputfile) it.write_n_print("> Rear Backbone sequence 5'->3' (5' accessible): " + str( len(seq_strings[len(seq_strings) - 1])) + " nt long\n", outputfile) it.write_n_print("\t" + seq_strings[len(seq_strings) - 1] + "\n", outputfile) it.write_n_print("> ==========================================\n", outputfile) overlap_region = [] for x, junction in enumerate(junctions_array): overlap_region.append(sts.pick_overlap(junction, Melting_temperature, salt_conc)) it.write_n_print("> Front Backbone + Fragment 1 overlap:\n\t" + str(overlap_region[0]) + "\n", outputfile) for item4 in overlap_region[1:len(overlap_region) - 1]: it.write_n_print("> Fragment " + str(overlap_region.index(item4)) + " + Fragment " + str( overlap_region.index(item4) + 1) + " overlap:\n\t" + str(item4) + "\n", outputfile) it.write_n_print("> Fragment " + str(len(overlap_region) - 1) + " + Rear Backbone overlap:\n\t" + str( overlap_region[len(overlap_region) - 1]) + "\n", outputfile) it.write_n_print("> ==========================================\n", outputfile) it.write_n_print("> The following overlaps were determined between the fragments for optimizing T_m.\n" "> They are listed in the format [overlap region, overlap with * marking junction, T_m of fragment.\n", outputfile) for item5 in overlap_region: it.write_n_print("\t" + str(item5) + "\n", outputfile) it.write_n_print("> ==========================================\n", outputfile) it.write_n_print("> Extending primer ends and determining reverse primer sequence...\n", outputfile) time.sleep(duration) it.write_n_print("> ==========================================\n", outputfile) it.write_n_print("> Primers listed below are in 5'->3' direction.\n\n", outputfile) target_junction = [] for x, item5 in enumerate(overlap_region): target_junction.append(item5[1]) fwd_primers = [] rev_primers = [] for x2, item6 in enumerate(target_junction): fwd_primers.append(sts.fwd_primer_extension(junctions_array[x2], item6, Melting_temperature, salt_conc)) rev_primers.append(sts.rev_primer_extension(junctions_array[x2], item6, Melting_temperature, salt_conc)) it.write_n_print("> Front Backbone + Fragment 1 primers:\n", outputfile) it.write_n_print("\tForward Primer (Tm = " + str(fwd_primers[1][1]) + " C):\n", outputfile) it.write_n_print("\t\t" + fwd_primers[1][0] + "\n", outputfile) it.write_n_print("\tReverse Primer (Tm = " + str(rev_primers[1][1]) + " C):\n", outputfile) it.write_n_print("\t\t" + rev_primers[1][0] + "\n", outputfile) for x7 in xrange(len(fwd_primers) - 2): it.write_n_print("> Fragment " + str(x7 + 1) + " + Fragment " + str(x7 + 2) + " primers:\n", outputfile) it.write_n_print("\tForward Primer (Tm = " + str(fwd_primers[x7][1]) + " C):\n", outputfile) it.write_n_print("\t\t" + fwd_primers[x7][0] + "\n", outputfile) it.write_n_print("\tReverse Primer (Tm = " + str(rev_primers[x7][1]) + " C):\n", outputfile) it.write_n_print("\t\t" + rev_primers[x7][0] + "\n", outputfile) it.write_n_print("> Fragment " + str(len(fwd_primers) - 1) + " + Rear Backbone primers:\n", outputfile) it.write_n_print("\tForward Primer (Tm = " + str(fwd_primers[len(fwd_primers) - 1][1]) + " C):\n", outputfile) it.write_n_print("\t\t" + fwd_primers[len(fwd_primers) - 1][0] + "\n", outputfile) it.write_n_print("\tReverse Primer (Tm = " + str(rev_primers[len(rev_primers) - 1][1]) + " C):\n", outputfile) it.write_n_print("\t\t" + rev_primers[len(rev_primers) - 1][0] + "\n", outputfile) temp_range = [] for x, item in enumerate(fwd_primers): temp_range.append(item[1]) temp_range.append(rev_primers[x][1]) it.write_n_print("> ==========================================\n", outputfile) it.write_n_print("> Temperature spread is : [" + str(min(temp_range)) + ", " + str(max(temp_range)) + "]\n", outputfile) it.write_n_print("> Temperature range is : " + str(max(temp_range) - min(temp_range)) + "\n", outputfile) it.write_n_print("> ==========================================\n", outputfile) it.write_n_print("> QuitFlush completed, Process Completed.\n", outputfile) it.quit_flush(outputfile) else: it.write_n_print("YOU BROKE THE CODE", outputfile) if __name__ == '__main__': main(sys.argv)