#=======================================================================================================================# # ratepartitions.py # # Written by: T Malm - 20120922 # # Script to read a TIGER rate file (or other rate files) and produce partitions from the rates usable for analyses. # # # # Currently partition sizes are calculated as: # # The first partition is calculated as highest_rate - ((highest_rate - minimum_rate)/(divfactor)), the remaining bins as: # # higher_rate - ((higher_rate - lower_rate)/divfactor+partition_number*0.3) - leading to smaller bin rate spans with # # faster rates (=lower rates), the last partition will be created when 10% or less characters are left. # # # # run the script from terminal with added rate-file after as well as wanted divfactor and it will produce # # a new output file with partition rate summaries and parition schemes for MrBayes and PHYLIP. # # # # The rate file should be a file with rates separated by hard return (TIGER rate output -rl command) and in order of # # the character alignment you want to use later. # # # # Run as: "python ratepartitions.py X Y" where X is the input rate file and Y for division factor (e.g. 1.35) # # # # Output file will be named after the input values: inputfilename_divfactor.txt # #=======================================================================================================================# import sys; import re; import string; import decimal; # opening and reading file if len(sys.argv) != 3: print "You need to run the script like this: >python py-file.py rate-file.nex 2.5"; sys.exit(0) infile = sys.argv[1]; # setting division factor divnum = sys.argv[2]; divnum = float(divnum) if not (isinstance(divnum, float)): print "1-You need to enter factor for division as positiv numerical value ( >1.5 ) as 2nd input: e.g. py-file.py rate-file.nex 2.5"; sys.exit(0) if (divnum<1.1): print "2-You need to enter factor for division as positiv numerical value ( >=1.1 ) as 2nd input: e.g. py-file.py rate-file.nex 2.5"; sys.exit(0) # opening file f = open(sys.argv[1], "r"); lines = f.readlines(); f.close() #reading rates lines into List List=[] i=1 for Line in lines: LineX = Line.strip() if (LineX != ""): List.append(float(LineX)) i+=1 maxL = max(List); minL = min(List); numchars = len(List) print "\nTotal data - ", numchars, " sites.\n" print "Slowest rate: ", maxL print "Fastest rate: ", minL spread = maxL - minL print "Rate spread: ", spread, "\n" # setting values for partitioning upperVal=maxL BinsLists = [] t=0; #total partitioned sites count cutoff = numchars*0.10 # cutoff value for creating last partition output_phy = ["PHYLIP style"] output_mrb = ["MrBayes style\nbegin mrbayes;"] oi = ["Partition output from ratepartitions.py\n--Written by Tobias Malm (20121130)\n\nFor rate file: ", infile, " with ", str(numchars), " sites!"] oi=''.join(oi) output_info=[oi,''] oi = ["Manually set dividing factor: ", str(divnum)] oi=''.join(oi) output_info.append(oi) oi = ["Partitions calculated according to:\n\t1st partition: highest rate - ((highest rate - minimum_rate)/(", str(divnum),")),\n\tthe remaining as lower boundary rate= upper boundary rate -((upper boundary rate - minimum rate)/(", str(divnum),"+Partitionnumber*0.3)).\n"] oi.append("\tLast partition created when less than 10% of total characters are left (=") oi.append(str(cutoff)) oi.append(" characters).") oi=''.join(oi) output_info.append(oi) output_info.append('') oi = ["Rate spread of entire data set (Highest (slowest, 1=invariant) to lowest (fastest) ): Highest: ", str(maxL), ", lowest: ", str(minL), ", spread: ", str(spread)] oi=''.join(oi) output_info.append(oi) nBins = 0 for b in range (1,100): Ltest = numchars-t if (Ltest <= cutoff): #for last partition to include all the rest lowerVal = minL BinL=[] i=1 for n in List: if (upperVal>n>=lowerVal): BinL.append(i); i+=1 nBins+=1 elif (b == 1): # for first partition lowerVal = upperVal-((upperVal-minL)/(divnum)) BinL=[] i=1 for n in List: if (upperVal>=n>lowerVal): BinL.append(i); i+=1 nBins+=1 else: # for all other partitions than the last lowerVal = upperVal-((upperVal-minL)/(divnum+b*0.3)) BinL=[] i=1 for n in List: if (upperVal>=n>lowerVal): BinL.append(i); i+=1 nBins+=1 t+=len(BinL) #for total site count #info for output in file and screen oi = ["Partition_", str(b), "(", str(len(BinL))," sites): Rate-span: ", str(upperVal), "-", str(lowerVal)]#, BinL oi=''.join(oi) output_info.append(oi) output_info.append('') pout = ["Partition_",str(b), " (", str(len(BinL))," sites): "] pout = ''.join(pout) print pout pout = ["Rate-span: ",str("{0:.5f}".format(upperVal)), "-",str("{0:.5f}".format(lowerVal)),"\n"] pout = ''.join(pout) print pout if (len(BinL)>0): # setting the output for phylip partitions charset = str(BinL) replace_list = ['[',']',' ','\''] for rl in replace_list: charset = charset.replace(rl,'') output = ["DNA, Partition_", str(b), " = ", charset] output = ''.join(output) for rl in replace_list: output = output.replace(rl,'') output_phy.append(output) # setting the output format for charsets as MrBayes partitions charset = str(BinL) replace_list = [',','[',']',' ','\''] for rl in replace_list: charset = charset.replace(rl,'') output = ["Charset Partition_", str(b), " = ", charset, ";"] output = ''.join(output) for rl in replace_list: output = output.replace(rl,'') output_mrb.append(output) upperVal = lowerVal #resetting the upper range value to the current lower value (for next bin) #breaking loop on last partition if (Ltest <= cutoff): break #more info if (t != numchars): print "Total sites paritioned is not identical to imported sites!:", t, " vs ", numchars oi = ["Total sites paritioned is not identical to imported sites!:", str(t)," vs ", str(numchars)] oi = ''.join(oi) output_info.append(oi) #fixing partition finishing for output #mrb partitioning listB = [] for b in range (1,nBins+1): bapp = ["Partition_", str(b)] bapp = ''.join(bapp) listB.append(bapp) listB = ', '.join(listB) out_finish = ["partition Partitions = ", str(nBins), ": ", listB, ";"] out_finish = ''.join(out_finish) output_mrb.append(out_finish) output_mrb.append("set partition = Partitions;") #collecting outputs output_fin1 = '\n'.join(output_info) output_fin2 = '\n'.join(output_mrb) output_fin3 = '\n'.join(output_phy) output_finished = [output_fin1,output_fin2,output_fin3] output_finished = '\n\n\n'.join(output_finished) #opening outfile and writing to it outfile = [infile, "_", str(divnum), ".txt"] outfile = ''.join(outfile) print "Output file with rate partition summary and MrBayes and PHYLIP partition schemes has been created: ", outfile outwrite = open(outfile, "w"); #open output file outwrite.write(output_finished)