@@ -96,7 +96,7 @@ def __init__(self,prog):
9696 lib .copyDirectory (os .path .join (parentdir , 'html_template' , 'css' ), os .path .join (args .out , 'css' ))
9797if not os .path .isdir (os .path .join (args .out , 'js' )):
9898 lib .copyDirectory (os .path .join (parentdir , 'html_template' , 'js' ), os .path .join (args .out , 'js' ))
99-
99+
100100#loop through each genome
101101stats = []
102102merops = []
@@ -106,6 +106,7 @@ def __init__(self,prog):
106106eggnog = []
107107busco = []
108108gbkfilenames = []
109+ scinames = []
109110num_input = len (args .input )
110111if num_input == 0 :
111112 lib .log .error ("Error, you did not specify an input, -i" )
@@ -142,22 +143,24 @@ def __init__(self,prog):
142143 lib .parseGOterms (GBK , go_folder , stats [i ][0 ].replace (' ' , '_' ))
143144 lib .gb2proteinortho (GBK , protortho , stats [i ][0 ].replace (' ' , '_' ))
144145 eggnog .append (lib .getEggNogfromNote (GBK ))
145-
146- #convert eggnog to a single dictionary for lookup later
147- EGGNOG = { k : v for d in eggnog for k , v in d .items () }
146+ scinames .append (stats [i ][0 ].replace (' ' , '_' ))
148147
149148#convert busco to dictionary
150- busco = lib .dictFlip (busco )
149+ busco = lib .busco_dictFlip (busco )
151150
152151#add species names to pandas table
153152names = []
154153for i in stats :
155154 sci_name = i [0 ]
156- genus = sci_name .split (' ' )[0 ]
157- species = ' ' .join (sci_name .split (' ' )[1 :])
158- abbrev = genus [:1 ] + '.'
159- final_name = abbrev + ' ' + species
160- names .append (final_name )
155+ if '_' in sci_name : #here I'm assuming that somebody used an abbreviated name and an underscore, this would be atypical I think
156+ names .append (sci_name )
157+ else :
158+ genus = sci_name .split (' ' )[0 ]
159+ species = ' ' .join (sci_name .split (' ' )[1 :])
160+ abbrev = genus [:1 ] + '.'
161+ final_name = abbrev + ' ' + species
162+ names .append (final_name )
163+
161164
162165#PFAM#############################################
163166lib .log .info ("Summarizing PFAM domain results" )
@@ -170,8 +173,11 @@ def __init__(self,prog):
170173pfamdf ['species' ] = names
171174pfamdf .set_index ('species' , inplace = True )
172175
176+ #remove any "empty" genomes
177+ pfamdf = pfamdf [(pfamdf .T != 0 ).any ()]
178+
173179#make an nmds
174- if len (args . input ) > 1 :
180+ if len (pfamdf . index ) > 1 : #make sure number of species is at least two
175181 lib .distance2mds (pfamdf , 'braycurtis' , 'PFAM' , os .path .join (args .out , 'pfam' ,'PFAM.nmds.pdf' ))
176182
177183#get the PFAM descriptions
@@ -195,7 +201,7 @@ def __init__(self,prog):
195201 output .write (lib .FOOTER )
196202
197203##################################################
198-
204+
199205####InterProScan##################################
200206lib .log .info ("Summarizing InterProScan results" )
201207if not os .path .isdir (os .path .join (args .out , 'interpro' )):
@@ -207,14 +213,21 @@ def __init__(self,prog):
207213IPRdf ['species' ] = names
208214IPRdf .set_index ('species' , inplace = True )
209215
216+ #sanity check: if a genome is missing (i.e. all of its counts are zero), drop it
217+ #print IPRdf
218+ #print len(IPRdf.columns)
219+ IPRdf = IPRdf [(IPRdf .T != 0 ).any ()]
220+ #print len(IPRdf.index)
221+
210222#analysis of InterPro Domains
211223#get IPR descriptions
224+ lib .log .info ("Loading InterPro descriptions" )
212225INTERPRO = lib .iprxml2dict (os .path .join (parentdir , 'DB' , 'interpro.xml' ))
213226#NMDS
214- if len (args . input ) > 1 :
215- if len (IPRdf .columns ) > 1 :
227+ if len (IPRdf . index ) > 1 : #count number of species
228+ if len (IPRdf .columns ) > 1 : #count number of IPR domains
216229 lib .distance2mds (IPRdf , 'braycurtis' , 'InterProScan' , os .path .join (args .out , 'interpro' , 'InterProScan.nmds.pdf' ))
217-
230+
218231 #write to csv file
219232 ipr2 = IPRdf .transpose ()
220233 ipr_desc = []
@@ -232,12 +245,12 @@ def __init__(self,prog):
232245 output .write (lib .HEADER )
233246 output .write (lib .INTERPRO )
234247 if len (IPRdf .columns ) > 1 :
235- output .write (ipr2 .to_html (index = False , escape = False , classes = 'table table-hover' ))
248+ if len (IPRdf .index ) > 1 :
249+ output .write (ipr2 .to_html (index = False , escape = False , classes = 'table table-hover' ))
236250 output .write (lib .FOOTER )
237251
238252##############################################
239253
240-
241254####MEROPS################################
242255lib .log .info ("Summarizing MEROPS protease results" )
243256if not os .path .isdir (os .path .join (args .out , 'merops' )):
@@ -388,7 +401,6 @@ def __init__(self,prog):
388401 output .write (lib .FOOTER )
389402########################################################
390403
391-
392404####GO Terms, GO enrichment############################
393405if not os .path .isdir (os .path .join (args .out , 'go_enrichment' )):
394406 os .makedirs (os .path .join (args .out , 'go_enrichment' ))
@@ -444,7 +456,7 @@ def __init__(self,prog):
444456 else :
445457 output .write ('<table border="1" class="dataframe table table-hover">\n <th>No enrichment found</th></table>' )
446458 output .write (lib .FOOTER )
447-
459+
448460####################################################
449461
450462##ProteinOrtho################################
@@ -455,78 +467,126 @@ def __init__(self,prog):
455467 lib .log .info ("Running orthologous clustering tool, ProteinOrtho5. This may take awhile..." )
456468 #setup protein ortho inputs, some are a bit strange in the sense that they use equals signs
457469 log = os .path .join (protortho , 'proteinortho.log' )
458- #get list of files in folder
470+
471+ #generate list of files based on input order for consistency
459472 filelist = []
460- for file in os .listdir (protortho ):
461- if file .endswith ('.faa' ):
462- filelist .append (file )
473+ for i in stats :
474+ name = i [0 ].replace (' ' , '_' )
475+ name = name + '.faa'
476+ filelist .append (name )
463477 fileinput = ' ' .join (filelist )
478+ #print fileinput
464479 cmd = ['proteinortho5.pl' , '-project=funannotate' , '-synteny' , '-cpus=' + str (args .cpus ), '-singles' , '-selfblast' ]
465480 cmd2 = cmd + filelist
466481 if not os .path .isfile (os .path .join (args .out , 'protortho' , 'funannotate.poff' )):
467482 with open (log , 'w' ) as logfile :
468483 subprocess .call (cmd2 , cwd = protortho , stderr = logfile , stdout = logfile )
469484
470- #now process the output, get # of singletons per genome, total orthologs, single-copy orthologs and append to stats, output text file with groups
485+ #open poff in pandas to parse "easier" for stats, orthologs, etc
486+ df = pd .read_csv (os .path .join (args .out , 'protortho' , 'funannotate.poff' ), sep = '\t ' , header = 0 )
487+ df .rename (columns = lambda x : x .replace ('.faa' , '' ), inplace = True )
488+ #reorder table so it matches up with the busco list of dicts
489+ newhead = [df .columns .values [0 ], df .columns .values [1 ], df .columns .values [2 ]]
490+ newhead += scinames
491+ df = df [newhead ]
492+ #write to file (not sure I need this now?)
493+ #df.to_csv(os.path.join(args.out, 'protortho', 'funannotate_reorder.poff'), sep='\t', index=False)
494+ #now filter table to only single copy orthologs to use with phylogeny
495+ num_species = len (df .columns ) - 3
496+ sco = df [(df ['# Species' ] == num_species ) & (df ['Genes' ] == num_species )]
497+ sco_hits = sco .drop (sco .columns [0 :3 ], axis = 1 )
498+ #now cross reference with busco, as we want this for phylogeny
499+ keep = []
500+ sc_buscos = []
501+ for index , row in sco_hits .iterrows ():
502+ busco_check = []
503+ for i in range (0 , num_species ):
504+ if row [i ] in busco [i ]:
505+ busco_check .append (busco [i ].get (row [i ]))
506+ busco_check = lib .flatten (busco_check )
507+ #need to check if outgroup is passed and this model exists in that outgroup
508+ if len (set (busco_check )) == 1 :
509+ if args .outgroup :
510+ available_busco = []
511+ with open (outgroup_species , 'rU' ) as outfasta :
512+ for line in outfasta :
513+ if line .startswith ('>' ):
514+ line = line .replace ('\n ' , '' )
515+ name = line .replace ('>' , '' )
516+ available_busco .append (name )
517+ if busco_check [0 ] in available_busco :
518+ keep .append (index )
519+ sc_buscos .append (busco_check [0 ])
520+ else :
521+ keep .append (index )
522+ sco_final = sco_hits .ix [keep ]
523+
524+ #take dataframe and output the ortholog table.
525+ dftrim = df .drop (df .columns [0 :3 ], axis = 1 ) #trim down to just gene models
526+ orthdf = df [(df ['# Species' ] > 1 )] #get rid of singletons in this dataset
527+ orth_hits = orthdf .drop (orthdf .columns [0 :3 ], axis = 1 ) #trim to just gene models
528+
471529 orthologs = os .path .join (args .out , 'annotations' ,'orthology_groups.txt' )
472530 with open (orthologs , 'w' ) as output :
473- with open (os .path .join (args .out , 'protortho' , 'funannotate.poff' ), 'rU' ) as input :
474- count = 0
475- scoCount = 0
476- for line in input :
477- line = line .replace ('\n ' , '' ) #strip line ending
478- if line .startswith ('#' ):
479- header = line
480- species = header .split ('\t ' )[3 :]
481- num_species = header .count ('\t ' ) - 2
482- continue
483- col = re .split (r'[,\t]' , line )
484- if col [0 ] != '1' :
485- count += 1
486- ID = 'orth' + str (count )
487- prots = col [3 :]
488- prots = [x for x in prots if x != '*' ]
489- eggs = []
490- buscos = []
491- for i in prots :
492- hit = EGGNOG .get (i )
493- if not hit in eggs :
494- eggs .append (hit )
495- hit2 = busco .get (i )
496- if not hit2 in buscos :
497- buscos .append (hit2 )
498- eggs = [x for x in eggs if x is not None ]
499- buscos = [x for x in buscos if x is not None ]
500- buscos = lib .flatten (buscos )
501- if len (eggs ) > 0 :
502- eggs = ', ' .join (str (v ) for v in eggs )
503- else :
504- eggs = 'None'
505- if len (buscos ) > 0 :
506- buscos = set (buscos )
507- buscos = ', ' .join (str (v ) for v in buscos )
508- else :
509- buscos = 'None'
510- if col [0 ] == str (num_species ) and col [1 ] == str (num_species ):
511- scoCount += 1
512- output .write ("%s\t %s\t %s\t %s\n " % (ID , eggs , buscos , ', ' .join (prots )))
531+ #should be able to parse the pandas ortho dataframe now
532+ for index , row in orth_hits .iterrows ():
533+ ID = 'orth' + str (index )
534+ buscos = []
535+ eggs = []
536+ proteins = []
537+ for x in range (0 , len (row )):
538+ if row [x ] != '*' :
539+ prots = row [x ].split (',' )
540+ for y in prots :
541+ proteins .append (y )
542+ egghit = eggnog [x ].get (y )
543+ if not egghit in eggs :
544+ eggs .append (egghit )
545+ buscohit = busco [x ].get (y )
546+ if not buscohit in buscos :
547+ buscos .append (buscohit )
548+ #clean up the Nones that get added
549+ eggs = [x for x in eggs if x is not None ]
550+ buscos = [x for x in buscos if x is not None ]
551+ buscos = lib .flatten (buscos )
552+
553+ #write to output
554+ if len (eggs ) > 0 :
555+ eggs = ', ' .join (str (v ) for v in eggs )
556+ else :
557+ eggs = 'None'
558+ if len (buscos ) > 0 :
559+ buscos = set (buscos )
560+ buscos = ', ' .join (str (v ) for v in buscos )
561+ else :
562+ buscos = 'None'
563+ output .write ("%s\t %s\t %s\t %s\n " % (ID , eggs , buscos , ', ' .join (proteins )))
513564
514565if not os .path .isdir (os .path .join (args .out , 'stats' )):
515566 os .makedirs (os .path .join (args .out , 'stats' ))
516567summary = []
517- for i in stats :
518- try :
519- singles = lib .singletons (os .path .join (args .out , 'protortho' , 'funannotate.poff' ), i [0 ])
520- except IOError :
521- singles = 0
522- i .append ("{0:,}" .format (singles ))
523- try :
524- orthos = lib .orthologs (os .path .join (args .out , 'protortho' , 'funannotate.poff' ), i [0 ])
525- except IOError :
526- orthos = 0
527- i .append ("{0:,}" .format (orthos ))
528- i .append ("{0:,}" .format (scoCount ))
529- summary .append (i )
568+ #get stats; scoCount is the number of single-copy ortholog groups (rows of sco_hits)
569+ scoCount = len (sco_hits )
570+ for i in range (0 , len (stats )):
571+ orthos = 0
572+ for index , row in orth_hits [scinames [i ]].iteritems ():
573+ if row != '*' :
574+ add = row .count (',' ) + 1
575+ orthos += add
576+ singletons = 0
577+ for index , row in dftrim .iterrows ():
578+ if row [scinames [i ]] != '*' :
579+ others = []
580+ for y in range (0 , len (row )):
581+ others .append (row [y ])
582+ others = set (others )
583+ if len (others ) == 2 :
584+ singletons += 1
585+ stats [i ].append ("{0:,}" .format (singletons ))
586+ stats [i ].append ("{0:,}" .format (orthos ))
587+ stats [i ].append ("{0:,}" .format (scoCount ))
588+ summary .append (stats [i ])
589+
530590#convert to dataframe for easy output
531591header = ['species' , 'isolate' , 'Assembly Size' , 'Largest Scaffold' , 'Average Scaffold' , 'Num Scaffolds' , 'Scaffold N50' , 'Percent GC' , 'Num Genes' , 'Num Proteins' , 'Num tRNA' , 'Unique Proteins' , 'Prots atleast 1 ortholog' , 'Single-copy orthologs' ]
532592df = pd .DataFrame (summary , columns = header )
@@ -539,6 +599,7 @@ def __init__(self,prog):
539599 output .write (df .transpose ().to_html (classes = 'table table-condensed' ))
540600 output .write (lib .FOOTER )
541601############################################
602+
542603######summarize all annotation for each gene in a table
543604lib .log .info ("Compiling all annotations for each genome" )
544605
@@ -574,14 +635,13 @@ def __init__(self,prog):
574635meropsDict = lib .dictFlip (merops )
575636cazyDict = lib .dictFlip (cazy )
576637
577-
578638table = []
579639header = ['GeneID' ,'length' ,'description' , 'Ortho Group' , 'EggNog' , 'BUSCO' ,'Protease family' , 'CAZyme family' , 'InterPro Domains' , 'PFAM Domains' , 'GO terms' , 'SecMet Cluster' , 'SMCOG' ]
580- for i in range (0 ,num_input ):
581- outputname = os .path .join (args .out , 'annotations' , stats [ i ][ 0 ]. replace ( ' ' , '_' ) + '.all.annotations.tsv' )
640+ for y in range (0 ,num_input ):
641+ outputname = os .path .join (args .out , 'annotations' , scinames [ y ] + '.all.annotations.tsv' )
582642 with open (outputname , 'w' ) as output :
583643 output .write ("%s\n " % ('\t ' .join (header )))
584- with open (gbkfilenames [i ], 'rU' ) as input :
644+ with open (gbkfilenames [y ], 'rU' ) as input :
585645 SeqRecords = SeqIO .parse (input , 'genbank' )
586646 for record in SeqRecords :
587647 for f in record .features :
@@ -608,8 +668,8 @@ def __init__(self,prog):
608668 cazydomains = "; " .join (cazyDict .get (ID ))
609669 else :
610670 cazydomains = ''
611- if ID in busco :
612- buscogroup = busco .get (ID )[0 ]
671+ if ID in busco [ y ] :
672+ buscogroup = busco [ y ] .get (ID )[0 ]
613673 else :
614674 buscogroup = ''
615675 if ID in goDict :
@@ -635,6 +695,7 @@ def __init__(self,prog):
635695 final_result = [ID , str (length ), description , orthogroup , egg , buscogroup , meropsdomains , cazydomains , IPRdomains , pfamdomains , goTerms , cluster , smcog ]
636696 output .write ("%s\n " % ('\t ' .join (final_result )))
637697############################################
698+
638699#build phylogeny
639700if not os .path .isfile (os .path .join (args .out , 'phylogeny' , 'RAxML.phylogeny.pdf' )):
640701 if outgroup :
@@ -643,7 +704,8 @@ def __init__(self,prog):
643704 num_phylogeny = len (args .input )
644705 if num_phylogeny > 3 :
645706 lib .log .info ("Inferring phylogeny using RAxML" )
646- lib .ortho2phylogeny (os .path .join (args .out , 'protortho' , 'funannotate.poff' ), args .num_orthos , busco , args .cpus , args .bootstrap , phylogeny , outgroup , outgroup_species , outgroup_name )
707+ folder = os .path .join (args .out , 'protortho' )
708+ lib .ortho2phylogeny (folder , sco_final , args .num_orthos , busco , args .cpus , args .bootstrap , phylogeny , outgroup , outgroup_species , outgroup_name , sc_buscos )
647709 else :
648710 lib .log .info ("Skipping RAxML phylogeny as at least 4 taxa are required" )
649711 with open (os .path .join (args .out ,'phylogeny.html' ), 'w' ) as output :
0 commit comments