@@ -157,7 +157,6 @@ def __init__(self,prog):
157157 eggnog .append (lib .getEggNogfromNote (GBK ))
158158 scinames .append (stats [i ][0 ].replace (' ' , '_' ))
159159
160-
161160#convert busco to dictionary
162161busco = lib .busco_dictFlip (busco )
163162
@@ -515,8 +514,9 @@ def __init__(self,prog):
515514 os .makedirs (os .path .join (args .out , 'tfs' ))
516515#should be able to pull transcription factor counts from InterPro Domains, load into pandas df
517516iprTF = os .path .join (parentdir , 'lib' , 'tf_interpro.txt' )
518-
519517tf = pd .read_csv (iprTF , names = ['InterPro' , 'Description' ])
518+ #convert to dictionary for all annotations later
519+ TFDict = tf .set_index ('InterPro' )['Description' ].to_dict ()
520520iprall = IPRdf .transpose ()
521521iprall .reset_index (inplace = True )
522522dfmerged = pd .merge (tf ,iprall , left_on = 'InterPro' , right_on = 'index' , how = 'left' )
@@ -756,7 +756,7 @@ def __init__(self,prog):
756756 for line in input :
757757 line = line .replace ('\n ' , '' )
758758 col = line .split ('\t ' )
759- genes = col [1 ].split (',' )
759+ genes = col [- 1 ].split (', ' )
760760 for i in genes :
761761 orthoDict [i ] = col [0 ]
762762
@@ -785,9 +785,16 @@ def __init__(self,prog):
785785meropsDict = lib .dictFlip (merops )
786786cazyDict = lib .dictFlip (cazy )
787787
788+ #get Transcription factors in a dictionary
789+ TFLookup = {}
790+ for k ,v in iprDict .items ():
791+ for x in v :
792+ IPRid = x .split (':' )[0 ]
793+ if IPRid in TFDict :
794+ TFLookup [k ] = TFDict .get (IPRid )
788795
789796table = []
790- header = ['GeneID' ,'scaffold:start-end' ,'strand' ,'length' ,'description' , 'Ortho Group' , 'EggNog' , 'BUSCO' , 'Secreted' , 'Protease family' , 'CAZyme family' , 'InterPro Domains' , 'PFAM Domains' , 'GO terms' , 'SecMet Cluster' , 'SMCOG' ]
797+ header = ['GeneID' ,'scaffold:start-end' ,'strand' ,'length' ,'description' , 'Ortho Group' , 'EggNog' , 'BUSCO' , 'Secreted' , 'Protease family' , 'CAZyme family' , 'Transcription factor' , ' InterPro Domains' , 'PFAM Domains' , 'GO terms' , 'SecMet Cluster' , 'SMCOG' ]
791798for y in range (0 ,num_input ):
792799 outputname = os .path .join (args .out , 'annotations' , scinames [y ]+ '.all.annotations.tsv' )
793800 with open (outputname , 'w' ) as output :
@@ -843,6 +850,10 @@ def __init__(self,prog):
843850 orthogroup = orthoDict .get (ID )
844851 else :
845852 orthogroup = ''
853+ if ID in TFLookup :
854+ transfactor = TFLookup .get (ID )
855+ else :
856+ transfactor = ''
846857 for k ,v in f .qualifiers .items ():
847858 if k == 'note' :
848859 notes = v [0 ].split ('; ' )
@@ -855,7 +866,7 @@ def __init__(self,prog):
855866 if i .startswith ('SMCOG:' ):
856867 smcog = i
857868
858- final_result = [ID , location , strand , str (length ), description , orthogroup , egg , buscogroup , signalphit , meropsdomains , cazydomains , IPRdomains , pfamdomains , goTerms , cluster , smcog ]
869+ final_result = [ID , location , strand , str (length ), description , orthogroup , egg , buscogroup , signalphit , meropsdomains , cazydomains , transfactor , IPRdomains , pfamdomains , goTerms , cluster , smcog ]
859870 output .write ("%s\n " % ('\t ' .join (final_result )))
860871############################################
861872
0 commit comments