1414import zipfile
1515import argparse
1616import pandas as pd
17- from oggmap import qlin
17+ from oggmap import of2orthomap , qlin
1818
1919
2020def define_parser ():
@@ -25,7 +25,7 @@ def define_parser():
2525
2626 :rtype: argparse.ArgumentParser
2727 """
28- of2orthomap_example = '''broccoli2orthomap example:
28+ broccoli2orthomap_example = '''broccoli2orthomap example:
2929
3030 # download Broccoli example:
3131 $ wget https://zenodo.org/records/14935293/files/broccoli_example_table_OGs_protein_counts.txt
@@ -92,7 +92,7 @@ def get_broccoli_orthomap(seqname,
9292 ncbi = None ,
9393 dbname = None ):
9494 """
95- This function return an orthomap for a given query species and PLAZA gene family data.
95+ This function return an orthomap for a given query species and Broccoli input data.
9696
9797 :param qt: Query species taxID.
9898 :param sl: Path to species list file containing <Broccoli name><tab><species taxID>.
@@ -120,7 +120,20 @@ def get_broccoli_orthomap(seqname,
120120
121121 Example
122122 -------
123- >>>
123+ >>> from oggmap import datasets, broccoli2orthomap, of2orthomap, qlin
124+ >>> datasets.broccoli_example(datapath='.')
125+ >>> query_orthomap, orthofinder_species_list, of_species_abundance = broccoli2orthomap.get_broccoli_orthomap(
126+ >>> seqname='proteome.selected_transcript.ath.fasta',
127+ >>> qt='3702',
128+ >>> sl='broccoli_example_species_list.tsv',
129+ >>> oc='broccoli_example_table_OGs_protein_counts.txt',
130+ >>> og='broccoli_example_table_OGs_protein_names.txt',
131+ >>> out=None,
132+ >>> quiet=False,
133+ >>> continuity=True,
134+ >>> overwrite=True,
135+ >>> dbname='taxadb.sqlite')
136+ >>> query_orthomap
124137 """
125138 outhandle = None
126139 og_continuity_score = None
@@ -142,109 +155,128 @@ def get_broccoli_orthomap(seqname,
142155 sep = '\t ' ,
143156 header = None ,
144157 comment = '#' )
145- species_list .columns = ['species' , 'common_name' , 'tax_id' , 'source' , 'data_provider' , 'pubmed_id' ]
146- qt_species = list (species_list ['species' ][species_list ['tax_id' ] == int (qt )])
147- if len (qt_species ) == 0 :
148- print ('\n Error <-qt>: query species taxID not in PLAZA results, please check taxID.' )
149- sys .exit ()
150- ogs = pd .DataFrame (pd .read_csv (og ,
151- sep = '\t ' ,
152- header = None ,
153- comment = '#' ))
154- ogs .columns = ['gf_id' , 'species' , 'gene_id' ]
155- ogs_grouped = ogs .groupby ('gf_id' )['species' ].apply (set ).apply (list ).apply (_get_species_tax_id ,
156- species_list = species_list )
157- ogs_grouped_qt = pd .DataFrame (ogs_grouped [ogs_grouped .apply (lambda x : int (qt ) in x )])
158- ogs_qt = ogs [ogs ['gf_id' ].isin (ogs_grouped_qt .index )]
159- ogs_qt_red = ogs_qt [ogs_qt ['species' ].isin (qt_species )]
160- ogs_qt_red_grouped = ogs_qt_red .groupby ('gf_id' )['gene_id' ].apply (list )
161- ogs_grouped_qt ['gene_id' ] = ogs_qt_red_grouped
162- ogs_grouped_qt_species = np .sort (list (set ([x [0 ] for x in ogs_grouped_qt ['species' ].to_dict ().values ()])))
163- ogs_grouped_qt_species_names = [qlin .get_qlin (qt = x ,
164- quiet = True ,
165- ncbi = ncbi )[0 ] for x in ogs_grouped_qt_species ]
166- species_list_df = pd .DataFrame (ogs_grouped_qt_species_names ,
167- columns = ['species' ])
168- species_list_df ['taxID' ] = ogs_grouped_qt_species
169- species_list_df ['lineage' ] = species_list_df .apply (lambda x : qlin .ncbi_get_lineage (qt = x .iloc [1 ],
170- ncbi = ncbi ),
171- axis = 1 )
172- species_list_df ['youngest_common' ] = [qlin .get_youngest_common (qlineage ,
173- x ) for x in species_list_df .lineage ]
174- species_list_df ['youngest_name' ] = [list (x .values ())[0 ] for x in [qlin .ncbi_get_taxid_translator (qt_vec = [x ],
175- ncbi = ncbi )
176- for x in list (species_list_df .youngest_common )]]
158+ species_list .columns = ['species' , 'taxID' ]
159+ species_list ['lineage' ] = species_list .apply (lambda x : qlin .ncbi_get_lineage (qt = x .iloc [1 ],
160+ ncbi = ncbi ),
161+ axis = 1 )
162+ species_list ['youngest_common' ] = [qlin .get_youngest_common (qlineage , x ) for x in species_list .lineage ]
163+ species_list ['youngest_name' ] = [list (x .values ())[0 ] for x in [qlin .ncbi_get_taxid_translator (qt_vec = [x ],
164+ ncbi = ncbi )
165+ for x in list (species_list .youngest_common )]]
177166 if not quiet :
167+ print (seqname )
178168 print (qname )
179169 print (qt )
180- print (species_list_df )
170+ print (species_list )
181171 youngest_common_counts_df = of2orthomap .get_youngest_common_counts (qlineage ,
182- species_list_df )
172+ species_list )
183173 for node in qlin .traverse_postorder (query_lineage_topo .root ):
184174 if node .name :
185175 nsplit = node .name .split ('/' )
186176 if len (nsplit ) == 3 :
187177 node .species_count = list (youngest_common_counts_df [youngest_common_counts_df .PStaxID .isin (
188178 [int (nsplit [1 ])])].counts )[0 ]
189- #for node in query_lineage_topo.traverse('postorder'):
190- # nsplit = node.name.split('/')
191- # if len(nsplit) == 3:
192- # node.add_feature('species_count',
193- # list(youngest_common_counts_df[youngest_common_counts_df.PStaxID.isin(
194- # [int(nsplit[1])])].counts)[0])
195- og_dict = {}
179+ oc_og_dict = {}
196180 continuity_dict = {}
197- for og in ogs_grouped_qt .index :
198- og_hits = np .sort (
199- list (set (list (ogs_grouped_qt [ogs_grouped_qt .index .isin ([og ])]['species' ].to_dict ().values ())[0 ])))
200- # get list of the youngest common between query and all other species
201- og_hits_youngest_common = list (species_list_df .youngest_common [
202- [x for x , y in enumerate (species_list_df .taxID )
203- if y in og_hits ]])
204- # evaluate all youngest common nodes to retain the oldest of them and assign as the orthogroup
205- # ancestral state (gene age)
206- if len (og_hits_youngest_common ) > 0 :
207- og_oldest_common = qlin .get_oldest_common (qlineage ,
208- og_hits_youngest_common )
209- og_dict [og ] = og_oldest_common
210- if continuity :
211- continuity_dict [og ] = \
212- of2orthomap .get_youngest_common_counts (qlineage ,
213- pd .DataFrame (og_hits_youngest_common ,
214- columns = ['youngest_common' ])).counts
181+ if os .path .basename (oc ).split ('.' )[- 1 ] == 'zip' :
182+ oc_zip = zipfile .Path (oc , at = '.' .join (os .path .basename (oc ).split ('.' )[:- 1 ]))
183+ oc_lines = oc_zip .open ()
184+ else :
185+ oc_lines = open (oc ,
186+ 'r' )
187+ oc_species = next (oc_lines )
188+ if type (oc_species ) == bytes :
189+ oc_species = oc_species .decode ('utf-8' ).strip ().split ('\t ' )
190+ else :
191+ oc_species = oc_species .strip ().split ('\t ' )
192+ oc_qidx = [x for x , y in enumerate (oc_species ) if y == seqname ]
193+ if len (oc_qidx ) == 0 :
194+ print ('\n Error <-qname>: query species name not in Broccoli results, please check spelling\n '
195+ 'e.g. <head -1 table_OGs_protein_counts.txt>' )
196+ sys .exit ()
197+ for oc_line in oc_lines :
198+ if type (oc_line ) == bytes :
199+ oc_og = oc_line .decode ('utf-8' ).strip ().split ('\t ' )
200+ else :
201+ oc_og = oc_line .strip ().split ('\t ' )
202+ if int (oc_og [oc_qidx [0 ]]) == 0 :
203+ continue
204+ if int (oc_og [oc_qidx [0 ]]) > 0 :
205+ oc_og_hits = [oc_species [x + 1 ] for x , y in enumerate (oc_og [1 ::][::- 1 ][1 ::][::- 1 ]) if int (y ) > 0 ]
206+ # get list of the youngest common between query and all other species
207+ oc_og_hits_youngest_common = list (species_list .youngest_common [
208+ [x for x , y in enumerate (species_list .species )
209+ if y in oc_og_hits ]])
210+ # evaluate all youngest common nodes to retain the oldest of them and assign as the orthogroup
211+ # ancestral state (gene age)
212+ if len (oc_og_hits_youngest_common ) > 0 :
213+ oc_og_oldest_common = qlin .get_oldest_common (qlineage ,
214+ oc_og_hits_youngest_common )
215+ oc_og_dict [oc_og [0 ]] = oc_og_oldest_common
216+ if continuity :
217+ continuity_dict [oc_og [0 ]] = of2orthomap .get_youngest_common_counts (
218+ qlineage ,
219+ pd .DataFrame (oc_og_hits_youngest_common ,
220+ columns = ['youngest_common' ])).counts
221+ oc_lines .close ()
215222 if continuity :
216223 youngest_common_counts_df = youngest_common_counts_df .join (pd .DataFrame .from_dict (continuity_dict ))
217224 omap = []
218225 if out :
219226 if os .path .exists (out ) and not overwrite :
220227 print ('\n Error <-overwrite>: output file exists, please set to True if it should be overwritten\n ' )
221228 sys .exit ()
222- outhandle = open (out , 'w' )
229+ outhandle = open (out ,
230+ 'w' )
223231 if continuity :
224232 outhandle .write ('seqID\t Orthogroup\t PSnum\t PStaxID\t PSname\t PScontinuity\n ' )
225233 else :
226234 outhandle .write ('seqID\t Orthogroup\t PSnum\t PStaxID\t PSname\n ' )
227- for og in ogs_grouped_qt .index :
228- og_tmp = ogs_grouped_qt [ogs_grouped_qt .index .isin ([og ])]
229- og_ps = qlineagenames [qlineagenames ['PStaxID' ] ==
230- str (og_dict [og ])].values .tolist ()[0 ]
231- og_ps_join = '\t ' .join (og_ps )
232- if continuity :
233- og_continuity_score = of2orthomap .get_continuity_score (og ,
234- youngest_common_counts_df )
235+ if os .path .basename (og ).split ('.' )[- 1 ] == 'zip' :
236+ og_zip = zipfile .Path (og ,
237+ at = '.' .join (os .path .basename (og ).split ('.' )[:- 1 ]))
238+ og_lines = og_zip .open ()
239+ else :
240+ og_lines = open (og ,
241+ 'r' )
242+ og_species = next (og_lines )
243+ if type (og_species ) == bytes :
244+ og_species = og_species .decode ('utf-8' ).strip ().split ('\t ' )
245+ else :
246+ og_species = og_species .strip ().split ('\t ' )
247+ og_qidx = [x for x , y in enumerate (og_species ) if y == seqname ]
248+ if len (oc_qidx ) == 0 :
249+ print ('\n Error <-qname>: query species name not in Broccoli results, please check spelling\n '
250+ 'e.g. <head -1 table_OGs_protein_counts.txt>' )
251+ sys .exit ()
252+ for og_line in og_lines :
253+ if type (og_line ) == bytes :
254+ og_og = og_line .decode ('utf-8' ).strip ().split ('\t ' )
255+ else :
256+ og_og = og_line .strip ().split ('\t ' )
257+ if og_og [0 ] not in oc_og_dict :
258+ continue
259+ else :
260+ og_ps = qlineagenames [qlineagenames ['PStaxID' ] ==
261+ str (oc_og_dict [og_og [0 ]])].values .tolist ()[0 ]
262+ og_ps_join = '\t ' .join (og_ps )
263+ if continuity :
264+ og_continuity_score = of2orthomap .get_continuity_score (og_name = og_og [0 ],
265+ youngest_common_counts_df = youngest_common_counts_df )
235266 if out :
236267 if continuity :
237- [outhandle .write (x .replace (' ' , '' ) + '\t ' + og + '\t ' + og_ps_join + '\t ' +
238- str (og_continuity_score ) + '\n ' ) for x in list ( og_tmp [ 'gene_id' ]) [0 ]]
268+ [outhandle .write (x .replace (' ' , '' ) + '\t ' + og_og [ 0 ] + '\t ' + og_ps_join + '\t ' +
269+ str (og_continuity_score ) + '\n ' ) for x in og_og [ og_qidx [0 ]]. split ( ',' ) ]
239270 else :
240- [outhandle .write (x .replace (' ' , '' ) + '\t ' + og + '\t ' + og_ps_join + '\n ' )
241- for x in list ( og_tmp [ 'gene_id' ]) [0 ]]
271+ [outhandle .write (x .replace (' ' , '' ) + '\t ' + og_og [ 0 ] + '\t ' + og_ps_join + '\n ' )
272+ for x in og_og [ og_qidx [0 ]]. split ( ',' ) ]
242273 if continuity :
243- omap += [[x .replace (' ' , '' ), og , og_ps [0 ], og_ps [1 ], og_ps [2 ], og_continuity_score ]
244- for x in list ( og_tmp [ 'gene_id' ]) [0 ]]
274+ omap += [[x .replace (' ' , '' ), og_og [ 0 ] , og_ps [0 ], og_ps [1 ], og_ps [2 ], og_continuity_score ]
275+ for x in og_og [ og_qidx [0 ]]. split ( ',' ) ]
245276 else :
246- omap += [[x .replace (' ' , '' ), og , og_ps [0 ], og_ps [1 ], og_ps [2 ]]
247- for x in list (og_tmp ['gene_id' ])[0 ]]
277+ omap += [[x .replace (' ' , '' ), og_og [0 ], og_ps [0 ], og_ps [1 ], og_ps [2 ]]
278+ for x in og_og [og_qidx [0 ]].split (',' )]
279+ og_lines .close ()
248280 if out :
249281 outhandle .close ()
250282 omap_df = pd .DataFrame (omap )
@@ -263,7 +295,7 @@ def get_broccoli_orthomap(seqname,
263295 'PSname' ]
264296 omap_df ['PSnum' ] = [int (x ) for x in list (omap_df ['PSnum' ])]
265297 return [omap_df ,
266- species_list_df ,
298+ species_list ,
267299 youngest_common_counts_df ]
268300
269301
@@ -277,26 +309,36 @@ def main():
277309 if not args .dbname :
278310 print ('\n Error <-dbname>: Please specify taxadb.sqlite file' )
279311 sys .exit ()
312+ if not args .seqname :
313+ parser .print_help ()
314+ print ('\n Error <-seqname>: Please specify query species name in Broccoli and taxID' )
315+ sys .exit ()
280316 if not args .qt :
281317 parser .print_help ()
282318 print ('\n Error <-qt>: Please specify query species taxID' )
283319 sys .exit ()
284320 if not args .sl :
285321 parser .print_help ()
286- print ('\n Error <-sl>: Please specify PLAZA species information file <species_information.csv>' )
322+ print ('\n Error <-sl>: Please specify species list as <Broccoli name><tab><species taxID>' )
323+ sys .exit ()
324+ if not args .oc :
325+ parser .print_help ()
326+ print ('\n Error <-oc>: Please specify Broccoli <table_OGs_protein_counts.txt> (see dir_step3 directory)' )
287327 sys .exit ()
288328 if not args .og :
289329 parser .print_help ()
290- print ('\n Error <-og>: Please specify PLAZA gene family file <genefamily_data.ORTHOFAM.csv> or '
291- '<genefamily_data.HOMFAM.csv>' )
330+ print ('\n Error <-og>: Please specify Broccoli <table_OGs_protein_names.txt> (see dir_step3 directory)' )
292331 sys .exit ()
293- get_plaza_orthomap (seqname = args .seqname ,
294- qt = args .qt ,
295- sl = args .sl ,
296- og = args .og ,
297- out = args .out ,
298- overwrite = args .overwrite ,
299- dbname = args .dbname )
332+ get_broccoli_orthomap (seqname = args .seqname ,
333+ qt = args .qt ,
334+ sl = args .sl ,
335+ oc = args .oc ,
336+ og = args .og ,
337+ out = args .out ,
338+ quiet = False ,
339+ continuity = True ,
340+ overwrite = args .overwrite ,
341+ dbname = args .dbname )
300342
301343
302344if __name__ == '__main__' :
0 commit comments