@@ -87,7 +87,8 @@ def hmmer_search(hmmfile, sequences, cpus=0, bit_cutoffs=None, evalue=10.0):
8787 "seq_length" : hit .length ,
8888 "hmm_length" : s_domains [0 ]["hmm_length" ],
8989 "hmm_aln_length" : s_domains [0 ]["hmm_aln" ],
90- "hmm_coverage" : s_domains [0 ]["hmm_aln" ] / s_domains [0 ]["hmm_length" ],
90+ "hmm_coverage" : s_domains [0 ]["hmm_aln" ]
91+ / s_domains [0 ]["hmm_length" ],
9192 "bitscore" : hit .score ,
9293 "evalue" : hit .evalue ,
9394 "domains" : s_domains ,
@@ -157,15 +158,20 @@ def hmmer_scan(hmmfile, sequences, cpus=0, bit_cutoffs=None, evalue=10.0):
157158 ),
158159 "name" : hit .name .decode (),
159160 "accession" : (
160- None if hit .accession is None else hit .accession .decode ()
161+ None
162+ if hit .accession is None
163+ else hit .accession .decode ()
161164 ),
162165 "description" : (
163- None if hit .description is None else hit .description .decode ()
166+ None
167+ if hit .description is None
168+ else hit .description .decode ()
164169 ),
165170 "seq_length" : len (top_hits .query .sequence ),
166171 "hmm_length" : s_domains [0 ]["hmm_length" ],
167172 "hmm_aln_length" : s_domains [0 ]["hmm_aln" ],
168- "hmm_coverage" : s_domains [0 ]["hmm_aln" ] / s_domains [0 ]["hmm_length" ],
173+ "hmm_coverage" : s_domains [0 ]["hmm_aln" ]
174+ / s_domains [0 ]["hmm_length" ],
169175 "bitscore" : hit .score ,
170176 "evalue" : hit .evalue ,
171177 "domains" : s_domains ,
@@ -421,7 +427,9 @@ def merops2tsv(results, output, annots):
421427 json .dump (results , outfile , indent = 2 )
422428 with open (annots , "w" ) as annot :
423429 for result in results :
424- annot .write (f"{ result ['qseqid' ]} \t note\t MEROPS:{ result ['sseqid' ]} { result ['family' ]} \n " )
430+ annot .write (
431+ f"{ result ['qseqid' ]} \t note\t MEROPS:{ result ['sseqid' ]} { result ['family' ]} \n "
432+ )
425433 a = add2dict (
426434 a ,
427435 result ["qseqid" ],
@@ -431,7 +439,9 @@ def merops2tsv(results, output, annots):
431439 return a
432440
433441
434- def swissprot_blast (query , evalue = 1e-5 , cpus = 1 , min_pident = 60 , min_cov = 60 , max_target_seqs = 1 ):
442+ def swissprot_blast (
443+ query , evalue = 1e-5 , cpus = 1 , min_pident = 60 , min_cov = 60 , max_target_seqs = 1
444+ ):
435445 """
436446 Perform a BLAST search against the SwissProt database using Diamond.
437447
@@ -535,7 +545,9 @@ def swissprot2tsv(results, output, annots):
535545 json .dump (results , outfile , indent = 2 )
536546 with open (annots , "w" ) as annot :
537547 for result in results :
538- annot .write (f"{ result ['query' ]} \t db_xref\t UniProtKB/Swiss-Prot:{ result ['accession' ]} \n " )
548+ annot .write (
549+ f"{ result ['query' ]} \t db_xref\t UniProtKB/Swiss-Prot:{ result ['accession' ]} \n "
550+ )
539551 # add db_xref
540552 a = add2dict (
541553 a ,
@@ -579,26 +591,98 @@ def swissprot2tsv(results, output, annots):
579591
580592def parse_annotations (tsv ):
581593 """
582- Parse a three-column annotation file into a dictionary.
594+ Parse a three-column annotation file into a dictionary with robust error handling .
583595
584596 This function reads a TSV file containing annotations, where each line consists of a gene,
585597 a database identifier, and a value. It processes the file and returns a dictionary where
586598 each gene is a key, and its associated database and value are stored as entries.
587599
600+ Lines that are not properly formatted (not exactly 3 tab-separated columns) are skipped
601+ and reported as parsing errors.
602+
588603 Parameters:
589604 - tsv (str): The file path to the annotation file in TSV format.
590605
591606 Returns:
592- - dict: A dictionary with genes as keys and their corresponding database and value entries.
607+ - tuple: (dict, dict) A tuple containing:
608+ - dict: A dictionary with genes as keys and their corresponding database and value entries.
609+ - dict: A dictionary with parsing error information including file path, line numbers, and problematic lines.
593610 """
611+ import logging
612+
613+ logger = logging .getLogger (__name__ )
614+
594615 # parse a three column annotation file into a dictionary
595616 a = {}
596- with open (tsv , "r" ) as infile :
597- for line in infile :
598- line = line .rstrip ()
599- gene , db , value = line .split ("\t " )
600- a = add2dict (a , gene , db , value )
601- return a
617+ parse_errors = {
618+ "file" : tsv ,
619+ "malformed_lines" : [],
620+ "empty_lines" : 0 ,
621+ "comment_lines" : 0 ,
622+ "total_lines" : 0 ,
623+ "parsed_lines" : 0 ,
624+ }
625+
626+ try :
627+ with open (tsv , "r" ) as infile :
628+ for line_num , line in enumerate (infile , 1 ):
629+ parse_errors ["total_lines" ] += 1
630+ original_line = line
631+ line = line .rstrip ()
632+
633+ # Skip empty lines
634+ if not line :
635+ parse_errors ["empty_lines" ] += 1
636+ continue
637+
638+ # Skip comment lines (starting with #)
639+ if line .startswith ("#" ):
640+ parse_errors ["comment_lines" ] += 1
641+ continue
642+
643+ # Split by tab and check column count
644+ columns = line .split ("\t " )
645+
646+ if len (columns ) != 3 :
647+ parse_errors ["malformed_lines" ].append (
648+ {
649+ "line_number" : line_num ,
650+ "line_content" : original_line .rstrip (),
651+ "column_count" : len (columns ),
652+ "error" : f"Expected 3 columns, found { len (columns )} " ,
653+ }
654+ )
655+ continue
656+
657+ gene , db , value = columns
658+
659+ # Check for empty values in any column
660+ if not gene .strip () or not db .strip () or not value .strip ():
661+ parse_errors ["malformed_lines" ].append (
662+ {
663+ "line_number" : line_num ,
664+ "line_content" : original_line .rstrip (),
665+ "column_count" : len (columns ),
666+ "error" : "One or more columns are empty" ,
667+ }
668+ )
669+ continue
670+
671+ # Successfully parsed line
672+ a = add2dict (a , gene .strip (), db .strip (), value .strip ())
673+ parse_errors ["parsed_lines" ] += 1
674+
675+ except FileNotFoundError :
676+ logger .error (f"Annotation file not found: { tsv } " )
677+ parse_errors ["file_error" ] = f"File not found: { tsv } "
678+ except PermissionError :
679+ logger .error (f"Permission denied reading annotation file: { tsv } " )
680+ parse_errors ["file_error" ] = f"Permission denied: { tsv } "
681+ except Exception as e :
682+ logger .error (f"Unexpected error reading annotation file { tsv } : { str (e )} " )
683+ parse_errors ["file_error" ] = f"Unexpected error: { str (e )} "
684+
685+ return a , parse_errors
602686
603687
604688def add2dict (adict , gene , key , value ):
@@ -630,7 +714,12 @@ def add2dict(adict, gene, key, value):
630714
631715
632716def swissprot_valid_gene (name ):
633- if number_present (name ) and len (name ) > 2 and not morethanXnumbers (name , 3 ) and "." not in name :
717+ if (
718+ number_present (name )
719+ and len (name ) > 2
720+ and not morethanXnumbers (name , 3 )
721+ and "." not in name
722+ ):
634723 return True
635724 else :
636725 return False
@@ -746,9 +835,11 @@ def busco2tsv(results, buscodb, busco_results, annots):
746835 # so now we want to construct the 3 column
747836 a = {}
748837 with open (annots , "w" ) as annot :
749- for k , v in natsorted (results .items (), key = lambda x : (x [1 ]["hit" ], - x [1 ]["bitscore" ])):
838+ for k , v in natsorted (
839+ results .items (), key = lambda x : (x [1 ]["hit" ], - x [1 ]["bitscore" ])
840+ ):
750841 if v ["name" ] in busco_data :
751- defline = f' BUSCO:{ k } [{ odb_version } ] { busco_data .get (v [" name" ])[" description" ] } '
842+ defline = f" BUSCO:{ k } [{ odb_version } ] { busco_data .get (v [' name' ])[' description' ] } "
752843 else :
753844 defline = f"BUSCO:{ k } [{ odb_version } ]"
754845 if v ["hit" ] not in a :
0 commit comments