Skip to content

Commit 196bf62

Browse files
author
Jon Palmer
committed
better parsing of annotation data #25
1 parent f96e1c8 commit 196bf62

File tree

2 files changed

+244
-22
lines changed

2 files changed

+244
-22
lines changed

funannotate2/annotate.py

Lines changed: 135 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
from gfftk.gff import dict2gff3, gff2dict
1414
from gfftk.stats import annotation_stats
1515

16-
from .config import env
16+
from .config import env, busco_taxonomy
1717
from .log import finishLogging, startLogging, system_info
1818
from .name_cleaner import (
1919
NameCleaner,
@@ -48,7 +48,121 @@
4848
get_odb_version,
4949
validate_busco_lineage,
5050
)
51-
from .config import busco_taxonomy
51+
52+
53+
def _report_parsing_errors(
    parse_errors, logger, max_errors_to_show=5, max_line_length=100
):
    """
    Report parsing errors for a single annotation file in a user-friendly format.

    If the report carries a "file_error" entry, two ERROR messages are logged and
    nothing else is reported. Otherwise, when malformed lines were recorded, the
    line statistics and the first few offending lines are logged at WARNING level.
    When only empty/comment lines were skipped, a single INFO message is logged.

    Parameters:
    - parse_errors (dict): Parsing report for one file. Keys read here are "file",
      "file_error", "total_lines", "parsed_lines", "malformed_lines", "empty_lines"
      and "comment_lines"; missing keys fall back to defaults.
    - logger: Logger instance for reporting errors (must provide error/warning/info)
    - max_errors_to_show (int): Maximum number of malformed lines listed in detail.
    - max_line_length (int): Line content longer than this is truncated for display,
      with the last three displayed characters replaced by "...".
    """
    file_path = parse_errors.get("file", "Unknown file")

    # Report file-level errors; per-line statistics are meaningless in that case
    if "file_error" in parse_errors:
        logger.error(f"Failed to read annotation file: {file_path}")
        logger.error(f"Error: {parse_errors['file_error']}")
        return

    # Gather parsing statistics; "or" also guards against explicit None values
    total_lines = parse_errors.get("total_lines", 0)
    parsed_lines = parse_errors.get("parsed_lines", 0)
    malformed_lines = parse_errors.get("malformed_lines") or []
    empty_lines = parse_errors.get("empty_lines") or 0
    comment_lines = parse_errors.get("comment_lines") or 0

    if not malformed_lines:
        # Nothing was malformed: only mention skipped lines, and only if any
        if empty_lines > 0 or comment_lines > 0:
            logger.info(
                f"Annotation file {file_path}: skipped {empty_lines} empty and {comment_lines} comment lines"
            )
        return

    logger.warning(f"Parsing issues found in annotation file: {file_path}")
    logger.warning(f"Total lines: {total_lines}, Successfully parsed: {parsed_lines}")
    logger.warning(
        f"Skipped: {len(malformed_lines)} malformed, {empty_lines} empty, {comment_lines} comment lines"
    )

    # Report details for the first few malformed lines only
    shown = malformed_lines[: max(max_errors_to_show, 0)]
    if shown:
        logger.warning(f"First {len(shown)} malformed lines:")
        for error_info in shown:
            line_num = error_info.get("line_number", "Unknown")
            line_content = error_info.get("line_content", "")
            error_msg = error_info.get("error", "Unknown error")

            # Truncate very long lines for display
            if len(line_content) > max_line_length:
                line_content = line_content[: max(max_line_length - 3, 0)] + "..."

            logger.warning(f" Line {line_num}: {error_msg}")
            logger.warning(f" Content: '{line_content}'")

    remaining = len(malformed_lines) - len(shown)
    if remaining > 0:
        logger.warning(f" ... and {remaining} more malformed lines")

    logger.warning(
        "Consider fixing the annotation file format or contact the tool provider"
    )
114+
115+
116+
def _report_parsing_summary(parsing_errors, logger):
    """
    Report a summary of all parsing errors encountered during annotation processing.

    Only entries that actually describe a problem (a non-empty "malformed_lines"
    list or a "file_error" key) are counted and listed, so the header count always
    matches the per-file listing. Nothing is logged when no entry has a problem.
    All messages are emitted at WARNING level.

    Parameters:
    - parsing_errors (list): List of parsing error dictionaries, one per file
    - logger: Logger instance for reporting errors
    """
    if not parsing_errors:
        return

    # Restrict the summary to files that really had issues
    problem_files = [
        pe for pe in parsing_errors if pe.get("malformed_lines") or "file_error" in pe
    ]
    if not problem_files:
        return

    total_malformed_lines = sum(
        len(pe.get("malformed_lines") or []) for pe in problem_files
    )
    total_parsed_lines = sum(pe.get("parsed_lines", 0) for pe in problem_files)
    files_with_file_errors = sum(1 for pe in problem_files if "file_error" in pe)

    banner = "=" * 60
    logger.warning(banner)
    logger.warning("ANNOTATION FILE PARSING SUMMARY")
    logger.warning(banner)
    logger.warning(f"Files processed with parsing issues: {len(problem_files)}")

    if files_with_file_errors > 0:
        logger.warning(f"Files that could not be read: {files_with_file_errors}")

    if total_malformed_lines > 0:
        logger.warning(f"Total malformed lines skipped: {total_malformed_lines}")
        logger.warning(f"Total lines successfully parsed: {total_parsed_lines}")

    logger.warning("\nFiles with parsing issues:")
    for pe in problem_files:
        file_path = pe.get("file", "Unknown file")
        if "file_error" in pe:
            # An unreadable file takes precedence over any per-line counts
            logger.warning(f" {file_path}: FILE ERROR - {pe['file_error']}")
        else:
            malformed_count = len(pe.get("malformed_lines") or [])
            parsed_count = pe.get("parsed_lines", 0)
            logger.warning(
                f" {file_path}: {malformed_count} malformed lines, {parsed_count} parsed successfully"
            )

    logger.warning("\nRecommendations:")
    logger.warning(
        "1. Check annotation files are properly formatted (3 tab-separated columns)"
    )
    logger.warning("2. Ensure no extra spaces, missing tabs, or empty columns")
    logger.warning("3. Contact the annotation tool provider if issues persist")
    logger.warning(banner)
52166

53167

54168
def _sortDict(d):
@@ -279,14 +393,27 @@ def annotate(args):
279393
all_annotations = [pfam_dict, dbcan_dict, swiss_dict, merops_dict, busco_dict]
280394
# we need to look for any additional annotations that might be added by f2a
281395
annots_in_dir = find_files(misc_dir, ".annotations.txt")
396+
parsing_errors = []
282397
for annotfile in annots_in_dir:
283-
a = parse_annotations(annotfile)
398+
a, parse_errors = parse_annotations(annotfile)
284399
logger.info(f"Loaded {len(a)} annotations from {annotfile}")
400+
401+
# Report parsing errors if any
402+
if parse_errors["malformed_lines"] or "file_error" in parse_errors:
403+
parsing_errors.append(parse_errors)
404+
_report_parsing_errors(parse_errors, logger)
405+
285406
all_annotations.append(a)
286407
if args.annotations:
287408
for annotfile in args.annotations:
288-
a = parse_annotations(annotfile)
409+
a, parse_errors = parse_annotations(annotfile)
289410
logger.info(f"Loaded {len(a)} annotations from {annotfile}")
411+
412+
# Report parsing errors if any
413+
if parse_errors["malformed_lines"] or "file_error" in parse_errors:
414+
parsing_errors.append(parse_errors)
415+
_report_parsing_errors(parse_errors, logger)
416+
290417
all_annotations.append(a)
291418

292419
# merge annotations into the gene/funannotate dictionary
@@ -449,5 +576,9 @@ def annotate(args):
449576
logger.info("Annotation Summary:")
450577
logger.info(f"\n{json.dumps(stats, indent=2)}")
451578

579+
# Report parsing errors summary if any occurred
580+
if parsing_errors:
581+
_report_parsing_summary(parsing_errors, logger)
582+
452583
# finish
453584
finishLogging(log, vars(sys.modules[__name__])["__name__"])

funannotate2/search.py

Lines changed: 109 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,8 @@ def hmmer_search(hmmfile, sequences, cpus=0, bit_cutoffs=None, evalue=10.0):
8787
"seq_length": hit.length,
8888
"hmm_length": s_domains[0]["hmm_length"],
8989
"hmm_aln_length": s_domains[0]["hmm_aln"],
90-
"hmm_coverage": s_domains[0]["hmm_aln"] / s_domains[0]["hmm_length"],
90+
"hmm_coverage": s_domains[0]["hmm_aln"]
91+
/ s_domains[0]["hmm_length"],
9192
"bitscore": hit.score,
9293
"evalue": hit.evalue,
9394
"domains": s_domains,
@@ -157,15 +158,20 @@ def hmmer_scan(hmmfile, sequences, cpus=0, bit_cutoffs=None, evalue=10.0):
157158
),
158159
"name": hit.name.decode(),
159160
"accession": (
160-
None if hit.accession is None else hit.accession.decode()
161+
None
162+
if hit.accession is None
163+
else hit.accession.decode()
161164
),
162165
"description": (
163-
None if hit.description is None else hit.description.decode()
166+
None
167+
if hit.description is None
168+
else hit.description.decode()
164169
),
165170
"seq_length": len(top_hits.query.sequence),
166171
"hmm_length": s_domains[0]["hmm_length"],
167172
"hmm_aln_length": s_domains[0]["hmm_aln"],
168-
"hmm_coverage": s_domains[0]["hmm_aln"] / s_domains[0]["hmm_length"],
173+
"hmm_coverage": s_domains[0]["hmm_aln"]
174+
/ s_domains[0]["hmm_length"],
169175
"bitscore": hit.score,
170176
"evalue": hit.evalue,
171177
"domains": s_domains,
@@ -421,7 +427,9 @@ def merops2tsv(results, output, annots):
421427
json.dump(results, outfile, indent=2)
422428
with open(annots, "w") as annot:
423429
for result in results:
424-
annot.write(f"{result['qseqid']}\tnote\tMEROPS:{result['sseqid']} {result['family']}\n")
430+
annot.write(
431+
f"{result['qseqid']}\tnote\tMEROPS:{result['sseqid']} {result['family']}\n"
432+
)
425433
a = add2dict(
426434
a,
427435
result["qseqid"],
@@ -431,7 +439,9 @@ def merops2tsv(results, output, annots):
431439
return a
432440

433441

434-
def swissprot_blast(query, evalue=1e-5, cpus=1, min_pident=60, min_cov=60, max_target_seqs=1):
442+
def swissprot_blast(
443+
query, evalue=1e-5, cpus=1, min_pident=60, min_cov=60, max_target_seqs=1
444+
):
435445
"""
436446
Perform a BLAST search against the SwissProt database using Diamond.
437447
@@ -535,7 +545,9 @@ def swissprot2tsv(results, output, annots):
535545
json.dump(results, outfile, indent=2)
536546
with open(annots, "w") as annot:
537547
for result in results:
538-
annot.write(f"{result['query']}\tdb_xref\tUniProtKB/Swiss-Prot:{result['accession']}\n")
548+
annot.write(
549+
f"{result['query']}\tdb_xref\tUniProtKB/Swiss-Prot:{result['accession']}\n"
550+
)
539551
# add db_xref
540552
a = add2dict(
541553
a,
@@ -579,26 +591,98 @@ def swissprot2tsv(results, output, annots):
579591

580592
def parse_annotations(tsv):
    """
    Parse a three-column annotation file into a dictionary with robust error handling.

    Each data line must hold exactly three tab-separated, non-empty columns: a gene,
    a database identifier, and a value. Valid lines are accumulated through add2dict.
    Empty lines and comment lines (first non-blank character is "#") are counted and
    skipped. Any other line is recorded as malformed and skipped. Errors raised while
    opening or reading the file are caught, logged, and recorded under "file_error"
    instead of being raised to the caller.

    Parameters:
    - tsv (str): The file path to the annotation file in TSV format.

    Returns:
    - tuple: (dict, dict) A tuple containing:
        - dict: The annotations accumulated via add2dict (empty if nothing was parsed).
        - dict: Parsing report with the keys "file", "malformed_lines" (list of dicts
          holding "line_number", "line_content", "column_count" and "error"),
          "empty_lines", "comment_lines", "total_lines", "parsed_lines" and, only when
          the file could not be read, "file_error".
    """
    import logging

    logger = logging.getLogger(__name__)

    a = {}
    parse_errors = {
        "file": tsv,
        "malformed_lines": [],
        "empty_lines": 0,
        "comment_lines": 0,
        "total_lines": 0,
        "parsed_lines": 0,
    }

    try:
        with open(tsv, "r") as infile:
            for line_num, raw_line in enumerate(infile, 1):
                parse_errors["total_lines"] += 1
                line = raw_line.rstrip()

                # Skip empty (or whitespace-only) lines
                if not line:
                    parse_errors["empty_lines"] += 1
                    continue

                # Skip comment lines; tolerate indentation before the "#"
                if line.lstrip().startswith("#"):
                    parse_errors["comment_lines"] += 1
                    continue

                # Split by tab and validate the column layout
                columns = line.split("\t")
                if len(columns) != 3:
                    error = f"Expected 3 columns, found {len(columns)}"
                else:
                    gene, db, value = (column.strip() for column in columns)
                    if gene and db and value:
                        error = None
                    else:
                        error = "One or more columns are empty"

                if error is not None:
                    parse_errors["malformed_lines"].append(
                        {
                            "line_number": line_num,
                            "line_content": line,
                            "column_count": len(columns),
                            "error": error,
                        }
                    )
                    continue

                # Successfully parsed line
                a = add2dict(a, gene, db, value)
                parse_errors["parsed_lines"] += 1

    except FileNotFoundError:
        logger.error(f"Annotation file not found: {tsv}")
        parse_errors["file_error"] = f"File not found: {tsv}"
    except PermissionError:
        logger.error(f"Permission denied reading annotation file: {tsv}")
        parse_errors["file_error"] = f"Permission denied: {tsv}"
    except Exception as e:
        # Deliberate best-effort: keep whatever was parsed and report the failure
        logger.error(f"Unexpected error reading annotation file {tsv}: {str(e)}")
        parse_errors["file_error"] = f"Unexpected error: {str(e)}"

    return a, parse_errors
602686

603687

604688
def add2dict(adict, gene, key, value):
@@ -630,7 +714,12 @@ def add2dict(adict, gene, key, value):
630714

631715

632716
def swissprot_valid_gene(name):
    """
    Decide whether a SwissProt gene name passes this module's validity checks.

    The name is accepted only when all of the following hold: number_present(name)
    is truthy, the name is longer than two characters, morethanXnumbers(name, 3) is
    falsy, and the name contains no "." character.
    NOTE(review): number_present and morethanXnumbers are defined elsewhere in this
    file; presumably they test for any digit and for more than 3 digits - confirm.

    Parameters:
    - name (str): Candidate gene name.

    Returns:
    - bool: True if every check passes, otherwise False.
    """
    # bool() keeps the strict True/False return of the original if/else form
    return bool(
        number_present(name)
        and len(name) > 2
        and not morethanXnumbers(name, 3)
        and "." not in name
    )
@@ -746,9 +835,11 @@ def busco2tsv(results, buscodb, busco_results, annots):
746835
# so now we want to construct the 3 column
747836
a = {}
748837
with open(annots, "w") as annot:
749-
for k, v in natsorted(results.items(), key=lambda x: (x[1]["hit"], -x[1]["bitscore"])):
838+
for k, v in natsorted(
839+
results.items(), key=lambda x: (x[1]["hit"], -x[1]["bitscore"])
840+
):
750841
if v["name"] in busco_data:
751-
defline = f'BUSCO:{k} [{odb_version}] {busco_data.get(v["name"])["description"]}'
842+
defline = f"BUSCO:{k} [{odb_version}] {busco_data.get(v['name'])['description']}"
752843
else:
753844
defline = f"BUSCO:{k} [{odb_version}]"
754845
if v["hit"] not in a:

0 commit comments

Comments
 (0)