better parsing of annotation data #25

Jon Palmer · Jon Palmer · commit 196bf62d0768 · 2025-08-04T08:16:50.000-07:00
diff --git a/funannotate2/annotate.py b/funannotate2/annotate.py
@@ -13,7 +13,7 @@
 from gfftk.gff import dict2gff3, gff2dict
 from gfftk.stats import annotation_stats
 
-from .config import env
+from .config import env, busco_taxonomy
 from .log import finishLogging, startLogging, system_info
 from .name_cleaner import (
     NameCleaner,
@@ -48,7 +48,121 @@
     get_odb_version,
     validate_busco_lineage,
 )
-from .config import busco_taxonomy
+
+
+def _report_parsing_errors(parse_errors, logger):
+    """
+    Log the parsing problems of one annotation file: a file-level error, or malformed-line details.
+
+    Parameters:
+    - parse_errors (dict): Report with optional keys 'file', 'file_error', 'total_lines', 'parsed_lines', 'malformed_lines', 'empty_lines', 'comment_lines'
+    - logger: Logger instance; its error(), warning() and info() methods are called
+    """
+    file_path = parse_errors.get("file", "Unknown file")
+
+    # A file-level error is reported on its own; line statistics are not logged in that case
+    if "file_error" in parse_errors:
+        logger.error(f"Failed to read annotation file: {file_path}")
+        logger.error(f"Error: {parse_errors['file_error']}")
+        return
+
+    # Missing keys fall back to zero counts / an empty list of malformed lines
+    total_lines = parse_errors.get("total_lines", 0)
+    parsed_lines = parse_errors.get("parsed_lines", 0)
+    malformed_lines = parse_errors.get("malformed_lines", [])
+    empty_lines = parse_errors.get("empty_lines", 0)
+    comment_lines = parse_errors.get("comment_lines", 0)
+
+    if malformed_lines:
+        logger.warning(f"Parsing issues found in annotation file: {file_path}")
+        logger.warning(
+            f"Total lines: {total_lines}, Successfully parsed: {parsed_lines}"
+        )
+        logger.warning(
+            f"Skipped: {len(malformed_lines)} malformed, {empty_lines} empty, {comment_lines} comment lines"
+        )
+
+        # Detail only the first max_errors_to_show malformed lines; the rest are just counted below
+        max_errors_to_show = 5
+        logger.warning(
+            f"First {min(len(malformed_lines), max_errors_to_show)} malformed lines:"
+        )
+
+        for error_info in malformed_lines[:max_errors_to_show]:
+            line_num = error_info.get("line_number", "Unknown")
+            line_content = error_info.get("line_content", "")
+            error_msg = error_info.get("error", "Unknown error")
+
+            # Cap the displayed content at 100 characters (first 97 plus "...")
+            if len(line_content) > 100:
+                line_content = line_content[:97] + "..."
+
+            logger.warning(f"  Line {line_num}: {error_msg}")
+            logger.warning(f"    Content: '{line_content}'")
+
+        if len(malformed_lines) > max_errors_to_show:
+            remaining = len(malformed_lines) - max_errors_to_show
+            logger.warning(f"  ... and {remaining} more malformed lines")
+
+        logger.warning(
+            "Consider fixing the annotation file format or contact the tool provider"
+        )
+    elif empty_lines > 0 or comment_lines > 0:
+        logger.info(
+            f"Annotation file {file_path}: skipped {empty_lines} empty and {comment_lines} comment lines"
+        )
+
+
+def _report_parsing_summary(parsing_errors, logger):
+    """
+    Log a WARNING-level summary of all annotation files that had parsing problems.
+
+    Parameters:
+    - parsing_errors (list): Per-file parse-report dicts; nothing is logged if empty
+    - logger: Logger instance; only its warning() method is called
+    """
+    if not parsing_errors:
+        return
+
+    total_files_with_errors = len(parsing_errors)
+    total_malformed_lines = sum(
+        len(pe.get("malformed_lines", [])) for pe in parsing_errors
+    )
+    total_parsed_lines = sum(pe.get("parsed_lines", 0) for pe in parsing_errors)
+    files_with_file_errors = sum(1 for pe in parsing_errors if "file_error" in pe)
+
+    logger.warning("=" * 60)
+    logger.warning("ANNOTATION FILE PARSING SUMMARY")
+    logger.warning("=" * 60)
+    logger.warning(f"Files processed with parsing issues: {total_files_with_errors}")
+
+    if files_with_file_errors > 0:
+        logger.warning(f"Files that could not be read: {files_with_file_errors}")
+
+    if total_malformed_lines > 0:
+        logger.warning(f"Total malformed lines skipped: {total_malformed_lines}")
+        logger.warning(f"Total lines successfully parsed: {total_parsed_lines}")
+
+    # Itemise every problem file, including ones that only hit a file-level error
+    if total_malformed_lines > 0 or files_with_file_errors > 0:
+        logger.warning("\nFiles with parsing issues:")
+        for pe in parsing_errors:
+            file_path = pe.get("file", "Unknown file")
+            if "file_error" in pe:
+                logger.warning(f"  {file_path}: FILE ERROR - {pe['file_error']}")
+            elif pe.get("malformed_lines"):
+                malformed_count = len(pe["malformed_lines"])
+                logger.warning(
+                    f"  {file_path}: {malformed_count} malformed lines, {pe.get('parsed_lines', 0)} parsed successfully"
+                )
+
+    logger.warning("\nRecommendations:")
+    logger.warning(
+        "1. Check annotation files are properly formatted (3 tab-separated columns)"
+    )
+    logger.warning("2. Ensure no extra spaces, missing tabs, or empty columns")
+    logger.warning("3. Contact the annotation tool provider if issues persist")
+    logger.warning("=" * 60)
 
 
 def _sortDict(d):
@@ -279,14 +393,27 @@ def annotate(args):
     all_annotations = [pfam_dict, dbcan_dict, swiss_dict, merops_dict, busco_dict]
     # we need to look for any additional annotations that might be added by f2a
     annots_in_dir = find_files(misc_dir, ".annotations.txt")
+    parsing_errors = []
     for annotfile in annots_in_dir:
-        a = parse_annotations(annotfile)
+        a, parse_errors = parse_annotations(annotfile)
         logger.info(f"Loaded {len(a)} annotations from {annotfile}")
+
+        # Report parsing errors if any
+        if parse_errors["malformed_lines"] or "file_error" in parse_errors:
+            parsing_errors.append(parse_errors)
+            _report_parsing_errors(parse_errors, logger)
+
         all_annotations.append(a)
     if args.annotations:
         for annotfile in args.annotations:
-            a = parse_annotations(annotfile)
+            a, parse_errors = parse_annotations(annotfile)
             logger.info(f"Loaded {len(a)} annotations from {annotfile}")
+
+            # Report parsing errors if any
+            if parse_errors["malformed_lines"] or "file_error" in parse_errors:
+                parsing_errors.append(parse_errors)
+                _report_parsing_errors(parse_errors, logger)
+
             all_annotations.append(a)
 
     # merge annotations into the gene/funannotate dictionary
@@ -449,5 +576,9 @@ def annotate(args):
     logger.info("Annotation Summary:")
     logger.info(f"\n{json.dumps(stats, indent=2)}")
 
+    # Report parsing errors summary if any occurred
+    if parsing_errors:
+        _report_parsing_summary(parsing_errors, logger)
+
     # finish
     finishLogging(log, vars(sys.modules[__name__])["__name__"])
diff --git a/funannotate2/search.py b/funannotate2/search.py
@@ -87,7 +87,8 @@ def hmmer_search(hmmfile, sequences, cpus=0, bit_cutoffs=None, evalue=10.0):
                             "seq_length": hit.length,
                             "hmm_length": s_domains[0]["hmm_length"],
                             "hmm_aln_length": s_domains[0]["hmm_aln"],
-                            "hmm_coverage": s_domains[0]["hmm_aln"] / s_domains[0]["hmm_length"],
+                            "hmm_coverage": s_domains[0]["hmm_aln"]
+                            / s_domains[0]["hmm_length"],
                             "bitscore": hit.score,
                             "evalue": hit.evalue,
                             "domains": s_domains,
@@ -157,15 +158,20 @@ def hmmer_scan(hmmfile, sequences, cpus=0, bit_cutoffs=None, evalue=10.0):
                             ),
                             "name": hit.name.decode(),
                             "accession": (
-                                None if hit.accession is None else hit.accession.decode()
+                                None
+                                if hit.accession is None
+                                else hit.accession.decode()
                             ),
                             "description": (
-                                None if hit.description is None else hit.description.decode()
+                                None
+                                if hit.description is None
+                                else hit.description.decode()
                             ),
                             "seq_length": len(top_hits.query.sequence),
                             "hmm_length": s_domains[0]["hmm_length"],
                             "hmm_aln_length": s_domains[0]["hmm_aln"],
-                            "hmm_coverage": s_domains[0]["hmm_aln"] / s_domains[0]["hmm_length"],
+                            "hmm_coverage": s_domains[0]["hmm_aln"]
+                            / s_domains[0]["hmm_length"],
                             "bitscore": hit.score,
                             "evalue": hit.evalue,
                             "domains": s_domains,
@@ -421,7 +427,9 @@ def merops2tsv(results, output, annots):
         json.dump(results, outfile, indent=2)
     with open(annots, "w") as annot:
         for result in results:
-            annot.write(f"{result['qseqid']}\tnote\tMEROPS:{result['sseqid']} {result['family']}\n")
+            annot.write(
+                f"{result['qseqid']}\tnote\tMEROPS:{result['sseqid']} {result['family']}\n"
+            )
             a = add2dict(
                 a,
                 result["qseqid"],
@@ -431,7 +439,9 @@ def merops2tsv(results, output, annots):
     return a
 
 
-def swissprot_blast(query, evalue=1e-5, cpus=1, min_pident=60, min_cov=60, max_target_seqs=1):
+def swissprot_blast(
+    query, evalue=1e-5, cpus=1, min_pident=60, min_cov=60, max_target_seqs=1
+):
     """
     Perform a BLAST search against the SwissProt database using Diamond.
 
@@ -535,7 +545,9 @@ def swissprot2tsv(results, output, annots):
         json.dump(results, outfile, indent=2)
     with open(annots, "w") as annot:
         for result in results:
-            annot.write(f"{result['query']}\tdb_xref\tUniProtKB/Swiss-Prot:{result['accession']}\n")
+            annot.write(
+                f"{result['query']}\tdb_xref\tUniProtKB/Swiss-Prot:{result['accession']}\n"
+            )
             # add db_xref
             a = add2dict(
                 a,
@@ -579,26 +591,98 @@ def swissprot2tsv(results, output, annots):
 
 def parse_annotations(tsv):
     """
-    Parse a three-column annotation file into a dictionary.
+    Parse a three-column annotation file into a dictionary, recording per-line parse problems.
 
     This function reads a TSV file containing annotations, where each line consists of a gene,
     a database identifier, and a value. It processes the file and returns a dictionary where
     each gene is a key, and its associated database and value are stored as entries.
 
+    Blank lines and lines starting with '#' are skipped and counted. Lines without exactly 3
+    non-empty tab-separated columns are skipped and recorded under 'malformed_lines'.
+
     Parameters:
     - tsv (str): The file path to the annotation file in TSV format.
 
     Returns:
-    - dict: A dictionary with genes as keys and their corresponding database and value entries.
+    - tuple: (dict, dict) A tuple containing (exceptions raised while reading are caught, not propagated):
+        - dict: A dictionary with genes as keys and their corresponding database and value entries.
+        - dict: Parse report with keys 'file', 'malformed_lines', 'empty_lines', 'comment_lines', 'total_lines', 'parsed_lines', plus 'file_error' when an exception was caught.
     """
+    import logging
+
+    logger = logging.getLogger(__name__)
+
     # parse a three column annotation file into a dictionary
     a = {}
-    with open(tsv, "r") as infile:
-        for line in infile:
-            line = line.rstrip()
-            gene, db, value = line.split("\t")
-            a = add2dict(a, gene, db, value)
-    return a
+    parse_errors = {
+        "file": tsv,
+        "malformed_lines": [],
+        "empty_lines": 0,
+        "comment_lines": 0,
+        "total_lines": 0,
+        "parsed_lines": 0,
+    }
+
+    try:
+        with open(tsv, "r") as infile:
+            for line_num, line in enumerate(infile, 1):
+                parse_errors["total_lines"] += 1
+                original_line = line
+                line = line.rstrip()
+
+                # Blank or whitespace-only lines are counted but never reported as malformed
+                if not line:
+                    parse_errors["empty_lines"] += 1
+                    continue
+
+                # Comment lines must start with '#' in column 0; leading whitespace is not stripped
+                if line.startswith("#"):
+                    parse_errors["comment_lines"] += 1
+                    continue
+
+                # NOTE: rstrip() above also drops trailing tabs, so an empty last column is seen as 2 columns here
+                columns = line.split("\t")
+
+                if len(columns) != 3:
+                    parse_errors["malformed_lines"].append(
+                        {
+                            "line_number": line_num,
+                            "line_content": original_line.rstrip(),
+                            "column_count": len(columns),
+                            "error": f"Expected 3 columns, found {len(columns)}",
+                        }
+                    )
+                    continue
+
+                gene, db, value = columns
+
+                # Reject blank gene/db fields (value cannot be blank here because of the rstrip() above)
+                if not gene.strip() or not db.strip() or not value.strip():
+                    parse_errors["malformed_lines"].append(
+                        {
+                            "line_number": line_num,
+                            "line_content": original_line.rstrip(),
+                            "column_count": len(columns),
+                            "error": "One or more columns are empty",
+                        }
+                    )
+                    continue
+
+                # Store the stripped fields; only lines reaching this point count as parsed
+                a = add2dict(a, gene.strip(), db.strip(), value.strip())
+                parse_errors["parsed_lines"] += 1
+
+    except FileNotFoundError:
+        logger.error(f"Annotation file not found: {tsv}")
+        parse_errors["file_error"] = f"File not found: {tsv}"
+    except PermissionError:
+        logger.error(f"Permission denied reading annotation file: {tsv}")
+        parse_errors["file_error"] = f"Permission denied: {tsv}"
+    except Exception as e:
+        logger.error(f"Unexpected error reading annotation file {tsv}: {str(e)}")
+        parse_errors["file_error"] = f"Unexpected error: {str(e)}"
+
+    return a, parse_errors
 
 
 def add2dict(adict, gene, key, value):
@@ -630,7 +714,12 @@ def add2dict(adict, gene, key, value):
 
 
 def swissprot_valid_gene(name):
-    if number_present(name) and len(name) > 2 and not morethanXnumbers(name, 3) and "." not in name:
+    if (
+        number_present(name)
+        and len(name) > 2
+        and not morethanXnumbers(name, 3)
+        and "." not in name
+    ):
         return True
     else:
         return False
@@ -746,9 +835,11 @@ def busco2tsv(results, buscodb, busco_results, annots):
     # so now we want to construct the 3 column
     a = {}
     with open(annots, "w") as annot:
-        for k, v in natsorted(results.items(), key=lambda x: (x[1]["hit"], -x[1]["bitscore"])):
+        for k, v in natsorted(
+            results.items(), key=lambda x: (x[1]["hit"], -x[1]["bitscore"])
+        ):
             if v["name"] in busco_data:
-                defline = f'BUSCO:{k} [{odb_version}] {busco_data.get(v["name"])["description"]}'
+                defline = f"BUSCO:{k} [{odb_version}] {busco_data.get(v['name'])['description']}"
             else:
                 defline = f"BUSCO:{k} [{odb_version}]"
             if v["hit"] not in a: