add gene model length filtering prior to consensus #28

Jon Palmer · Jon Palmer · commit 01cc25437e87 · 2025-11-01T14:28:59.000-07:00
diff --git a/CITATION.cff b/CITATION.cff
@@ -1,4 +1,4 @@
-cff-version: version = "25.10.16"
+cff-version: version = "25.11.1"
 title: 'funannotate2: eukaryotic genome annotation'
 message: >-
   If you use this software, please cite it using the
@@ -17,5 +17,5 @@ keywords:
   - functional annotation
   - consensus gene models
 license: BSD-2-Clause
-version: version = "25.10.16"
-date-released: '2025-10-16'
+version: version = "25.11.1"
+date-released: '2025-11-01'
diff --git a/funannotate2/__main__.py b/funannotate2/__main__.py
@@ -293,6 +293,20 @@ def predict_subparser(subparsers):
         help="Skip specific ab initio predictors (choices: snap, augustus, glimmerhmm, genemark)",
         metavar="",
     )
+    optional_args.add_argument(
+        "--min-protein-length",
+        type=int,
+        default=30,
+        help="Minimum protein length in amino acids for gene models (default: 30)",
+        metavar="",
+    )
+    optional_args.add_argument(
+        "--max-protein-length",
+        type=int,
+        default=30000,
+        help="Maximum protein length in amino acids for gene models (default: 30000)",
+        metavar="",
+    )
     other_args = group.add_argument_group("Other arguments")
     other_args.add_argument(
         "-h",
diff --git a/funannotate2/predict.py b/funannotate2/predict.py
@@ -24,6 +24,7 @@
 from .align import align_mito, align_proteins, align_transcripts
 from .config import env
 from .database import fetch_pretrained_species
+from .utilities import filter_and_write_gff3
 from .fastx import (
     analyzeAssembly,
     annotate_fasta,
@@ -451,14 +452,16 @@ def predict(args):
             gene_counts[ab] = 0
             if ab == "augustus":  # split hiq and regular
                 gene_counts["augustus-hiq"] = 0
-                abinitio_preds.append(os.path.join(misc_dir, f"predictions.{ab}.gff3"))
-                abinitio_preds.append(
-                    os.path.join(misc_dir, f"predictions.{ab}-hiq.gff3")
-                )
-                with open(os.path.join(misc_dir, f"predictions.{ab}.gff3"), "w") as aug:
-                    with open(
-                        os.path.join(misc_dir, f"predictions.{ab}-hiq.gff3"), "w"
-                    ) as hiq:
+                aug_output = os.path.join(misc_dir, f"predictions.{ab}.gff3")
+                hiq_output = os.path.join(misc_dir, f"predictions.{ab}-hiq.gff3")
+                abinitio_preds.append(aug_output)
+                abinitio_preds.append(hiq_output)
+
+                # First consolidate all predictions into temporary files
+                aug_temp = os.path.join(misc_dir, f"predictions.{ab}.temp.gff3")
+                hiq_temp = os.path.join(misc_dir, f"predictions.{ab}-hiq.temp.gff3")
+                with open(aug_temp, "w") as aug:
+                    with open(hiq_temp, "w") as hiq:
                         aug.write("##gff-version 3\n")
                         hiq.write("##gff-version 3\n")
                         for f in natsorted(os.listdir(tmp_dir)):
@@ -467,31 +470,69 @@ def predict(args):
                                     for line in infile:
                                         if line.startswith("#"):
                                             continue
-                                        if "\tgene\t" in line:
-                                            if "augustus-hiq" in line:
-                                                gene_counts["augustus-hiq"] += 1
-                                            else:
-                                                gene_counts[ab] += 1
                                         if "augustus-hiq" in line:
                                             hiq.write(line)
                                         else:
                                             aug.write(line)
+
+                # Now filter both files
+                aug_stats = filter_and_write_gff3(
+                    aug_temp,
+                    aug_output,
+                    min_protein_length=args.min_protein_length,
+                    max_protein_length=args.max_protein_length,
+                )
+                hiq_stats = filter_and_write_gff3(
+                    hiq_temp,
+                    hiq_output,
+                    min_protein_length=args.min_protein_length,
+                    max_protein_length=args.max_protein_length,
+                )
+                gene_counts[ab] = aug_stats["kept"]
+                gene_counts["augustus-hiq"] = hiq_stats["kept"]
+
+                # Clean up temp files
+                os.remove(aug_temp)
+                os.remove(hiq_temp)
+
+                logger.info(
+                    f"Augustus predictions filtered: {aug_stats['kept']} kept, {aug_stats['filtered']} filtered (protein length {args.min_protein_length}-{args.max_protein_length} aa)"
+                )
+                logger.info(
+                    f"Augustus-hiq predictions filtered: {hiq_stats['kept']} kept, {hiq_stats['filtered']} filtered (protein length {args.min_protein_length}-{args.max_protein_length} aa)"
+                )
             else:
-                abinitio_preds.append(os.path.join(misc_dir, f"predictions.{ab}.gff3"))
-                with open(
-                    os.path.join(misc_dir, f"predictions.{ab}.gff3"), "w"
-                ) as outfile:
+                output_file = os.path.join(misc_dir, f"predictions.{ab}.gff3")
+                abinitio_preds.append(output_file)
+
+                # First consolidate all predictions into temporary file
+                temp_file = os.path.join(misc_dir, f"predictions.{ab}.temp.gff3")
+                with open(temp_file, "w") as outfile:
                     outfile.write("##gff-version 3\n")
                     for f in natsorted(os.listdir(tmp_dir)):
                         if f.endswith(f"{ab}.gff3"):
                             with open(os.path.join(tmp_dir, f), "r") as infile:
                                 for line in infile:
                                     if line.startswith("#"):
                                         continue
-                                    if "\tgene\t" in line:
-                                        gene_counts[ab] += 1
                                     outfile.write(line)
 
+                # Now filter the file
+                stats = filter_and_write_gff3(
+                    temp_file,
+                    output_file,
+                    min_protein_length=args.min_protein_length,
+                    max_protein_length=args.max_protein_length,
+                )
+                gene_counts[ab] = stats["kept"]
+
+                # Clean up temp file
+                os.remove(temp_file)
+
+                logger.info(
+                    f"{ab} predictions filtered: {stats['kept']} kept, {stats['filtered']} filtered (protein length {args.min_protein_length}-{args.max_protein_length} aa)"
+                )
+
         # clean up
         shutil.rmtree(tmp_dir)
 
diff --git a/funannotate2/utilities.py b/funannotate2/utilities.py
@@ -1454,3 +1454,114 @@ def rename_gff_contigs(gff, output, contigHeaderMap):
                     if cols[0] in contigHeaderMap:
                         cols[0] = contigHeaderMap[cols[0]]
                     outfile.write("\t".join(cols))
+
+
+def filter_and_write_gff3(
+    input_gff, output_gff, min_protein_length=30, max_protein_length=30000
+):
+    """
+    Filter gene models by protein length while writing GFF3 file.
+
+    The input is read line by line and buffered one gene at a time: a gene line
+    opens a new model and the mRNA/transcript, CDS, exon, five_prime_UTR and
+    three_prime_UTR lines that follow it are treated as belonging to that gene.
+    When the next gene (or the end of the file) is reached, the buffered lines
+    are written only if the protein length is within the specified range
+    [min_protein_length, max_protein_length]. Comment lines, lines with fewer
+    than 9 columns, other feature types, features seen before the first gene,
+    and genes without an ID attribute are dropped without being counted.
+
+    Protein length is estimated as CDS length // 3. GFF3 coordinates are 1-based
+    and inclusive, so each CDS segment contributes (end - start + 1) bases. CDS
+    segments are grouped by their Parent attribute and the longest transcript is
+    used, so a gene with several isoforms is not judged on their combined length.
+    A stop codon annotated as part of the CDS is counted as one residue.
+
+    Parameters:
+    - input_gff (str): Path to the input GFF3 file.
+    - output_gff (str): Path to the output GFF3 file.
+    - min_protein_length (int): Minimum protein length in amino acids (default: 30).
+    - max_protein_length (int): Maximum protein length in amino acids (default: 30000).
+
+    Returns:
+    - dict: Dictionary with keys 'kept' and 'filtered' containing counts of genes.
+
+    Raises:
+    - ValueError: If a CDS line has a non-integer start or end column.
+    """
+    child_types = {"mRNA", "transcript", "exon", "five_prime_UTR", "three_prime_UTR"}
+    counts = {"kept": 0, "filtered": 0}
+
+    def parse_attributes(column):
+        """Split a GFF3 attribute column (key=value;key=value) into a dict."""
+        attr_dict = {}
+        for attr in column.strip().split(";"):
+            if "=" in attr:
+                key, val = attr.split("=", 1)
+                attr_dict[key.strip()] = val.strip()
+        return attr_dict
+
+    def calculate_protein_length(cds_by_parent):
+        """Return the codon count of the longest transcript's CDS (0 if no CDS)."""
+        if not cds_by_parent:
+            return 0
+        # +1 per segment because GFF3 start and end are both inclusive
+        longest_cds = max(
+            sum(end - start + 1 for start, end in coords)
+            for coords in cds_by_parent.values()
+        )
+        return longest_cds // 3
+
+    def finish_gene(outfile, gene_lines, cds_by_parent):
+        """Write the buffered gene if its protein length is in range and count it."""
+        protein_length = calculate_protein_length(cds_by_parent)
+        if min_protein_length <= protein_length <= max_protein_length:
+            outfile.writelines(gene_lines)
+            counts["kept"] += 1
+        else:
+            counts["filtered"] += 1
+
+    # State of the gene currently being buffered
+    current_gene_id = None
+    current_gene_lines = []
+    current_cds = {}
+
+    with open(input_gff, "r") as infile, open(output_gff, "w") as outfile:
+        outfile.write("##gff-version 3\n")
+
+        for line in infile:
+            # Skip header/comment lines
+            if line.startswith("#"):
+                continue
+
+            cols = line.rstrip("\n").split("\t")
+            if len(cols) < 9:
+                continue
+
+            feature_type = cols[2]
+
+            if feature_type == "gene":
+                # A new gene closes the previous one
+                if current_gene_id is not None:
+                    finish_gene(outfile, current_gene_lines, current_cds)
+                current_gene_id = parse_attributes(cols[8]).get("ID")
+                current_gene_lines = [line]
+                current_cds = {}
+            elif current_gene_id is None:
+                # Nothing to attach this feature to
+                continue
+            elif feature_type == "CDS":
+                current_gene_lines.append(line)
+                segment = (int(cols[3]), int(cols[4]))
+                # A CDS shared by several transcripts lists them comma-separated
+                parents = parse_attributes(cols[8]).get("Parent", "")
+                for parent in parents.split(","):
+                    current_cds.setdefault(parent, []).append(segment)
+            elif feature_type in child_types:
+                current_gene_lines.append(line)
+
+        # Process last gene
+        if current_gene_id is not None:
+            finish_gene(outfile, current_gene_lines, current_cds)
+
+    return counts
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "funannotate2"
-version = "25.10.16"
+version = "25.11.1"
 description = "Funannotate2: eukarytoic genome annotation pipeline"
 readme = {file = "README.md", content-type = "text/markdown"}
 authors = [
diff --git a/tests/unit/test_filter_gff3.py b/tests/unit/test_filter_gff3.py