Skip to content

Commit 01cc254

Browse files
author
Jon Palmer
committed
add gene model length filtering prior to consensus #28
1 parent 1b8aa6a commit 01cc254

File tree

6 files changed

+376
-23
lines changed

6 files changed

+376
-23
lines changed

CITATION.cff

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
cff-version: version = "25.10.16"
1+
cff-version: version = "25.11.1"
22
title: 'funannotate2: eukaryotic genome annotation'
33
message: >-
44
If you use this software, please cite it using the
@@ -17,5 +17,5 @@ keywords:
1717
- functional annotation
1818
- consensus gene models
1919
license: BSD-2-Clause
20-
version: version = "25.10.16"
21-
date-released: '2025-10-16'
20+
version: version = "25.11.1"
21+
date-released: '2025-11-01'

funannotate2/__main__.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -293,6 +293,20 @@ def predict_subparser(subparsers):
293293
help="Skip specific ab initio predictors (choices: snap, augustus, glimmerhmm, genemark)",
294294
metavar="",
295295
)
296+
optional_args.add_argument(
297+
"--min-protein-length",
298+
type=int,
299+
default=30,
300+
help="Minimum protein length in amino acids for gene models (default: 30)",
301+
metavar="",
302+
)
303+
optional_args.add_argument(
304+
"--max-protein-length",
305+
type=int,
306+
default=30000,
307+
help="Maximum protein length in amino acids for gene models (default: 30000)",
308+
metavar="",
309+
)
296310
other_args = group.add_argument_group("Other arguments")
297311
other_args.add_argument(
298312
"-h",

funannotate2/predict.py

Lines changed: 60 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
from .align import align_mito, align_proteins, align_transcripts
2525
from .config import env
2626
from .database import fetch_pretrained_species
27+
from .utilities import filter_and_write_gff3
2728
from .fastx import (
2829
analyzeAssembly,
2930
annotate_fasta,
@@ -451,14 +452,16 @@ def predict(args):
451452
gene_counts[ab] = 0
452453
if ab == "augustus": # split hiq and regular
453454
gene_counts["augustus-hiq"] = 0
454-
abinitio_preds.append(os.path.join(misc_dir, f"predictions.{ab}.gff3"))
455-
abinitio_preds.append(
456-
os.path.join(misc_dir, f"predictions.{ab}-hiq.gff3")
457-
)
458-
with open(os.path.join(misc_dir, f"predictions.{ab}.gff3"), "w") as aug:
459-
with open(
460-
os.path.join(misc_dir, f"predictions.{ab}-hiq.gff3"), "w"
461-
) as hiq:
455+
aug_output = os.path.join(misc_dir, f"predictions.{ab}.gff3")
456+
hiq_output = os.path.join(misc_dir, f"predictions.{ab}-hiq.gff3")
457+
abinitio_preds.append(aug_output)
458+
abinitio_preds.append(hiq_output)
459+
460+
# First consolidate all predictions into temporary files
461+
aug_temp = os.path.join(misc_dir, f"predictions.{ab}.temp.gff3")
462+
hiq_temp = os.path.join(misc_dir, f"predictions.{ab}-hiq.temp.gff3")
463+
with open(aug_temp, "w") as aug:
464+
with open(hiq_temp, "w") as hiq:
462465
aug.write("##gff-version 3\n")
463466
hiq.write("##gff-version 3\n")
464467
for f in natsorted(os.listdir(tmp_dir)):
@@ -467,31 +470,69 @@ def predict(args):
467470
for line in infile:
468471
if line.startswith("#"):
469472
continue
470-
if "\tgene\t" in line:
471-
if "augustus-hiq" in line:
472-
gene_counts["augustus-hiq"] += 1
473-
else:
474-
gene_counts[ab] += 1
475473
if "augustus-hiq" in line:
476474
hiq.write(line)
477475
else:
478476
aug.write(line)
477+
478+
# Now filter both files
479+
aug_stats = filter_and_write_gff3(
480+
aug_temp,
481+
aug_output,
482+
min_protein_length=args.min_protein_length,
483+
max_protein_length=args.max_protein_length,
484+
)
485+
hiq_stats = filter_and_write_gff3(
486+
hiq_temp,
487+
hiq_output,
488+
min_protein_length=args.min_protein_length,
489+
max_protein_length=args.max_protein_length,
490+
)
491+
gene_counts[ab] = aug_stats["kept"]
492+
gene_counts["augustus-hiq"] = hiq_stats["kept"]
493+
494+
# Clean up temp files
495+
os.remove(aug_temp)
496+
os.remove(hiq_temp)
497+
498+
logger.info(
499+
f"Augustus predictions filtered: {aug_stats['kept']} kept, {aug_stats['filtered']} filtered (protein length {args.min_protein_length}-{args.max_protein_length} aa)"
500+
)
501+
logger.info(
502+
f"Augustus-hiq predictions filtered: {hiq_stats['kept']} kept, {hiq_stats['filtered']} filtered (protein length {args.min_protein_length}-{args.max_protein_length} aa)"
503+
)
479504
else:
480-
abinitio_preds.append(os.path.join(misc_dir, f"predictions.{ab}.gff3"))
481-
with open(
482-
os.path.join(misc_dir, f"predictions.{ab}.gff3"), "w"
483-
) as outfile:
505+
output_file = os.path.join(misc_dir, f"predictions.{ab}.gff3")
506+
abinitio_preds.append(output_file)
507+
508+
# First consolidate all predictions into temporary file
509+
temp_file = os.path.join(misc_dir, f"predictions.{ab}.temp.gff3")
510+
with open(temp_file, "w") as outfile:
484511
outfile.write("##gff-version 3\n")
485512
for f in natsorted(os.listdir(tmp_dir)):
486513
if f.endswith(f"{ab}.gff3"):
487514
with open(os.path.join(tmp_dir, f), "r") as infile:
488515
for line in infile:
489516
if line.startswith("#"):
490517
continue
491-
if "\tgene\t" in line:
492-
gene_counts[ab] += 1
493518
outfile.write(line)
494519

520+
# Now filter the file
521+
stats = filter_and_write_gff3(
522+
temp_file,
523+
output_file,
524+
min_protein_length=args.min_protein_length,
525+
max_protein_length=args.max_protein_length,
526+
)
527+
gene_counts[ab] = stats["kept"]
528+
529+
# Clean up temp file
530+
os.remove(temp_file)
531+
532+
logger.info(
533+
f"{ab} predictions filtered: {stats['kept']} kept, {stats['filtered']} filtered (protein length {args.min_protein_length}-{args.max_protein_length} aa)"
534+
)
535+
495536
# clean up
496537
shutil.rmtree(tmp_dir)
497538

funannotate2/utilities.py

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1454,3 +1454,110 @@ def rename_gff_contigs(gff, output, contigHeaderMap):
14541454
if cols[0] in contigHeaderMap:
14551455
cols[0] = contigHeaderMap[cols[0]]
14561456
outfile.write("\t".join(cols))
1457+
1458+
1459+
def filter_and_write_gff3(
1460+
input_gff, output_gff, min_protein_length=30, max_protein_length=30000
1461+
):
1462+
"""
1463+
Filter gene models by protein length while writing GFF3 file.
1464+
1465+
This function reads a GFF3 file line by line, calculates protein length from CDS coordinates,
1466+
and only writes genes and their associated features if the protein length is within the
1467+
specified range [min_protein_length, max_protein_length].
1468+
1469+
Parameters:
1470+
- input_gff (str): Path to the input GFF3 file.
1471+
- output_gff (str): Path to the output GFF3 file.
1472+
- min_protein_length (int): Minimum protein length in amino acids (default: 30).
1473+
- max_protein_length (int): Maximum protein length in amino acids (default: 30000).
1474+
1475+
Returns:
1476+
- dict: Dictionary with keys 'kept' and 'filtered' containing counts of genes.
1477+
"""
1478+
1479+
def calculate_protein_length(cds_coords):
1480+
"""Calculate protein length from CDS coordinates."""
1481+
if not cds_coords:
1482+
return 0
1483+
# Sum up CDS lengths
1484+
total_cds_length = sum(end - start for start, end in cds_coords)
1485+
# Protein length is CDS length / 3
1486+
return total_cds_length // 3
1487+
1488+
# Track genes and their features
1489+
current_gene_id = None
1490+
current_gene_lines = []
1491+
current_cds_coords = []
1492+
genes_kept = 0
1493+
genes_filtered = 0
1494+
1495+
with open(output_gff, "w") as outfile:
1496+
outfile.write("##gff-version 3\n")
1497+
1498+
with open(input_gff, "r") as infile:
1499+
for line in infile:
1500+
# Skip header lines
1501+
if line.startswith("#"):
1502+
continue
1503+
1504+
cols = line.rstrip("\n").split("\t")
1505+
if len(cols) < 9:
1506+
continue
1507+
1508+
feature_type = cols[2]
1509+
start = int(cols[3])
1510+
end = int(cols[4])
1511+
1512+
# Parse attributes
1513+
attributes = cols[8]
1514+
attr_dict = {}
1515+
for attr in attributes.split(";"):
1516+
if "=" in attr:
1517+
key, val = attr.split("=", 1)
1518+
attr_dict[key] = val
1519+
1520+
# Handle gene features
1521+
if feature_type == "gene":
1522+
# Process previous gene if exists
1523+
if current_gene_id is not None:
1524+
protein_length = calculate_protein_length(current_cds_coords)
1525+
if min_protein_length <= protein_length <= max_protein_length:
1526+
for gene_line in current_gene_lines:
1527+
outfile.write(gene_line)
1528+
genes_kept += 1
1529+
else:
1530+
genes_filtered += 1
1531+
1532+
# Start new gene
1533+
current_gene_id = attr_dict.get("ID")
1534+
current_gene_lines = [line]
1535+
current_cds_coords = []
1536+
1537+
# Handle mRNA/transcript features
1538+
elif feature_type in ["mRNA", "transcript"]:
1539+
if current_gene_id is not None:
1540+
current_gene_lines.append(line)
1541+
1542+
# Handle CDS features
1543+
elif feature_type == "CDS":
1544+
if current_gene_id is not None:
1545+
current_gene_lines.append(line)
1546+
current_cds_coords.append((start, end))
1547+
1548+
# Handle other features (exon, etc.)
1549+
elif feature_type in ["exon", "five_prime_UTR", "three_prime_UTR"]:
1550+
if current_gene_id is not None:
1551+
current_gene_lines.append(line)
1552+
1553+
# Process last gene
1554+
if current_gene_id is not None:
1555+
protein_length = calculate_protein_length(current_cds_coords)
1556+
if min_protein_length <= protein_length <= max_protein_length:
1557+
for gene_line in current_gene_lines:
1558+
outfile.write(gene_line)
1559+
genes_kept += 1
1560+
else:
1561+
genes_filtered += 1
1562+
1563+
return {"kept": genes_kept, "filtered": genes_filtered}

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
44

55
[project]
66
name = "funannotate2"
7-
version = "25.10.16"
7+
version = "25.11.1"
88
description = "Funannotate2: eukarytoic genome annotation pipeline"
99
readme = {file = "README.md", content-type = "text/markdown"}
1010
authors = [

0 commit comments

Comments
 (0)