2424from .align import align_mito , align_proteins , align_transcripts
2525from .config import env
2626from .database import fetch_pretrained_species
27+ from .utilities import filter_and_write_gff3
2728from .fastx import (
2829 analyzeAssembly ,
2930 annotate_fasta ,
@@ -451,14 +452,16 @@ def predict(args):
451452 gene_counts [ab ] = 0
452453 if ab == "augustus" : # split hiq and regular
453454 gene_counts ["augustus-hiq" ] = 0
454- abinitio_preds .append (os .path .join (misc_dir , f"predictions.{ ab } .gff3" ))
455- abinitio_preds .append (
456- os .path .join (misc_dir , f"predictions.{ ab } -hiq.gff3" )
457- )
458- with open (os .path .join (misc_dir , f"predictions.{ ab } .gff3" ), "w" ) as aug :
459- with open (
460- os .path .join (misc_dir , f"predictions.{ ab } -hiq.gff3" ), "w"
461- ) as hiq :
455+ aug_output = os .path .join (misc_dir , f"predictions.{ ab } .gff3" )
456+ hiq_output = os .path .join (misc_dir , f"predictions.{ ab } -hiq.gff3" )
457+ abinitio_preds .append (aug_output )
458+ abinitio_preds .append (hiq_output )
459+
460+ # First consolidate all predictions into temporary files
461+ aug_temp = os .path .join (misc_dir , f"predictions.{ ab } .temp.gff3" )
462+ hiq_temp = os .path .join (misc_dir , f"predictions.{ ab } -hiq.temp.gff3" )
463+ with open (aug_temp , "w" ) as aug :
464+ with open (hiq_temp , "w" ) as hiq :
462465 aug .write ("##gff-version 3\n " )
463466 hiq .write ("##gff-version 3\n " )
464467 for f in natsorted (os .listdir (tmp_dir )):
@@ -467,31 +470,69 @@ def predict(args):
467470 for line in infile :
468471 if line .startswith ("#" ):
469472 continue
470- if "\t gene\t " in line :
471- if "augustus-hiq" in line :
472- gene_counts ["augustus-hiq" ] += 1
473- else :
474- gene_counts [ab ] += 1
475473 if "augustus-hiq" in line :
476474 hiq .write (line )
477475 else :
478476 aug .write (line )
477+
478+ # Now filter both files
479+ aug_stats = filter_and_write_gff3 (
480+ aug_temp ,
481+ aug_output ,
482+ min_protein_length = args .min_protein_length ,
483+ max_protein_length = args .max_protein_length ,
484+ )
485+ hiq_stats = filter_and_write_gff3 (
486+ hiq_temp ,
487+ hiq_output ,
488+ min_protein_length = args .min_protein_length ,
489+ max_protein_length = args .max_protein_length ,
490+ )
491+ gene_counts [ab ] = aug_stats ["kept" ]
492+ gene_counts ["augustus-hiq" ] = hiq_stats ["kept" ]
493+
494+ # Clean up temp files
495+ os .remove (aug_temp )
496+ os .remove (hiq_temp )
497+
498+ logger .info (
499+ f"Augustus predictions filtered: { aug_stats ['kept' ]} kept, { aug_stats ['filtered' ]} filtered (protein length { args .min_protein_length } -{ args .max_protein_length } aa)"
500+ )
501+ logger .info (
502+ f"Augustus-hiq predictions filtered: { hiq_stats ['kept' ]} kept, { hiq_stats ['filtered' ]} filtered (protein length { args .min_protein_length } -{ args .max_protein_length } aa)"
503+ )
479504 else :
480- abinitio_preds .append (os .path .join (misc_dir , f"predictions.{ ab } .gff3" ))
481- with open (
482- os .path .join (misc_dir , f"predictions.{ ab } .gff3" ), "w"
483- ) as outfile :
505+ output_file = os .path .join (misc_dir , f"predictions.{ ab } .gff3" )
506+ abinitio_preds .append (output_file )
507+
508+ # First consolidate all predictions into temporary file
509+ temp_file = os .path .join (misc_dir , f"predictions.{ ab } .temp.gff3" )
510+ with open (temp_file , "w" ) as outfile :
484511 outfile .write ("##gff-version 3\n " )
485512 for f in natsorted (os .listdir (tmp_dir )):
486513 if f .endswith (f"{ ab } .gff3" ):
487514 with open (os .path .join (tmp_dir , f ), "r" ) as infile :
488515 for line in infile :
489516 if line .startswith ("#" ):
490517 continue
491- if "\t gene\t " in line :
492- gene_counts [ab ] += 1
493518 outfile .write (line )
494519
520+ # Now filter the file
521+ stats = filter_and_write_gff3 (
522+ temp_file ,
523+ output_file ,
524+ min_protein_length = args .min_protein_length ,
525+ max_protein_length = args .max_protein_length ,
526+ )
527+ gene_counts [ab ] = stats ["kept" ]
528+
529+ # Clean up temp file
530+ os .remove (temp_file )
531+
532+ logger .info (
533+ f"{ ab } predictions filtered: { stats ['kept' ]} kept, { stats ['filtered' ]} filtered (protein length { args .min_protein_length } -{ args .max_protein_length } aa)"
534+ )
535+
495536 # clean up
496537 shutil .rmtree (tmp_dir )
497538
0 commit comments