11#!/usr/bin/env python
22
3- import sys , os , subprocess , inspect , multiprocessing , shutil , argparse , time
3+ import sys , os , subprocess , inspect , multiprocessing , shutil , argparse , time , re
44from Bio import SeqIO
55currentdir = os .path .dirname (os .path .abspath (inspect .getfile (inspect .currentframe ())))
66parentdir = os .path .dirname (currentdir )
@@ -35,7 +35,7 @@ def __init__(self,prog):
3535parser .add_argument ('--rna_bam' , help = 'BAM (sorted) of RNAseq aligned to reference for BRAKER1' )
3636parser .add_argument ('--min_intronlen' , default = 10 , help = 'Minimum intron length for gene models' )
3737parser .add_argument ('--max_intronlen' , default = 3000 , help = 'Maximum intron length for gene models' )
38- parser .add_argument ('--min_protlen' , default = 51 , type = int , help = 'Minimum amino acid length for valid gene model' )
38+ parser .add_argument ('--min_protlen' , default = 50 , type = int , help = 'Minimum amino acid length for valid gene model' )
3939parser .add_argument ('--keep_no_stops' , action = 'store_true' , help = 'Keep gene models without valid stop codons' )
4040parser .add_argument ('--cpus' , default = 2 , type = int , help = 'Number of CPUs to use' )
4141parser .add_argument ('--busco_seed_species' , default = 'anidulans' , help = 'Augustus species to use as initial training point for BUSCO' )
@@ -448,37 +448,74 @@ def __init__(self,prog):
448448 subprocess .call (['perl' , Converter , aug_out ], stdout = output , stderr = FNULL )
449449
450450 if not GeneMark :
451+ #count contigs
452+ num_contigs = lib .countfasta (MaskGenome )
451453 #now run GeneMark-ES, first check for gmhmm mod file, use if available otherwise run ES
452454 if not args .genemark_mod :
453- GeneMarkGFF3 = os .path .join (args .out , 'predict_misc' , 'genemark.gff' )
454- if not os .path .isfile (GeneMarkGFF3 ):
455- if args .organism == 'fungus' :
456- lib .RunGeneMarkES (MaskGenome , args .cpus , os .path .join (args .out , 'predict_misc' ), GeneMarkGFF3 , True )
457- else :
458- lib .RunGeneMarkES (MaskGenome , args .cpus , os .path .join (args .out , 'predict_misc' ), GeneMarkGFF3 , False )
459- GeneMarkTemp = os .path .join (args .out , 'predict_misc' , 'genemark.temp.gff' )
460- with open (GeneMarkTemp , 'w' ) as output :
461- subprocess .call (['perl' , Converter , GeneMarkGFF3 ], stdout = output , stderr = FNULL )
462- GeneMark = os .path .join (args .out , 'predict_misc' , 'genemark.evm.gff3' )
463- with open (GeneMark , 'w' ) as output :
464- with open (GeneMarkTemp , 'rU' ) as input :
465- lines = input .read ().replace ("Augustus" ,"GeneMark" )
466- output .write (lines )
455+ #if there are less than 2 data points (contigs, self-training fails), count contigs
456+ if num_contigs < 2 :
457+ lib .log .error ("GeneMark-ES cannot run with only a single contig, you must provide --ini_mod file to run GeneMark" )
458+ else :
459+ GeneMarkGFF3 = os .path .join (args .out , 'predict_misc' , 'genemark.gff' )
460+ if not os .path .isfile (GeneMarkGFF3 ):
461+ if args .organism == 'fungus' :
462+ lib .RunGeneMarkES (MaskGenome , args .cpus , os .path .join (args .out , 'predict_misc' ), GeneMarkGFF3 , True )
463+ else :
464+ lib .RunGeneMarkES (MaskGenome , args .cpus , os .path .join (args .out , 'predict_misc' ), GeneMarkGFF3 , False )
465+ GeneMarkTemp = os .path .join (args .out , 'predict_misc' , 'genemark.temp.gff' )
466+ with open (GeneMarkTemp , 'w' ) as output :
467+ subprocess .call (['perl' , Converter , GeneMarkGFF3 ], stdout = output , stderr = FNULL )
468+ GeneMark = os .path .join (args .out , 'predict_misc' , 'genemark.evm.gff3' )
469+ with open (GeneMark , 'w' ) as output :
470+ with open (GeneMarkTemp , 'rU' ) as input :
471+ lines = input .read ().replace ("Augustus" ,"GeneMark" )
472+ output .write (lines )
467473 else : #have training parameters file, so just run genemark with
468474 GeneMarkGFF3 = os .path .join (args .out , 'predict_misc' , 'genemark.gff' )
469- if not os .path .isfile (GeneMarkGFF3 ):
470- if args .organism == 'fungus' :
471- lib .RunGeneMark (MaskGenome , args .genemark_mod , args .cpus , os .path .join (args .out , 'predict_misc' ), GeneMarkGFF3 , True )
472- else :
473- lib .RunGeneMark (MaskGenome , args .genemark_mod , args .cpus , os .path .join (args .out , 'predict_misc' ), GeneMarkGFF3 , False )
474- GeneMarkTemp = os .path .join (args .out , 'predict_misc' , 'genemark.temp.gff' )
475- with open (GeneMarkTemp , 'w' ) as output :
476- subprocess .call (['perl' , Converter , GeneMarkGFF3 ], stdout = output , stderr = FNULL )
477- GeneMark = os .path .join (args .out , 'predict_misc' , 'genemark.evm.gff3' )
478- with open (GeneMark , 'w' ) as output :
479- with open (GeneMarkTemp , 'rU' ) as input :
480- lines = input .read ().replace ("Augustus" ,"GeneMark" )
481- output .write (lines )
475+ if num_contigs < 2 : #now can run modified prediction on single contig
476+ with open (MaskGenome , 'rU' ) as genome :
477+ for line in genome :
478+ if line .startswith ('>' ):
479+ header = line .replace ('>' , '' )
480+ header = header .replace ('\n ' , '' )
481+ GeneMark = os .path .join (args .out , 'predict_misc' , 'genemark.evm.gff3' )
482+ GeneMarkTemp = os .path .join (args .out , 'predict_misc' , 'genemark.temp.gff' )
483+ if not os .path .isfile (GeneMarkGFF3 ):
484+ lib .log .info ("Running GeneMark on single-contig assembly" )
485+ subprocess .call (['gmhmme3' , '-m' , args .genemark_mod , '-o' , GeneMarkGFF3 , '-f' , 'gff3' , MaskGenome ])
486+ #now open output and reformat
487+ lib .log .info ("Converting GeneMark GTF file to GFF3" )
488+ with open (GeneMarkTemp , 'w' ) as geneout :
489+ with open (GeneMarkGFF3 , 'rU' ) as genein :
490+ for line in genein :
491+ if not line .startswith ('#' ):
492+ if not '\t Intron\t ' in line :
493+ newline = line .replace ('seq' , header )
494+ newline = newline .replace ('.hmm3' , '' )
495+ geneout .write (newline )
496+ GeneMarkTemp2 = os .path .join (args .out , 'predict_misc' , 'genemark.temp2.gff' )
497+ with open (GeneMarkTemp2 , 'w' ) as output :
498+ subprocess .call (['perl' , Converter , GeneMarkTemp ], stdout = output , stderr = FNULL )
499+ with open (GeneMark , 'w' ) as output :
500+ with open (GeneMarkTemp2 , 'rU' ) as input :
501+ lines = input .read ().replace ("Augustus" ,"GeneMark" )
502+ output .write (lines )
503+
504+ else :
505+ GeneMarkGFF3 = os .path .join (args .out , 'predict_misc' , 'genemark.gff' )
506+ if not os .path .isfile (GeneMarkGFF3 ):
507+ if args .organism == 'fungus' :
508+ lib .RunGeneMark (MaskGenome , args .genemark_mod , args .cpus , os .path .join (args .out , 'predict_misc' ), GeneMarkGFF3 , True )
509+ else :
510+ lib .RunGeneMark (MaskGenome , args .genemark_mod , args .cpus , os .path .join (args .out , 'predict_misc' ), GeneMarkGFF3 , False )
511+ GeneMarkTemp = os .path .join (args .out , 'predict_misc' , 'genemark.temp.gff' )
512+ with open (GeneMarkTemp , 'w' ) as output :
513+ subprocess .call (['perl' , Converter , GeneMarkGFF3 ], stdout = output , stderr = FNULL )
514+ GeneMark = os .path .join (args .out , 'predict_misc' , 'genemark.evm.gff3' )
515+ with open (GeneMark , 'w' ) as output :
516+ with open (GeneMarkTemp , 'rU' ) as input :
517+ lines = input .read ().replace ("Augustus" ,"GeneMark" )
518+ output .write (lines )
482519
483520 if not Augustus :
484521 if not args .augustus_species :
@@ -542,28 +579,70 @@ def __init__(self,prog):
542579 if GM_check < 3 :
543580 gmc = 0
544581 lib .log .error ("GeneMark predictions failed, proceeding with only Augustus" )
582+
583+ #if hints used for Augustus, get high quality models > 80% coverage to pass to EVM
584+ if os .path .isfile (hints_all ):
585+ lib .log .info ("Pulling out high quality Augustus predictions" )
586+ hiQ_models = []
587+ with open (aug_out , 'rU' ) as augustus :
588+ for pred in lib .readBlocks (augustus , '# start gene' ):
589+ values = []
590+ geneID = ''
591+ support = ''
592+ if pred [0 ].startswith ('# This output' ):
593+ continue
594+ if pred [0 ].startswith ('##gff-version 3' ):
595+ continue
596+ for line in pred :
597+ line = line .replace ('\n ' , '' )
598+ if line .startswith ('# start gene' ):
599+ geneID = line .split (' ' )[- 1 ]
600+ values .append (geneID )
601+ if line .startswith ('# % of transcript supported by hints' ):
602+ support = line .split (' ' )[- 1 ]
603+ values .append (support )
604+ if float (values [1 ]) > 65 : #greater than ~66% of exons supported, i.e. 2/3 or 3/4, but not less
605+ hiQ_models .append (values [0 ])
606+
607+ #now open evm augustus and pull out models
608+ HiQ = set (hiQ_models )
609+ lib .log .info ("Found %i high quality predictions from Augustus (>66%% exon evidence)" % len (HiQ ))
610+ HiQ_match = re .compile (r'\b(?:%s)[\.t1;]+\b' % '|' .join (HiQ ))
611+ AugustusHiQ = os .path .join (args .out , 'predict_misc' , 'augustus-HiQ.evm.gff3' )
612+ with open (AugustusHiQ , 'w' ) as HiQ_out :
613+ with open (Augustus , 'rU' ) as evm_aug :
614+ for line in evm_aug :
615+ if HiQ_match .search (line ):
616+ newline = line .replace ('\t Augustus\t ' , '\t HiQ\t ' )
617+ HiQ_out .write (newline )
618+
545619
546620 #EVM related input tasks, find all predictions and concatenate together
547621 if args .pasa_gff :
548- pred_in = [Augustus , GeneMark , args .pasa_gff ]
622+ if os .path .isfile (hints_all ):
623+ pred_in = [Augustus , GeneMark , args .pasa_gff , AugustusHiQ ]
624+ else :
625+ pred_in = [Augustus , GeneMark , args .pasa_gff ]
549626 else :
550- pred_in = [Augustus , GeneMark ]
627+ if os .path .isfile (hints_all ):
628+ pred_in = [Augustus , GeneMark , AugustusHiQ ]
629+ else :
630+ pred_in = [Augustus , GeneMark ]
551631 Predictions = os .path .join (args .out , 'predict_misc' , 'predictions.gff3' )
552632 with open (Predictions , 'w' ) as output :
553633 for f in pred_in :
554634 with open (f ) as input :
555635 output .write (input .read ())
556636
557- #set Weights file dependent on which data is present. I have yet to find an example of where Augustus outperforms GeneMark for fungi, but i don't have too much evidence to think that genemark is perfect either....
637+ #set Weights file dependent on which data is present.
558638 Weights = os .path .join (args .out , 'predict_misc' , 'weights.evm.txt' )
559639 with open (Weights , 'w' ) as output :
640+ output .write ("ABINITIO_PREDICTION\t Augustus\t 1\n " )
641+ output .write ("ABINITIO_PREDICTION\t GeneMark\t 1\n " )
642+ if os .path .isfile (hints_all ):
643+ output .write ("OTHER_PREDICTION\t HiQ\t 10\n " )
560644 if args .pasa_gff :
561645 output .write ("OTHER_PREDICTION\t transdecoder\t 10\n " )
562- output .write ("ABINITIO_PREDICTION\t Augustus\t 1\n " )
563- output .write ("ABINITIO_PREDICTION\t GeneMark\t 1\n " )
564- else :
565- output .write ("ABINITIO_PREDICTION\t Augustus\t 1\n " )
566- output .write ("ABINITIO_PREDICTION\t GeneMark\t 1\n " )
567646 if exonerate_out :
568647 output .write ("PROTEIN\t exonerate\t 1\n " )
569648 if Transcripts :
@@ -607,6 +686,12 @@ def __init__(self,prog):
607686else :
608687 lib .log .info ('{0:,}' .format (total ) + ' total gene models from EVM' )
609688
689+ #move EVM folder to predict folder
690+ if os .path .isdir ('EVM_tmp' ):
691+ if os .path .isdir (os .path .join (args .out , 'predict_misc' , 'EVM' )):
692+ shutil .rmtree (os .path .join (args .out , 'predict_misc' , 'EVM' ))
693+ os .rename ('EVM_tmp' , os .path .join (args .out , 'predict_misc' , 'EVM' ))
694+
610695#run tRNAscan
611696lib .log .info ("Predicting tRNAs" )
612697tRNAscan = os .path .join (args .out , 'predict_misc' , 'trnascan.gff3' )
@@ -631,7 +716,7 @@ def __init__(self,prog):
631716lib .log .info ('{0:,}' .format (total ) + ' total gene models' )
632717
633718#filter bad models
634- lib .log .info ("Filtering out bad gene models (< %i aa in length, transposable elements, etc)." % (args .min_protlen - 1 ))
719+ lib .log .info ("Filtering out bad gene models (< %i aa in length, transposable elements, etc)." % (args .min_protlen ))
635720Blast_rep_remove = os .path .join (args .out , 'predict_misc' , 'repeat.gene.models.txt' )
636721if not os .path .isfile (Blast_rep_remove ):
637722 lib .RepeatBlast (GAG_proteins , args .cpus , 1e-10 , os .path .join (args .out , 'predict_misc' ), Blast_rep_remove )
0 commit comments