|
1 | 1 | from __future__ import division |
2 | | -import os, subprocess, logging, sys, argparse, inspect, csv, time, re, shutil, datetime, glob, platform, multiprocessing |
| 2 | +import os, subprocess, logging, sys, argparse, inspect, csv, time, re, shutil, datetime, glob, platform, multiprocessing, itertools |
3 | 3 | from natsort import natsorted |
4 | 4 | import warnings |
5 | 5 | from Bio import SeqIO |
@@ -74,6 +74,9 @@ def readBlocks(source, pattern): |
74 | 74 | buffer.append( line ) |
75 | 75 | yield buffer |
76 | 76 |
|
| 77 | +def empty_line_sep(line): |
| 78 | + return line=='\n' |
| 79 | + |
77 | 80 | def get_parent_dir(directory): |
78 | 81 | return os.path.dirname(directory) |
79 | 82 |
|
@@ -1732,31 +1735,35 @@ def getTrainResults(input): |
1732 | 1735 | values3 = line.split('|') #get [6] and [7] |
1733 | 1736 | return (values1[1], values1[2], values2[6], values2[7], values3[6], values3[7]) |
1734 | 1737 |
|
1735 | | -def trainAugustus(AUGUSTUS_BASE, train_species, trainingset, genome, outdir, cpus): |
| 1738 | +def trainAugustus(AUGUSTUS_BASE, train_species, trainingset, genome, outdir, cpus, optimize): |
1736 | 1739 | RANDOMSPLIT = os.path.join(AUGUSTUS_BASE, 'scripts', 'randomSplit.pl') |
1737 | 1740 | OPTIMIZE = os.path.join(AUGUSTUS_BASE, 'scripts', 'optimize_augustus.pl') |
| 1741 | + NEW_SPECIES = os.path.join(AUGUSTUS_BASE, 'scripts', 'new_species.pl') |
1738 | 1742 | aug_cpus = '--cpus='+str(cpus) |
1739 | 1743 | species = '--species='+train_species |
1740 | 1744 | aug_log = os.path.join(outdir, 'logfiles', 'augustus_training.log') |
1741 | 1745 | trainingdir = 'tmp_opt_'+train_species |
1742 | 1746 | with open(aug_log, 'w') as logfile: |
1743 | | - subprocess.call([RANDOMSPLIT, trainingset, '200']) #split off 100 models for testing purposes |
1744 | | - if not CheckAugustusSpecies(train_species): #check if training set exists, if not run etraining |
1745 | | - subprocess.call(['etraining', species, trainingset], stderr = logfile, stdout = logfile) |
| 1747 | + if not CheckAugustusSpecies(train_species): |
| 1748 | + subprocess.call([NEW_SPECIES, species], stdout = logfile, stderr = logfile) |
| 1749 | + #run etraining again to only use best models from EVM for training |
| 1750 | + subprocess.call(['etraining', species, trainingset], stderr = logfile, stdout = logfile) |
| 1751 | + subprocess.call([RANDOMSPLIT, trainingset, '200']) #split off 200 models for testing purposes |
1746 | 1752 | with open(os.path.join(outdir, 'predict_misc', 'augustus.initial.training.txt'), 'w') as initialtraining: |
1747 | 1753 | subprocess.call(['augustus', species, trainingset+'.test'], stdout=initialtraining) |
1748 | 1754 | train_results = getTrainResults(os.path.join(outdir, 'predict_misc', 'augustus.initial.training.txt')) |
1749 | 1755 | log.info('Initial training: '+'{0:.2%}'.format(float(train_results[4]))+' genes predicted exactly and '+'{0:.2%}'.format(float(train_results[2]))+' of exons predicted exactly') |
1750 | | - #now run optimization |
1751 | | - subprocess.call([OPTIMIZE, species, aug_cpus, trainingset], stderr = logfile, stdout = logfile) |
1752 | | - #run etraining again |
1753 | | - subprocess.call(['etraining', species, trainingset], stderr = logfile, stdout = logfile) |
1754 | | - with open(os.path.join(outdir, 'predict_misc', 'augustus.final.training.txt'), 'w') as finaltraining: |
1755 | | - subprocess.call(['augustus', species, os.path.join(trainingdir, 'bucket1.gb')], stdout=finaltraining) |
1756 | | - train_results = getTrainResults(os.path.join(outdir, 'predict_misc', 'augustus.final.training.txt')) |
1757 | | - log.info('Optimized training: '+'{0:.2%}'.format(float(train_results[4]))+' genes predicted exactly and '+'{0:.2%}'.format(float(train_results[2]))+' of exons predicted exactly') |
1758 | | - #clean up tmp folder |
1759 | | - shutil.rmtree(trainingdir) |
| 1756 | + if optimize: |
| 1757 | + #now run optimization |
| 1758 | + subprocess.call([OPTIMIZE, species, aug_cpus, trainingset], stderr = logfile, stdout = logfile) |
| 1759 | + #run etraining again |
| 1760 | + subprocess.call(['etraining', species, trainingset], stderr = logfile, stdout = logfile) |
| 1761 | + with open(os.path.join(outdir, 'predict_misc', 'augustus.final.training.txt'), 'w') as finaltraining: |
| 1762 | + subprocess.call(['augustus', species, trainingset+'.test'], stdout=finaltraining) |
| 1763 | + train_results = getTrainResults(os.path.join(outdir, 'predict_misc', 'augustus.final.training.txt')) |
| 1764 | + log.info('Optimized training: '+'{0:.2%}'.format(float(train_results[4]))+' genes predicted exactly and '+'{0:.2%}'.format(float(train_results[2]))+' of exons predicted exactly') |
| 1765 | + #clean up tmp folder |
| 1766 | + shutil.rmtree(trainingdir) |
1760 | 1767 |
|
1761 | 1768 | HEADER = ''' |
1762 | 1769 | <!DOCTYPE html> |
|
0 commit comments