Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
235 changes: 180 additions & 55 deletions modules/Bio/EnsEMBL/Analysis/Hive/Config/EnsemblAnnoHelixer_conf.pm
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am unable to see this script at the given location. Is it on a branch other than main in ensembl-genes?
registry_status_update_python_script => catfile( $self->o('enscode_root_dir'), 'ensembl-genes', 'src', 'python', 'ensembl', 'genes', 'info_from_registry', 'update_assembly_registry.py' ),
@JackCurragh

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See the ensembl-genes PR that I linked above for the corresponding changes. There are many changes from Anna that will need to be merged there in coordination with the merging of this PR.

Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ sub default_options {
'stable_id_start' => '', #optional, already defined in ProcessGCA When mapping is not required this is usually set to 0
'mapping_required' => '0',# If set to 1 this will run stable_id mapping sometime in the future. At the moment it does nothing
'uniprot_version' => 'uniprot_2021_04', # What UniProt data dir to use for various analyses
'production_name_modifier' => '', # Do not set unless working with non-reference strains, breeds etc. Must include _ in modifier, e.g. _hni for medaka strain HNI
'production_name_modifier' => 'reg_test', # Do not set unless working with non-reference strains, breeds etc. Must include _ in modifier, e.g. _hni for medaka strain HNI

# Keys for custom loading, only set/modify if that's what you're doing
'load_toplevel_only' => '1', # This will not load the assembly info and will instead take any chromosomes, unplaced and unlocalised scaffolds directly in the DNA table
Expand All @@ -97,7 +97,8 @@ sub default_options {
'busco_lower_threshold' => 50, # If the busco score is above this threshold and the difference less than 'busco_difference_threshold', the pre-release files will be produced
'busco_difference_threshold' => 10, # If the difference between the gene and protein busco score is less than this value, the pre-release files will be produced as long as the busco score is above 'busco_lower_threshold'


'mysql_dump_options' => '--max_allowed_packet=1000MB',

#gff file dump options
'gt_exe' => 'gt',
'gff3_tidy' => $self->o('gt_exe') . ' gff3 -tidy -sort -retainids -fixregionboundaries -force',
Expand Down Expand Up @@ -141,10 +142,11 @@ sub default_options {
print_protein_script_path => catfile( $self->o('ensembl_analysis_script'), 'genebuild', 'print_translations.pl' ),
ensembl_gst_script => catdir( $self->o('enscode_root_dir'), 'ensembl-genes', 'pipelines' , 'gene_symbol_classifier' ),
gst_dump_proteins_script => catfile( $self->o('ensembl_gst_script'), 'dump_protein_sequences.pl' ),
gst_load_symbols_script => catfile( $self->o('ensembl_gst_script'), 'load_gene_symbols.pl' ),
registry_status_update_script => catfile( $self->o('ensembl_analysis_script'), 'update_assembly_registry.pl' ),
core_metadata_script => catdir( $self->o('enscode_root_dir'), 'ensembl-genes', 'src', 'python', 'ensembl', 'genes', 'metadata', 'core_meta_data.py'),
core_stats_script => catdir( $self->o('enscode_root_dir'), 'ensembl-genes', 'src', 'perl', 'ensembl', 'genes', 'generate_species_homepage_stats.pl'),
gst_load_symbols_script => catfile( $self->o('ensembl_gst_script'), 'load_gene_symbols.pl' ),
registry_status_update_script => catfile( $self->o('ensembl_analysis_script'), 'update_assembly_registry.pl' ),
registry_status_update_python_script => catfile( $self->o('enscode_root_dir'), 'ensembl-genes', 'src', 'python', 'ensembl', 'genes', 'info_from_registry', 'update_assembly_registry.py' ),
core_metadata_script => catdir( $self->o('enscode_root_dir'), 'ensembl-genes', 'src', 'python', 'ensembl', 'genes', 'metadata', 'core_meta_data.py'),
core_stats_script => catdir( $self->o('enscode_root_dir'), 'ensembl-genes', 'src', 'perl', 'ensembl', 'genes', 'generate_species_homepage_stats.pl'),


########################
Expand Down Expand Up @@ -255,6 +257,7 @@ sub default_options {
'registry_db_server' => $ENV{GBS1}, # host for registry db
'registry_db_port' => $ENV{GBP1}, # port for registry db
'registry_db_name' => 'gb_assembly_registry',
'new_registry_db_name' => 'gb_assembly_metadata',

'core_db' => {
-dbname => $self->o('dna_db_name'),
Expand Down Expand Up @@ -434,17 +437,18 @@ sub pipeline_analyses {
# ASSEMBLY LOADING ANALYSES
#
###############################################################################
# 1) Process GCA - works out settings, flows them down the pipeline -> this should be seeded by another analysis later
# 1) Settings are worked out via the setup script and seeded into analysis 1
# 2) Standard create core, populate tables, download data etc
# 3) Either run gbiab or setup gbiab
# 4) Finalise steps


{
# Creates a reference db for each species
-logic_name => 'process_gca',
-module => 'Bio::EnsEMBL::Analysis::Hive::RunnableDB::ProcessGCA',
# Initial registry status update - first analysis in pipeline
-logic_name => 'update_registry_in_progress',
-module => 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd',
-parameters => {
# All the ProcessGCA parameters to ensure downstream flow works
'num_threads' => $self->o('num_threads'),
'dbowner' => $self->o('dbowner'),
'core_db' => $self->o('core_db'),
Expand All @@ -459,19 +463,31 @@ sub pipeline_analyses {
'override_clade' => $self->o('override_clade'),
'pipe_db' => $self->o('pipe_db'),
'current_genebuild' => $self->o('current_genebuild'),
'init_config' =>$self->o('init_config'),
'assembly_accession' =>$self->o('assembly_accession'),
'repeatmodeler_library' =>$self->o('repeatmodeler_library'),
'init_config' => $self->o('init_config'),
'assembly_accession' => $self->o('assembly_accession'),
'repeatmodeler_library' => $self->o('repeatmodeler_library'),

# The actual registry update command
cmd => 'python ' . $self->o('registry_status_update_python_script') .
' --host ' . $self->o('registry_db_server') .
' --port ' . $self->o('registry_db_port') .
' --user ' . $self->o('user') .
' --password ' . $self->o('password') .
' --database ' . $self->o('new_registry_db_name') .
' --assembly #assembly_accession#' .
' --status in_progress' .
' --genebuilder ' . $ENV{USER} .
' --annotation_source ensembl' .
' --annotation_method pending',
},
-rc_name => 'default',

-rc_name => '1GB',
-flow_into => {
1 => ['download_rnaseq_csv'],
},
-analysis_capacity => 1,
-input_ids => [
#{'assembly_accession' => 'GCA_910591885.1'},
],
-input_ids => [
# {'assembly_accession' => 'GCA_910591885.1'},
],
},
{
-logic_name => 'download_rnaseq_csv',
Expand Down Expand Up @@ -605,6 +621,27 @@ sub pipeline_analyses {
' --status "Insufficient Data"' ,
},
-rc_name => '1GB',
-flow_into => {
1 => ['update_new_registry_insufficient_data'],
},
},
{
-logic_name => 'update_new_registry_insufficient_data',
-module => 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd',
-parameters => {
cmd => 'python ' . $self->o('registry_status_update_python_script') .
' --host ' . $self->o('registry_db_server') .
' --port ' . $self->o('registry_db_port') .
' --user ' . $self->o('user') .
' --password ' . $self->o('password') .
' --database ' . $self->o('new_registry_db_name') .
' --assembly #assembly_accession#' .
' --status insufficient_data' .
' --genebuilder $USER' .
' --annotation_source ensembl' .
' --annotation_method pending',
},
-rc_name => '1GB',
},
{
-logic_name => 'fan_long_read_download',
Expand Down Expand Up @@ -814,9 +851,27 @@ sub pipeline_analyses {
-max_retry_count => 0,
-rc_name => 'default',
-flow_into => {
1 => ['load_taxonomy_info'],
1 => ['update_registry_anno_annotation_source'],
},
},
{
-logic_name => 'update_registry_anno_annotation_source',
-module => 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd',
-parameters => {
cmd => 'perl ' . $self->o('registry_status_update_script') .
' --user ' . $self->o('user') .
' --pass ' . $self->o('password') .
' --assembly_accession ' . '#assembly_accession#' .
' --registry_host ' . $self->o('registry_db_server') .
' --registry_port ' . $self->o('registry_db_port') .
' --registry_db ' . $self->o('registry_db_name') .
' --status "Insufficient Data"' ,
},
-rc_name => '1GB',
-flow_into => {
1 => ['load_taxonomy_info'],
},
},
{
# Load some meta info and seq_region_synonyms
-logic_name => 'helixer_load_meta_info',
Expand Down Expand Up @@ -852,9 +907,27 @@ sub pipeline_analyses {
-max_retry_count => 0,
-rc_name => 'default',
-flow_into => {
1 => ['load_taxonomy_info'],
1 => ['update_registry_helixer_annotation_source'],
},
},
{
-logic_name => 'update_registry_helixer_annotation_source',
-module => 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd',
-parameters => {
cmd => 'perl ' . $self->o('registry_status_update_script') .
' --user ' . $self->o('user') .
' --pass ' . $self->o('password') .
' --assembly_accession ' . '#assembly_accession#' .
' --registry_host ' . $self->o('registry_db_server') .
' --registry_port ' . $self->o('registry_db_port') .
' --registry_db ' . $self->o('registry_db_name') .
' --status "Insufficient Data"' ,
},
-rc_name => '1GB',
-flow_into => {
1 => ['load_taxonomy_info'],
},
},

{
-logic_name => 'load_taxonomy_info',
Expand Down Expand Up @@ -1489,7 +1562,7 @@ sub pipeline_analyses {
},
-rc_name => 'default',
-flow_into => {
1 => 'backbone_job_pipeline',
1 => 'update_registry_final',
2 => 'update_registry_as_check',
}
},
Expand All @@ -1507,6 +1580,66 @@ sub pipeline_analyses {
' --status "Check BUSCO"',
},
-rc_name => 'default',
-flow_into => {
1 => ['update_new_registry_as_check'],
}
},
{
-logic_name => 'update_new_registry_as_check',
-module => 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd',
-parameters => {
cmd => 'python ' . $self->o('registry_status_update_python_script') .
' --host ' . $self->o('registry_db_server') .
' --port ' . $self->o('registry_db_port') .
' --user ' . $self->o('user') .
' --password ' . $self->o('password') .
' --database ' . $self->o('new_registry_db_name') .
' --assembly #assembly_accession#' .
' --status check_busco' .
' --genebuilder $USER' .
' --annotation_source ensembl' .
' --annotation_method full_genebuild',
},
-rc_name => 'default',
},
{
-logic_name => 'update_registry_final',
-module => 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd',
-parameters => {
cmd => 'perl ' . $self->o('registry_status_update_script') .
' --user ' . $self->o('user') .
' --pass ' . $self->o('password') .
' --assembly_accession ' . '#assembly_accession#' .
' --registry_host ' . $self->o('registry_db_server') .
' --registry_port ' . $self->o('registry_db_port') .
' --registry_db ' . $self->o('registry_db_name') .
' --status "Completed"',
},
-rc_name => 'default',
-flow_into => {
1 => ['update_new_registry_final'],
}
},
{
-logic_name => 'update_new_registry_final',
-module => 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd',
-parameters => {
cmd => 'python ' . $self->o('registry_status_update_python_script') .
' --host ' . $self->o('registry_db_server') .
' --port ' . $self->o('registry_db_port') .
' --user ' . $self->o('user') .
' --password ' . $self->o('password') .
' --database ' . $self->o('new_registry_db_name') .
' --assembly #assembly_accession#' .
' --status completed' .
' --genebuilder $USER' .
' --annotation_source ensembl' .
' --annotation_method full_genebuild',
},
-rc_name => 'default',
-flow_into => {
1 => ['backbone_job_pipeline'],
}
},
{
-logic_name => 'backbone_job_pipeline',
Expand Down Expand Up @@ -1645,7 +1778,6 @@ sub pipeline_analyses {
-batch_size => 10,
-rc_name => '2GB',
},

{
-logic_name => 'rsync_ftp_release',
-module => 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd',
Expand All @@ -1654,38 +1786,10 @@ sub pipeline_analyses {
},
-rc_name => 'datamover',
-flow_into => {
1 => ['set_dir_permission'],
1 => ['delete_short_reads'],
},
},

{
-logic_name => 'set_dir_permission',
-module => 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd',
-parameters => {
cmd => "sudo -u genebuild find " . catdir($self->o('production_ftp_dir'), ucfirst($self->o('species_name'))) . " -user genebuild -exec chmod g+w {} \\;",
},
-rc_name => 'datamover',
-flow_into => {
1 => ['update_registry_pre_release'],
},
},
{
-logic_name => 'update_registry_pre_release',
-module => 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd',
-parameters => {
cmd => 'perl ' . $self->o('registry_status_update_script') .
' --user ' . $self->o('user') .
' --pass ' . $self->o('password') .
' --assembly_accession ' . '#assembly_accession#' .
' --registry_host ' . $self->o('registry_db_server') .
' --registry_port ' . $self->o('registry_db_port') .
' --registry_db ' . $self->o('registry_db_name') .
' --status "Pre-Released"',
},
-rc_name => '1GB',
-flow_into => { 1 => ['delete_short_reads'], },
},
{
-logic_name => 'delete_short_reads',
-module => 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd',
-parameters => {
Expand Down Expand Up @@ -1715,6 +1819,7 @@ sub pipeline_analyses {
-pass => $self->o('password'),
-driver => $self->o('hive_driver'),
},
'enscode_root_dir' => $self->o('enscode_root_dir'),
'create_type' => 'core_only',
},
-rc_name => 'default',
Expand All @@ -1736,15 +1841,16 @@ sub pipeline_analyses {
-pass => $self->o('password'),
-driver => $self->o('hive_driver'),
},
'dump_options' => $self->o('mysql_dump_options'),
'exclude_ehive' => 1,
},
-rc_name => '10GB',
-flow_into => {
1 => ['update_registry_final'],
},
},
{
-logic_name => 'update_registry_final',
{
-logic_name => 'update_registry_pre_release',
-module => 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd',
-parameters => {
cmd => 'perl ' . $self->o('registry_status_update_script') .
Expand All @@ -1754,9 +1860,28 @@ sub pipeline_analyses {
' --registry_host ' . $self->o('registry_db_server') .
' --registry_port ' . $self->o('registry_db_port') .
' --registry_db ' . $self->o('registry_db_name') .
' --status "Completed"',
' --status "Pre-Released"',
},
-rc_name => 'default',
-rc_name => '1GB',
-flow_into => { 1 => ['update_new_registry_pre_release'], },
},
{
-logic_name => 'update_new_registry_pre_release',
-module => 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd',
-parameters => {
cmd => 'python ' . $self->o('registry_status_update_python_script') .
' --host ' . $self->o('registry_db_server') .
' --port ' . $self->o('registry_db_port') .
' --user ' . $self->o('user') .
' --password ' . $self->o('password') .
' --database ' . $self->o('new_registry_db_name') .
' --assembly #assembly_accession#' .
' --status pre_released' .
' --genebuilder $USER' .
' --annotation_source ensembl' .
' --annotation_method full_genebuild',
},
-rc_name => '1GB',
},
];
}
Expand Down
2 changes: 1 addition & 1 deletion scripts/update_assembly_registry.pl
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@
$self->throw("Could not update annotation status");
}

if ($status eq 'completed') {
if ($status eq 'Completed') {
$sth = $registry_dba->dbc->prepare("UPDATE genebuild_status set date_completed =? where assembly_id=? and is_current=? and annotation_source = ?");
$sth->bind_param(1,$date);
$sth->bind_param(2,$registry_assembly_id);
Expand Down