Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
f816bc2
wip
corneliusroemer Oct 8, 2025
4885508
Generate continents from country
corneliusroemer Oct 10, 2025
ad40fea
Replace nulls with empty string
corneliusroemer Oct 10, 2025
eae6e59
E2E
corneliusroemer Oct 11, 2025
6df1b9f
Only use latest version
corneliusroemer Oct 15, 2025
dc414aa
Sanitize newlines to make compatible with simple tsv tools
corneliusroemer Oct 15, 2025
fb48188
Map continent to region
corneliusroemer Oct 15, 2025
067c988
Inspiration from RSV PPX ingest
corneliusroemer Oct 17, 2025
92989d1
Fix snakemake>=9.11.9
corneliusroemer Oct 17, 2025
8dc9d9a
Configure for PPX data
corneliusroemer Oct 17, 2025
1d40b5b
f
corneliusroemer Oct 17, 2025
6cd660f
Map from INSDC -> PPX accessions
corneliusroemer Oct 17, 2025
067439a
Update example data and example data rule and configs
corneliusroemer Oct 17, 2025
9c7a403
comment
corneliusroemer Oct 17, 2025
4623dc1
Merge branch 'master' into ppx
corneliusroemer Oct 17, 2025
eef2a1d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 17, 2025
9ab3369
clean
corneliusroemer Oct 24, 2025
e25f18a
genbank -> ppx
corneliusroemer Oct 24, 2025
8224927
one more
corneliusroemer Oct 24, 2025
0b39106
Update example data
corneliusroemer Oct 24, 2025
7feb4ec
Adapt prepare sequences
corneliusroemer Oct 24, 2025
73bc10c
make script with shebang executable
corneliusroemer Oct 24, 2025
e7caf18
Use script as input so that changes trigger rerun
corneliusroemer Oct 24, 2025
120b55d
Fix automation paths
corneliusroemer Oct 24, 2025
4132fdb
To be reverted: use branch/ppx data for test builds
corneliusroemer Oct 24, 2025
518dcba
Fix mpxv
corneliusroemer Oct 24, 2025
ffa34af
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 24, 2025
ff24b87
We can use augur clades now for clade I as well
corneliusroemer Oct 31, 2025
284e29b
Speed up tree by hardcoding 4 threads
corneliusroemer Nov 4, 2025
2dbc71c
fix(clade): outgroup clade now called "unassigned"
corneliusroemer Nov 4, 2025
a9f2552
Restricted-until coloring plus INSDC accession url
corneliusroemer Nov 4, 2025
07adcaf
ingest: Upload ppx.ndjson.zst as ppx.ndjson.zst - not ncbi.ndjson.zst
corneliusroemer Nov 4, 2025
7302350
Enrich with group URL and submission_database in curate-urls
corneliusroemer Nov 4, 2025
ace54cb
Ingest output Pathoplexus group URL, surface in phylogenetic
corneliusroemer Nov 4, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ingest/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ Follow the [standard installation instructions](https://docs.nextstrain.org/en/l
Fetch sequences with

```sh
nextstrain build . data/ncbi.ndjson
nextstrain build . data/ppx.ndjson
```

Run the complete ingest pipeline with
Expand Down
2 changes: 1 addition & 1 deletion ingest/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,9 @@ rule all:
"results/metadata.tsv",


include: "rules/fetch_from_ncbi.smk"
include: "rules/curate.smk"
include: "rules/nextclade.smk"
include: "rules/fetch_from_ppx.smk"


if "custom_rules" in config:
Expand Down
6 changes: 1 addition & 5 deletions ingest/build-configs/nextstrain-automation/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,12 @@ upload:
dst: 's3://nextstrain-data/files/workflows/mpox'
# Mapping of files to upload, with key as remote file name and the value
# the local file path relative to the ingest directory.
# .gz/.xz targets will be removed by 28 July 2025
# to avoid duplicate files with different compressions
files_to_upload:
ncbi.ndjson.zst: data/ncbi.ndjson
metadata.tsv.gz: results/metadata.tsv
ppx.ndjson.zst: results/ppx.ndjson.zst
metadata.tsv.zst: results/metadata.tsv
sequences.fasta.xz: results/sequences.fasta
sequences.fasta.zst: results/sequences.fasta
nextclade.tsv.zst: results/nextclade.tsv
alignment.fasta.xz: results/alignment.fasta
alignment.fasta.zst: results/alignment.fasta
translations.zip: results/translations.zip

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def _get_all_targets(wildcards):
if send_slack_notifications:
all_targets.extend(
[
"data/notify/genbank-record-change.done",
"data/notify/input-data-change.done",
"data/notify/metadata-diff.done",
]
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,17 +22,17 @@ if not slack_envvars_defined:
S3_SRC = "s3://nextstrain-data/files/workflows/mpox"


rule notify_on_genbank_record_change:
rule notify_on_input_data_change:
input:
genbank_ndjson="data/ncbi.ndjson",
ppx_ndjson="results/ppx.ndjson.zst",
output:
touch("data/notify/genbank-record-change.done"),
touch("data/notify/input-data-change.done"),
params:
s3_src=S3_SRC,
vendored_scripts=VENDORED_SCRIPTS,
shell:
"""
{params.vendored_scripts}/notify-on-record-change {input.genbank_ndjson} {params.s3_src:q}/ncbi.ndjson.zst Genbank
{params.vendored_scripts}/notify-on-record-change {input.ppx_ndjson} {params.s3_src:q}/ppx.ndjson.zst Pathoplexus
"""


Expand Down
4 changes: 2 additions & 2 deletions ingest/build-configs/nextstrain-automation/upload.smk
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@ def _get_upload_inputs(wildcards):
if send_notifications:
flag_file = []

if inputs["file_to_upload"] == "data/ncbi.ndjson":
flag_file = "data/notify/genbank-record-change.done"
if inputs["file_to_upload"] == "results/ppx.ndjson.zst":
flag_file = "data/notify/input-data-change.done"
elif inputs["file_to_upload"] == "results/metadata.tsv":
flag_file = "data/notify/metadata-diff.done"

Expand Down
96 changes: 46 additions & 50 deletions ingest/defaults/config.yaml
Original file line number Diff line number Diff line change
@@ -1,63 +1,50 @@
# Pathogen NCBI Taxonomy ID
ncbi_taxon_id: '10244'
# The list of NCBI Datasets fields to include from NCBI Datasets output
# These need to be the "mnemonics" of the NCBI Datasets fields, see docs for full list of fields
# https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields
# Note: the "accession" field MUST be provided to match with the sequences
ncbi_datasets_fields:
- accession
- sourcedb
- isolate-lineage
- geo-region
- geo-location
- isolate-collection-date
- release-date
- update-date
- length
- host-name
- isolate-lineage-source
- bioprojects
- biosample-acc
- sra-accs
- submitter-names
- submitter-affiliation

# Params for the curate rule
curate:
# Fields to rename.
# This is the first step in the pipeline, so any references to field names
# in the configs below should use the new field names
field_map:
accession: accession
accession_version: genbank_accession_rev
sourcedb: database
isolate-lineage: strain
geo-region: region
geo-location: location
isolate-collection-date: date
release-date: date_released
update-date: date_updated
accession: PPX_accession
accessionVersion: PPX_accession_version
insdcAccessionBase: INSDC_accession
insdcAccessionFull: INSDC_accession_version
# sourcedb: database # Where original submission was made: Pathoplexus vs INSDC
specimenCollectorSampleId: strain
geoLocContinent: region
geoLocCountry: country
geoLocAdmin1: division
geoLocAdmin2: location
geoLocCity: city
geoLocSite: site
# geo-region: region # We need to derive region from country (possibly do in PPX preprocessing)
sampleCollectionDate: date
earliestReleaseDate: date_released
releasedDate: date_updated
length: length
host-name: host
isolate-lineage-source: isolation_source
bioprojects: bioproject_accession
biosample-acc: biosample_accessions
sra-accs: sra_accession
submitter-names: full_authors
submitter-affiliation: institution
hostNameScientific: host
# isolate-lineage-source: isolation_source
bioprojectAccession: bioproject_accession
biosampleAccession: biosample_accessions
insdcRawReadsAccession: sra_accession
authors: full_authors
authorAffiliations: institution
dataUseTerms: dataUseTerms
dataUseTermsRestrictedUntil: restrictedUntil
dataUseTermsUrl: dataUseTerms__url
groupId: Pathoplexus_group_id
groupName: Pathoplexus_group
# TODO: There are more fields I should look into porting
# Standardized strain name regex
# Currently accepts any characters because we do not have a clear standard for strain names
strain_regex: '^.+$'
# Back up strain name field if 'strain' doesn't match regex above
strain_backup_fields: ['accession']
strain_backup_fields: ['submissionId','PPX_accession']
# List of date fields to standardize
date_fields: ['date', 'date_released', 'date_updated']
# Expected date formats present in date fields
# These date formats should use directives expected by datetime
# See https://docs.python.org/3.9/library/datetime.html#strftime-and-strptime-format-codes
expected_date_formats: ['%Y', '%Y-%m', '%Y-%m-%d', '%Y-%m-%dT%H:%M:%SZ']
# The expected field that contains the GenBank geo_loc_name
genbank_location_field: location
# Titlecase rules
titlecase:
# Abbreviations not cast to titlecase, keeps uppercase
Expand All @@ -80,17 +67,19 @@ curate:
# User annotations file
annotations: 'annotations.tsv'
# ID field used to merge annotations
annotations_id: 'accession'
annotations_id: 'PPX_accession'
# Field to use as the sequence ID in the FASTA file
id_field: 'accession'
id_field: 'PPX_accession'
# Field to use as the sequence in the FASTA file
sequence_field: 'sequence'
# The field in the NDJSON record that contains the actual GenBank accession
genbank_accession: 'accession'
# Final output columns for the metadata TSV
metadata_columns: [
'accession',
'genbank_accession_rev',
'PPX_accession',
'PPX_accession__url',
'PPX_accession_version',
'PPX_accession_version__url',
'INSDC_accession_version',
'INSDC_accession_version__url',
'strain',
'date',
'region',
Expand All @@ -105,7 +94,14 @@ curate:
'authors',
'full_authors',
'institution',
'url'
'dataUseTerms',
'dataUseTerms__url',
'restrictedUntil',
'displayName',
'Pathoplexus_group',
'Pathoplexus_group_id',
'Pathoplexus_group__url',
'submission_database',
]

# Params for Nextclade related rules
Expand Down
64 changes: 30 additions & 34 deletions ingest/rules/curate.smk
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
This part of the workflow handles curating the data into standardized
formats and expects input file

sequences_ndjson = "data/ncbi.ndjson"
sequences_ndjson = "results/ppx_flat.ndjson.zst"

This will produce output files as

Expand All @@ -22,16 +22,38 @@ def format_field_map(field_map: dict[str, str]) -> list[str]:
return [f"{key}={value}" for key, value in field_map.items()]


# Runs scripts/generate_continent.py over the flattened PPX NDJSON and writes
# a new zstd-suffixed NDJSON file, which is the `sequences_ndjson` input of
# the `curate` rule.
# NOTE(review): going by the rule and script names, the script adds a
# continent value to each record — confirm against the script itself.
rule generate_continent:
    input:
        ndjson="results/ppx_flat.ndjson.zst",
        # The script is declared as an input so that edits to it trigger a rerun.
        script="scripts/generate_continent.py",
    output:
        ndjson="results/ppx_flat_continent.ndjson.zst",
    benchmark:
        "benchmarks/generate_continent.txt"
    log:
        "logs/generate_continent.txt",
    # The first shell line redirects both stdout and stderr through `tee`
    # into the rule's log file before the script is invoked.
    shell:
        r"""
        exec &> >(tee {log:q})

        python {input.script:q} \
            --input {input.ndjson:q} \
            --output {output.ndjson:q}
        """


rule curate:
input:
sequences_ndjson="data/ncbi.ndjson",
sequences_ndjson="results/ppx_flat_continent.ndjson.zst",
geolocation_rules=resolve_config_path(
config["curate"]["local_geolocation_rules"]
),
annotations=resolve_config_path(config["curate"]["annotations"]),
urls_script="scripts/curate-urls.py",
output:
metadata="data/all_metadata.tsv",
sequences="results/sequences.fasta",
# ndjson="results/curated.ndjson.zst",
benchmark:
"benchmarks/curate.txt"
log:
Expand All @@ -42,7 +64,6 @@ rule curate:
strain_backup_fields=config["curate"]["strain_backup_fields"],
date_fields=config["curate"]["date_fields"],
expected_date_formats=config["curate"]["expected_date_formats"],
genbank_location_field=config["curate"]["genbank_location_field"],
articles=config["curate"]["titlecase"]["articles"],
abbreviations=config["curate"]["titlecase"]["abbreviations"],
titlecase_fields=config["curate"]["titlecase"]["fields"],
Expand All @@ -56,7 +77,10 @@ rule curate:
r"""
exec &> >(tee {log:q})

cat {input.sequences_ndjson:q} \
# TODO
# - Curate doesn't handle PPX authors

zstdcat {input.sequences_ndjson:q} \
| augur curate rename \
--field-map {params.field_map:q} \
| augur curate normalize-strings \
Expand All @@ -66,8 +90,6 @@ rule curate:
| augur curate format-dates \
--date-fields {params.date_fields:q} \
--expected-date-formats {params.expected_date_formats:q} \
| augur curate parse-genbank-location \
--location-field {params.genbank_location_field:q} \
| augur curate titlecase \
--titlecase-fields {params.titlecase_fields:q} \
--articles {params.articles:q} \
Expand All @@ -78,6 +100,7 @@ rule curate:
--abbr-authors-field {params.abbr_authors_field:q} \
| augur curate apply-geolocation-rules \
--geolocation-rules {input.geolocation_rules:q} \
| python {input.urls_script:q} \
| augur curate apply-record-annotations \
--annotations {input.annotations:q} \
--id-field {params.annotations_id:q} \
Expand All @@ -88,36 +111,9 @@ rule curate:
"""


rule add_metadata_columns:
"""Add columns to metadata
Notable columns:
- url: URL linking to the NCBI GenBank record ('https://www.ncbi.nlm.nih.gov/nuccore/*').
"""
input:
metadata="data/all_metadata.tsv",
output:
metadata=temp("data/all_metadata_added.tsv"),
params:
accession=config["curate"]["genbank_accession"],
benchmark:
"benchmarks/add_metadata_columns.txt"
log:
"logs/add_metadata_columns.txt",
shell:
r"""
exec &> >(tee {log:q})

csvtk mutate2 -t \
-n url \
-e '"https://www.ncbi.nlm.nih.gov/nuccore/" + ${params.accession:q}' \
{input.metadata:q} \
> {output.metadata:q}
"""


rule subset_metadata:
input:
metadata="data/all_metadata_added.tsv",
metadata="data/all_metadata.tsv",
output:
subset_metadata="data/subset_metadata.tsv",
params:
Expand Down
Loading
Loading