nextstrain · corneliusroemer · Oct 8, 2025 · Oct 10, 2025 · Oct 10, 2025 · Oct 11, 2025
diff --git a/ingest/README.md b/ingest/README.md
@@ -14,7 +14,7 @@ Follow the [standard installation instructions](https://docs.nextstrain.org/en/l
 Fetch sequences with
 
 ```sh
-nextstrain build . data/ncbi.ndjson
+nextstrain build . data/ppx.ndjson
 ```
 
 Run the complete ingest pipeline with

diff --git a/ingest/Snakefile b/ingest/Snakefile
@@ -26,9 +26,9 @@ rule all:
         "results/metadata.tsv",
 
 
-include: "rules/fetch_from_ncbi.smk"
+include: "rules/fetch_from_ppx.smk"
 include: "rules/curate.smk"
 include: "rules/nextclade.smk"
 
 
 if "custom_rules" in config:

diff --git a/ingest/build-configs/nextstrain-automation/config.yaml b/ingest/build-configs/nextstrain-automation/config.yaml
@@ -13,16 +13,11 @@ upload:
     dst: 's3://nextstrain-data/files/workflows/mpox'
     # Mapping of files to upload, with key as remote file name and the value
     # the local file path relative to the ingest directory.
-    # .gz/.xz targets will be removed by 28 July 2025
-    # to avoid duplicate files with different compressions
     files_to_upload:
-      ncbi.ndjson.zst: data/ncbi.ndjson
-      metadata.tsv.gz: results/metadata.tsv
+      ppx.ndjson.zst: results/ppx.ndjson.zst
       metadata.tsv.zst: results/metadata.tsv
-      sequences.fasta.xz: results/sequences.fasta
       sequences.fasta.zst: results/sequences.fasta
       nextclade.tsv.zst: results/nextclade.tsv
-      alignment.fasta.xz: results/alignment.fasta
       alignment.fasta.zst: results/alignment.fasta
       translations.zip: results/translations.zip
 

diff --git a/ingest/build-configs/nextstrain-automation/nextstrain_automation.smk b/ingest/build-configs/nextstrain-automation/nextstrain_automation.smk
@@ -38,7 +38,7 @@ def _get_all_targets(wildcards):
     if send_slack_notifications:
         all_targets.extend(
             [
-                "data/notify/genbank-record-change.done",
+                "data/notify/input-data-change.done",
                 "data/notify/metadata-diff.done",
             ]
         )

diff --git a/ingest/build-configs/nextstrain-automation/slack_notifications.smk b/ingest/build-configs/nextstrain-automation/slack_notifications.smk
@@ -22,17 +22,17 @@ if not slack_envvars_defined:
 S3_SRC = "s3://nextstrain-data/files/workflows/mpox"
 
 
-rule notify_on_genbank_record_change:
+rule notify_on_input_data_change:
     input:
-        genbank_ndjson="data/ncbi.ndjson",
+        ppx_ndjson="results/ppx.ndjson.zst",
     output:
-        touch("data/notify/genbank-record-change.done"),
+        touch("data/notify/input-data-change.done"),
     params:
         s3_src=S3_SRC,
         vendored_scripts=VENDORED_SCRIPTS,
     shell:
         """
-        {params.vendored_scripts}/notify-on-record-change {input.genbank_ndjson} {params.s3_src:q}/ncbi.ndjson.zst Genbank
+        {params.vendored_scripts}/notify-on-record-change {input.ppx_ndjson} {params.s3_src:q}/ppx.ndjson.zst Pathoplexus
         """
 
 

diff --git a/ingest/build-configs/nextstrain-automation/upload.smk b/ingest/build-configs/nextstrain-automation/upload.smk
@@ -36,8 +36,8 @@ def _get_upload_inputs(wildcards):
     if send_notifications:
         flag_file = []
 
-        if inputs["file_to_upload"] == "data/ncbi.ndjson":
-            flag_file = "data/notify/genbank-record-change.done"
+        if inputs["file_to_upload"] == "results/ppx.ndjson.zst":
+            flag_file = "data/notify/input-data-change.done"
         elif inputs["file_to_upload"] == "results/metadata.tsv":
             flag_file = "data/notify/metadata-diff.done"
 

diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml
@@ -1,63 +1,50 @@
-# Pathogen NCBI Taxonomy ID
-ncbi_taxon_id: '10244'
-# The list of NCBI Datasets fields to include from NCBI Datasets output
-# These need to be the "mnemonics" of the NCBI Datasets fields, see docs for full list of fields
-# https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields
-# Note: the "accession" field MUST be provided to match with the sequences
-ncbi_datasets_fields:
-  - accession
-  - sourcedb
-  - isolate-lineage
-  - geo-region
-  - geo-location
-  - isolate-collection-date
-  - release-date
-  - update-date
-  - length
-  - host-name
-  - isolate-lineage-source
-  - bioprojects
-  - biosample-acc
-  - sra-accs
-  - submitter-names
-  - submitter-affiliation
-
 # Params for the curate rule
 curate:
   # Fields to rename.
   # This is the first step in the pipeline, so any references to field names
   # in the configs below should use the new field names
   field_map:
-    accession: accession
-    accession_version: genbank_accession_rev
-    sourcedb: database
-    isolate-lineage: strain
-    geo-region: region
-    geo-location: location
-    isolate-collection-date: date
-    release-date: date_released
-    update-date: date_updated
+    accession: PPX_accession
+    accessionVersion: PPX_accession_version
+    insdcAccessionBase: INSDC_accession
+    insdcAccessionFull: INSDC_accession_version
+    # sourcedb: database  # Where original submission was made: Pathoplexus vs INSDC
+    specimenCollectorSampleId: strain
+    geoLocContinent: region
+    geoLocCountry: country
+    geoLocAdmin1: division
+    geoLocAdmin2: location
+    geoLocCity: city
+    geoLocSite: site
+    # The old NCBI geo-region -> region mapping is replaced by geoLocContinent above; TODO: confirm region is reliably derived from country upstream
+    sampleCollectionDate: date
+    earliestReleaseDate: date_released
+    releasedDate: date_updated
     length: length
-    host-name: host
-    isolate-lineage-source: isolation_source
-    bioprojects: bioproject_accession
-    biosample-acc: biosample_accessions
-    sra-accs: sra_accession
-    submitter-names: full_authors
-    submitter-affiliation: institution
+    hostNameScientific: host
+    # isolate-lineage-source: isolation_source
+    bioprojectAccession: bioproject_accession
+    biosampleAccession: biosample_accessions
+    insdcRawReadsAccession: sra_accession
+    authors: full_authors
+    authorAffiliations: institution
+    dataUseTerms: dataUseTerms
+    dataUseTermsRestrictedUntil: restrictedUntil
+    dataUseTermsUrl: dataUseTerms__url
+    groupId: Pathoplexus_group_id
+    groupName: Pathoplexus_group
+    # TODO: Review the remaining Pathoplexus fields and port any that are still needed
   # Standardized strain name regex
   # Currently accepts any characters because we do not have a clear standard for strain names
   strain_regex: '^.+$'
   # Back up strain name field if 'strain' doesn't match regex above
-  strain_backup_fields: ['accession']
+  strain_backup_fields: ['submissionId', 'PPX_accession']
   # List of date fields to standardize
   date_fields: ['date', 'date_released', 'date_updated']
   # Expected date formats present in date fields
   # These date formats should use directives expected by datetime
   # See https://docs.python.org/3.9/library/datetime.html#strftime-and-strptime-format-codes
   expected_date_formats: ['%Y', '%Y-%m', '%Y-%m-%d', '%Y-%m-%dT%H:%M:%SZ']
-  # The expected field that contains the GenBank geo_loc_name
-  genbank_location_field: location
   # Titlecase rules
   titlecase:
     # Abbreviations not cast to titlecase, keeps uppercase
@@ -80,17 +67,19 @@ curate:
   # User annotations file
   annotations: 'annotations.tsv'
   # ID field used to merge annotations
-  annotations_id: 'accession'
+  annotations_id: 'PPX_accession'
   # Field to use as the sequence ID in the FASTA file
-  id_field: 'accession'
+  id_field: 'PPX_accession'
   # Field to use as the sequence in the FASTA file
   sequence_field: 'sequence'
-  # The field in the NDJSON record that contains the actual GenBank accession
-  genbank_accession: 'accession'
   # Final output columns for the metadata TSV
   metadata_columns: [
-    'accession',
-    'genbank_accession_rev',
+    'PPX_accession',
+    'PPX_accession__url',
+    'PPX_accession_version',
+    'PPX_accession_version__url',
+    'INSDC_accession_version',
+    'INSDC_accession_version__url',
     'strain',
     'date',
     'region',
@@ -105,7 +94,14 @@ curate:
     'authors',
     'full_authors',
     'institution',
-    'url'
+    'dataUseTerms',
+    'dataUseTerms__url',
+    'restrictedUntil',
+    'displayName',
+    'Pathoplexus_group',
+    'Pathoplexus_group_id',
+    'Pathoplexus_group__url',
+    'submission_database',
   ]
 
 # Params for Nextclade related rules

diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk
@@ -2,7 +2,7 @@
 This part of the workflow handles curating the data into standardized
 formats and expects input file
 
-    sequences_ndjson = "data/ncbi.ndjson"
+    sequences_ndjson = "results/ppx_flat.ndjson.zst"
 
 This will produce output files as
 
@@ -22,16 +22,38 @@ def format_field_map(field_map: dict[str, str]) -> list[str]:
     return [f"{key}={value}" for key, value in field_map.items()]
 
 
+rule generate_continent:
+    input:
+        ndjson="results/ppx_flat.ndjson.zst",  # zstd-compressed NDJSON handed to the script via --input
+        script="scripts/generate_continent.py",  # declared as an input and run through {input.script}
+    output:
+        ndjson="results/ppx_flat_continent.ndjson.zst",  # read by rule curate as sequences_ndjson
+    benchmark:
+        "benchmarks/generate_continent.txt"
+    log:
+        "logs/generate_continent.txt",  # receives stdout and stderr via the tee redirect below
+    shell:
+        r"""
+        exec &> >(tee {log:q})
+
+        python {input.script:q} \
+            --input {input.ndjson:q} \
+            --output {output.ndjson:q}
+        """
+
+
 rule curate:
     input:
-        sequences_ndjson="data/ncbi.ndjson",
+        sequences_ndjson="results/ppx_flat_continent.ndjson.zst",
         geolocation_rules=resolve_config_path(
             config["curate"]["local_geolocation_rules"]
         ),
         annotations=resolve_config_path(config["curate"]["annotations"]),
+        urls_script="scripts/curate-urls.py",
     output:
         metadata="data/all_metadata.tsv",
         sequences="results/sequences.fasta",
+        # ndjson="results/curated.ndjson.zst",
     benchmark:
         "benchmarks/curate.txt"
     log:
@@ -42,7 +64,6 @@ rule curate:
         strain_backup_fields=config["curate"]["strain_backup_fields"],
         date_fields=config["curate"]["date_fields"],
         expected_date_formats=config["curate"]["expected_date_formats"],
-        genbank_location_field=config["curate"]["genbank_location_field"],
         articles=config["curate"]["titlecase"]["articles"],
         abbreviations=config["curate"]["titlecase"]["abbreviations"],
         titlecase_fields=config["curate"]["titlecase"]["fields"],
@@ -56,7 +77,10 @@ rule curate:
         r"""
         exec &> >(tee {log:q})
 
-        cat {input.sequences_ndjson:q} \
+        # TODO
+        # - Curate doesn't handle PPX authors
+
+        zstdcat {input.sequences_ndjson:q} \
             | augur curate rename \
                 --field-map {params.field_map:q} \
             | augur curate normalize-strings \
@@ -66,8 +90,6 @@ rule curate:
             | augur curate format-dates \
                 --date-fields {params.date_fields:q} \
                 --expected-date-formats {params.expected_date_formats:q} \
-            | augur curate parse-genbank-location \
-                --location-field {params.genbank_location_field:q} \
             | augur curate titlecase \
                 --titlecase-fields {params.titlecase_fields:q} \
                 --articles {params.articles:q} \
@@ -78,6 +100,7 @@ rule curate:
                 --abbr-authors-field {params.abbr_authors_field:q} \
             | augur curate apply-geolocation-rules \
                 --geolocation-rules {input.geolocation_rules:q} \
+            | python {input.urls_script:q} \
             | augur curate apply-record-annotations \
                 --annotations {input.annotations:q} \
                 --id-field {params.annotations_id:q} \
@@ -88,36 +111,9 @@ rule curate:
         """
 
 
-rule add_metadata_columns:
-    """Add columns to metadata
-    Notable columns:
-    - url: URL linking to the NCBI GenBank record ('https://www.ncbi.nlm.nih.gov/nuccore/*').
-    """
-    input:
-        metadata="data/all_metadata.tsv",
-    output:
-        metadata=temp("data/all_metadata_added.tsv"),
-    params:
-        accession=config["curate"]["genbank_accession"],
-    benchmark:
-        "benchmarks/add_metadata_columns.txt"
-    log:
-        "logs/add_metadata_columns.txt",
-    shell:
-        r"""
-        exec &> >(tee {log:q})
-
-        csvtk mutate2 -t \
-          -n url \
-          -e '"https://www.ncbi.nlm.nih.gov/nuccore/" + ${params.accession:q}' \
-          {input.metadata:q} \
-        > {output.metadata:q}
-        """
-
-
 rule subset_metadata:
     input:
-        metadata="data/all_metadata_added.tsv",
+        metadata="data/all_metadata.tsv",
     output:
         subset_metadata="data/subset_metadata.tsv",
     params: