loculus-project · anna-parker · Aug 11, 2025 · Aug 11, 2025 · Aug 11, 2025 · Aug 11, 2025
diff --git a/integration-tests/tests/pages/review.page.ts b/integration-tests/tests/pages/review.page.ts
@@ -22,7 +22,7 @@ export class ReviewPage {
     async waitForZeroProcessing() {
         await expect(this.page.locator('[data-testid="review-page-control-panel"]')).toContainText(
             '0 awaiting processing',
-            { timeout: 60000 },
+            { timeout: 200000 },
         );
     }
 

diff --git a/integration-tests/tests/specs/features/search/lineage-field.spec.ts b/integration-tests/tests/specs/features/search/lineage-field.spec.ts
@@ -7,7 +7,7 @@ import { v4 as uuidv4 } from 'uuid';
 const SEQUENCE = 'ATTGATCTCATCATTT';
 
 test('Lineage field lineage counts', async ({ page, pageWithGroup }) => {
-    test.setTimeout(95_000);
+    test.setTimeout(900_000);
     const uuid = uuidv4();
 
     await page.goto('/');

diff --git a/kubernetes/loculus/templates/_common-metadata.tpl b/kubernetes/loculus/templates/_common-metadata.tpl
@@ -251,8 +251,8 @@ organisms:
       {{ end }}
       {{ .website | toYaml | nindent 6 }}
       {{- end }}
-    referenceGenomes:
-      {{ $instance.referenceGenomes | toYaml | nindent 6 }}
+    referenceGenome:
+      {{ include "loculus.mergeReferenceGenomes" $instance.referenceGenomes | nindent 6 }}
   {{- end }}
 {{- end }}
 

diff --git a/kubernetes/loculus/templates/loculus-preprocessing-config.yaml b/kubernetes/loculus/templates/loculus-preprocessing-config.yaml
@@ -2,7 +2,12 @@
 {{- $metadata := ($organismConfig.schema | include "loculus.patchMetadataSchema" | fromYaml).metadata }}
 {{- $rawNucleotideSequences := (($organismConfig.schema | include "loculus.patchMetadataSchema" | fromYaml).nucleotideSequences) }}
 {{- $nucleotideSequences := ($rawNucleotideSequences | default "" ) }}
-{{- $nucleotideSequencesList := (eq $rawNucleotideSequences nil | ternary (list "main") $rawNucleotideSequences) }}
+{{- $referenceGenomes:= include "loculus.mergeReferenceGenomes" $organismConfig.referenceGenomes | fromYaml }}
+{{- $genesList := (eq $referenceGenomes.genes nil | ternary (list) $referenceGenomes.genes) }}
+{{- $genesDict := dict "genes" (list) -}}
+{{- range $g := $genesList }}
+  {{- $_ := set $genesDict "genes" (append ($genesDict.genes) $g.name) -}}
+{{- end }}
 {{- range $processingIndex, $processingConfig := $organismConfig.preprocessing }}
 {{- if $processingConfig.configFile }}
 {{- /* Use the enaDepositionConfig as the base config */}}
@@ -17,7 +22,7 @@ data:
   preprocessing-config.yaml: |
     organism: {{ $organism }}
     {{- $preproAndEnaConfigFile | toYaml | nindent 4 }}
-    {{- (dict "nucleotideSequences" $nucleotideSequencesList) | toYaml | nindent 4 }}
+    {{- $genesDict | toYaml | nindent 4 }}
     processing_spec:
       {{- $args := dict "metadata" $metadata "nucleotideSequences" $nucleotideSequences }}
       {{- include "loculus.preprocessingSpecs" $args | nindent 6 }}

diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml
@@ -1312,7 +1312,6 @@ defaultOrganismConfig: &defaultOrganismConfig
       replicas: 2
       configFile: &preprocessingConfigFile
         log_level: DEBUG
-        genes: []
         batch_size: 100
         create_embl_file: true
   ingest: &ingest
@@ -1356,16 +1355,18 @@ defaultOrganisms:
       - <<: *preprocessing
         version: 1
         configFile:
-          <<: *preprocessingConfigFile      
-          genes: [NP, VP35, VP40, GP, sGP, ssGP, VP30, VP24, L]
-          nextclade_dataset_name: nextstrain/ebola/sudan
+          <<: *preprocessingConfigFile
+          nucleotideSequences:
+            - name: "main"
+              nextclade_dataset_name: nextstrain/ebola/sudan
           nextclade_dataset_server: https://raw.githubusercontent.com/nextstrain/nextclade_data/ebola/data_output
       - <<: *preprocessing
         version: 2
         configFile:
           <<: *preprocessingConfigFile
-          genes: [NP, VP35, VP40, GP, sGP, ssGP, VP30, VP24, L]
-          nextclade_dataset_name: nextstrain/ebola/sudan
+          nucleotideSequences:
+            - name: "main"
+              nextclade_dataset_name: nextstrain/ebola/sudan
           nextclade_dataset_server: https://raw.githubusercontent.com/nextstrain/nextclade_data/ebola/data_output
   west-nile:
     <<: *defaultOrganismConfig
@@ -1417,9 +1418,10 @@ defaultOrganisms:
         version: 1
         configFile:
           <<: *preprocessingConfigFile
-          nextclade_dataset_name: nextstrain/wnv/all-lineages
+          nucleotideSequences:
+            - name: "main"
+              nextclade_dataset_name: nextstrain/wnv/all-lineages
           nextclade_dataset_server: https://raw.githubusercontent.com/nextstrain/nextclade_data/wnv/data_output
-          genes: [capsid, prM, env, NS1, NS2A, NS2B, NS3, NS4A, 2K, NS4B, NS5]
     ingest:
       <<: *ingest
       configFile:
@@ -1674,8 +1676,9 @@ defaultOrganisms:
           - "prepro"
         configFile:
           log_level: DEBUG
-          genes: []
           batch_size: 100
+          nucleotideSequences:
+            - name: "main"
     referenceGenomes:
       singleReference:
         nucleotideSequences:
@@ -1734,9 +1737,14 @@ defaultOrganisms:
         configFile:
           <<: *preprocessingConfigFile
           log_level: DEBUG
-          nextclade_dataset_name: nextstrain/cchfv/linked
+          nucleotideSequences:
+            - name: L
+              nextclade_dataset_name: nextstrain/cchfv/linked/L
+            - name: M
+              nextclade_dataset_name: nextstrain/cchfv/linked/M
+            - name: S
+              nextclade_dataset_name: nextstrain/cchfv/linked/S
           nextclade_dataset_server: https://raw.githubusercontent.com/nextstrain/nextclade_data/cornelius-cchfv/data_output
-          genes: [RdRp, GPC, NP]
     ingest:
       <<: *ingest
       configFile:
@@ -1772,7 +1780,7 @@ defaultOrganisms:
             sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/cchf/NP.fasta]]"
   enteroviruses:
     <<: *defaultOrganismConfig
-    enabled: false
+    enabled: true
     schema:
       <<: *schema
       organismName: "Enterovirus"
@@ -1848,9 +1856,23 @@ defaultOrganisms:
         configFile:
           <<: *preprocessingConfigFile
           log_level: INFO
-          nextclade_dataset_name: community/hodcroftlab/enterovirus/enterovirus/linked
+          classify_with_nextclade_sort: True
+          minimizer_index: "https://raw.githubusercontent.com/alejandra-gonzalezsanchez/loculus-evs/master/evs_minimizer-index.json"
+          nucleotideSequences:
+            - name: CV-A16
+              nextclade_dataset_name: community/hodcroftlab/enterovirus/enterovirus/linked/CV-A16
+              accepted_sort_matches: ["community/hodcroftlab/enterovirus/cva16", "community/hodcroftlab/enterovirus/enterovirus/linked/CV-A16"]
+              gene_prefix: "CV-A16-"
+            - name: CV-A10
+              nextclade_dataset_name: community/hodcroftlab/enterovirus/enterovirus/linked/CV-A10
+              gene_prefix: "CV-A10-"
+            - name: EV-A71
+              nextclade_dataset_name: community/hodcroftlab/enterovirus/enterovirus/linked/EV-A71
+              gene_prefix: "EV-A71-"
+            - name: EV-D68
+              gene_prefix: "EV-D68-"
+              nextclade_dataset_name: community/hodcroftlab/enterovirus/enterovirus/linked/EV-D68
           nextclade_dataset_server: https://raw.githubusercontent.com/alejandra-gonzalezsanchez/nextclade_data/multi-pathogen-evs/data_output
-          genes: ["CV-A16-VP4", "CV-A16-VP2", "CV-A16-VP3", "CV-A16-VP1", "CV-A16-2A", "CV-A16-2B", "CV-A16-2C", "CV-A16-3A", "CV-A16-3B", "CV-A16-3C", "CV-A16-3D", "CV-A10-VP4", "CV-A10-VP2", "CV-A10-VP3", "CV-A10-VP1", "CV-A10-2A", "CV-A10-2B", "CV-A10-2C", "CV-A10-3A", "CV-A10-3B", "CV-A10-3C", "CV-A10-3D", "EV-A71-VP4", "EV-A71-VP2", "EV-A71-VP3", "EV-A71-VP1", "EV-A71-2A", "EV-A71-2B", "EV-A71-2C", "EV-A71-3A", "EV-A71-3B", "EV-A71-3C", "EV-A71-3D", "EV-D68-VP4", "EV-D68-VP2", "EV-D68-VP3", "EV-D68-VP1", "EV-D68-2A", "EV-D68-2B", "EV-D68-2C", "EV-D68-3A", "EV-D68-3B", "EV-D68-3C", "EV-D68-3D"]
     ingest:
       <<: *ingest
       configFile:
@@ -2070,7 +2092,7 @@ enforceHTTPS: true
 registrationTermsMessage: >
   You must agree to the <a href="http://main.loculus.org/terms">terms of use</a>.
 
-enaDeposition: 
+enaDeposition:
   submitToEnaProduction: false
   enaDbName: Loculus
   enaUniqueSuffix: Loculus

diff --git a/preprocessing/nextclade/src/loculus_preprocessing/config.py b/preprocessing/nextclade/src/loculus_preprocessing/config.py
@@ -32,8 +32,21 @@ class AlignmentRequirement(StrEnum):
     # Determines whether ALL or ANY segments that a user provides must align.
     # ANY: warn if some segments fail and some segments align
     # ALL: error if any segment fails even if some segments align
+    # NONE: do not align any segments, just process them as-is;
+    #       selected automatically when no nextclade dataset is configured
     ANY = "ANY"
     ALL = "ALL"
+    NONE = "NONE"
+
+
+@dataclass
+class NextcladeSequenceAndDataset:  # one parsed entry of the `nucleotideSequences` config list
+    name: str = "main"  # sequence name; stays "main" when the entry gives none
+    nextclade_dataset_name: str | None = None  # if unset on every entry, alignment becomes NONE
+    nextclade_dataset_tag: str | None = None  # presumably a dataset version tag - confirm
+    nextclade_dataset_server: str | None = None  # presumably a per-sequence server - confirm
+    accepted_sort_matches: list[str] | None = None  # presumably for nextclade sort - confirm
+    gene_prefix: str | None = None  # presumably prefixes this sequence's gene names - confirm
 
 
 @dataclass
@@ -51,21 +64,20 @@ class Config:
     keycloak_token_path: str = "realms/loculus/protocol/openid-connect/token"  # noqa: S105
 
     organism: str = "mpox"
+    nucleotideSequences: list[NextcladeSequenceAndDataset] = dataclasses.field(  # noqa: N815
+        default_factory=list
+    )
     genes: list[str] = dataclasses.field(default_factory=list)
-    nucleotideSequences: list[str] = dataclasses.field(default_factory=lambda: ["main"])  # noqa: N815
     processing_spec: dict[str, dict[str, Any]] = dataclasses.field(default_factory=dict)
     multi_segment: bool = False
 
     alignment_requirement: AlignmentRequirement = AlignmentRequirement.ALL
-    nextclade_dataset_name: str | None = None
-    nextclade_dataset_name_map: dict[str, str] | None = None
-    nextclade_dataset_tag: str | None = None
     nextclade_dataset_server: str = "https://data.clades.nextstrain.org/v3"
-    nextclade_dataset_server_map: dict[str, str] | None = None
 
     require_nextclade_sort_match: bool = False
     minimizer_url: str | None = None
-    accepted_dataset_matches: list[str] = dataclasses.field(default_factory=list)
+    classify_with_nextclade_sort: bool = False
+
     create_embl_file: bool = False
     scientific_name: str = "Orthonairovirus haemorrhagiae"
     molecule_type: MoleculeType = MoleculeType.GENOMIC_RNA
@@ -77,13 +89,46 @@ class Config:
     )
 
 
+def assign_nextclade_sequence_and_dataset(
+    nuc_seq_values: list[dict[str, Any]],
+) -> list[NextcladeSequenceAndDataset]:
+    """Parse `nucleotideSequences` entries, ignoring non-dicts, unknown keys and None values."""
+    if not isinstance(nuc_seq_values, list):
+        error_msg = f"nucleotideSequences should be a list of dicts, got: {type(nuc_seq_values)}"
+        logger.error(error_msg)
+        raise ValueError(error_msg)
+    parsed: list[NextcladeSequenceAndDataset] = []
+    for entry in nuc_seq_values:
+        if isinstance(entry, dict):
+            item = NextcladeSequenceAndDataset()
+            for field_name, field_value in entry.items():
+                if hasattr(item, field_name) and field_value is not None:
+                    setattr(item, field_name, field_value)
+            parsed.append(item)
+    return parsed
+
+
+def set_alignment_requirement(config: Config) -> AlignmentRequirement:
+    """Pick the effective alignment requirement for `config` without mutating it.
+
+    Returns NONE when no nucleotide sequence names a nextclade dataset, otherwise
+    the requirement already stored on `config`.
+    """
+    if any(seq.nextclade_dataset_name for seq in config.nucleotideSequences):
+        return config.alignment_requirement
+    return AlignmentRequirement.NONE
+
+
 def load_config_from_yaml(config_file: str, config: Config | None = None) -> Config:
     config = Config() if config is None else copy.deepcopy(config)
     with open(config_file, encoding="utf-8") as file:
         yaml_config = yaml.safe_load(file)
         logger.debug(f"Loaded config from {config_file}: {yaml_config}")
     for key, value in yaml_config.items():
         if value is not None and hasattr(config, key):
+            if key == "nucleotideSequences":
+                setattr(config, key, assign_nextclade_sequence_and_dataset(value))
+                continue
             attr = getattr(config, key)
             if isinstance(attr, StrEnum):
                 try:
@@ -171,4 +216,6 @@ def get_config(config_file: str | None = None, ignore_args: bool = False) -> Con
     if len(config.nucleotideSequences) > 1:
         config.multi_segment = True
 
+    config.alignment_requirement = set_alignment_requirement(config)
+
     return config