diff --git a/integration-tests/tests/pages/review.page.ts b/integration-tests/tests/pages/review.page.ts
index 8a50b6d63e..1b26860d58 100644
--- a/integration-tests/tests/pages/review.page.ts
+++ b/integration-tests/tests/pages/review.page.ts
@@ -22,7 +22,7 @@ export class ReviewPage {
async waitForZeroProcessing() {
await expect(this.page.locator('[data-testid="review-page-control-panel"]')).toContainText(
'0 awaiting processing',
- { timeout: 60000 },
+ { timeout: 200000 },
);
}
diff --git a/integration-tests/tests/specs/features/search/lineage-field.spec.ts b/integration-tests/tests/specs/features/search/lineage-field.spec.ts
index 6d31683df6..6cef8fab28 100644
--- a/integration-tests/tests/specs/features/search/lineage-field.spec.ts
+++ b/integration-tests/tests/specs/features/search/lineage-field.spec.ts
@@ -7,7 +7,7 @@ import { v4 as uuidv4 } from 'uuid';
const SEQUENCE = 'ATTGATCTCATCATTT';
test('Lineage field lineage counts', async ({ page, pageWithGroup }) => {
- test.setTimeout(95_000);
+ test.setTimeout(900_000);
const uuid = uuidv4();
await page.goto('/');
diff --git a/kubernetes/loculus/templates/_common-metadata.tpl b/kubernetes/loculus/templates/_common-metadata.tpl
index 72968acb54..db2ec373e3 100644
--- a/kubernetes/loculus/templates/_common-metadata.tpl
+++ b/kubernetes/loculus/templates/_common-metadata.tpl
@@ -251,8 +251,8 @@ organisms:
{{ end }}
{{ .website | toYaml | nindent 6 }}
{{- end }}
- referenceGenomes:
- {{ $instance.referenceGenomes | toYaml | nindent 6 }}
+ referenceGenome:
+ {{ include "loculus.mergeReferenceGenomes" $instance.referenceGenomes | nindent 6 }}
{{- end }}
{{- end }}
diff --git a/kubernetes/loculus/templates/loculus-preprocessing-config.yaml b/kubernetes/loculus/templates/loculus-preprocessing-config.yaml
index 63118bfc76..c51d881307 100644
--- a/kubernetes/loculus/templates/loculus-preprocessing-config.yaml
+++ b/kubernetes/loculus/templates/loculus-preprocessing-config.yaml
@@ -2,7 +2,12 @@
{{- $metadata := ($organismConfig.schema | include "loculus.patchMetadataSchema" | fromYaml).metadata }}
{{- $rawNucleotideSequences := (($organismConfig.schema | include "loculus.patchMetadataSchema" | fromYaml).nucleotideSequences) }}
{{- $nucleotideSequences := ($rawNucleotideSequences | default "" ) }}
-{{- $nucleotideSequencesList := (eq $rawNucleotideSequences nil | ternary (list "main") $rawNucleotideSequences) }}
+{{- $referenceGenomes := include "loculus.mergeReferenceGenomes" $organismConfig.referenceGenomes | fromYaml }}
+{{- $genesList := (eq $referenceGenomes.genes nil | ternary (list) $referenceGenomes.genes) }}
+{{- $genesDict := dict "genes" (list) -}}
+{{- range $g := $genesList }}
+ {{- $_ := set $genesDict "genes" (append ($genesDict.genes) $g.name) -}}
+{{- end }}
{{- range $processingIndex, $processingConfig := $organismConfig.preprocessing }}
{{- if $processingConfig.configFile }}
{{- /* Use the enaDepositionConfig as the base config */}}
@@ -17,7 +22,7 @@ data:
preprocessing-config.yaml: |
organism: {{ $organism }}
{{- $preproAndEnaConfigFile | toYaml | nindent 4 }}
- {{- (dict "nucleotideSequences" $nucleotideSequencesList) | toYaml | nindent 4 }}
+ {{- $genesDict | toYaml | nindent 4 }}
processing_spec:
{{- $args := dict "metadata" $metadata "nucleotideSequences" $nucleotideSequences }}
{{- include "loculus.preprocessingSpecs" $args | nindent 6 }}
diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml
index ec8e04163c..00ef3d6358 100644
--- a/kubernetes/loculus/values.yaml
+++ b/kubernetes/loculus/values.yaml
@@ -1312,7 +1312,6 @@ defaultOrganismConfig: &defaultOrganismConfig
replicas: 2
configFile: &preprocessingConfigFile
log_level: DEBUG
- genes: []
batch_size: 100
create_embl_file: true
ingest: &ingest
@@ -1356,16 +1355,18 @@ defaultOrganisms:
- <<: *preprocessing
version: 1
configFile:
- <<: *preprocessingConfigFile
- genes: [NP, VP35, VP40, GP, sGP, ssGP, VP30, VP24, L]
- nextclade_dataset_name: nextstrain/ebola/sudan
+ <<: *preprocessingConfigFile
+ nucleotideSequences:
+ - name: "main"
+ nextclade_dataset_name: nextstrain/ebola/sudan
nextclade_dataset_server: https://raw.githubusercontent.com/nextstrain/nextclade_data/ebola/data_output
- <<: *preprocessing
version: 2
configFile:
<<: *preprocessingConfigFile
- genes: [NP, VP35, VP40, GP, sGP, ssGP, VP30, VP24, L]
- nextclade_dataset_name: nextstrain/ebola/sudan
+ nucleotideSequences:
+ - name: "main"
+ nextclade_dataset_name: nextstrain/ebola/sudan
nextclade_dataset_server: https://raw.githubusercontent.com/nextstrain/nextclade_data/ebola/data_output
west-nile:
<<: *defaultOrganismConfig
@@ -1417,9 +1418,10 @@ defaultOrganisms:
version: 1
configFile:
<<: *preprocessingConfigFile
- nextclade_dataset_name: nextstrain/wnv/all-lineages
+ nucleotideSequences:
+ - name: "main"
+ nextclade_dataset_name: nextstrain/wnv/all-lineages
nextclade_dataset_server: https://raw.githubusercontent.com/nextstrain/nextclade_data/wnv/data_output
- genes: [capsid, prM, env, NS1, NS2A, NS2B, NS3, NS4A, 2K, NS4B, NS5]
ingest:
<<: *ingest
configFile:
@@ -1674,8 +1676,9 @@ defaultOrganisms:
- "prepro"
configFile:
log_level: DEBUG
- genes: []
batch_size: 100
+ nucleotideSequences:
+ - name: "main"
referenceGenomes:
singleReference:
nucleotideSequences:
@@ -1734,9 +1737,14 @@ defaultOrganisms:
configFile:
<<: *preprocessingConfigFile
log_level: DEBUG
- nextclade_dataset_name: nextstrain/cchfv/linked
+ nucleotideSequences:
+ - name: L
+ nextclade_dataset_name: nextstrain/cchfv/linked/L
+ - name: M
+ nextclade_dataset_name: nextstrain/cchfv/linked/M
+ - name: S
+ nextclade_dataset_name: nextstrain/cchfv/linked/S
nextclade_dataset_server: https://raw.githubusercontent.com/nextstrain/nextclade_data/cornelius-cchfv/data_output
- genes: [RdRp, GPC, NP]
ingest:
<<: *ingest
configFile:
@@ -1772,7 +1780,7 @@ defaultOrganisms:
sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/cchf/NP.fasta]]"
enteroviruses:
<<: *defaultOrganismConfig
- enabled: false
+ enabled: true
schema:
<<: *schema
organismName: "Enterovirus"
@@ -1848,9 +1856,23 @@ defaultOrganisms:
configFile:
<<: *preprocessingConfigFile
log_level: INFO
- nextclade_dataset_name: community/hodcroftlab/enterovirus/enterovirus/linked
+ classify_with_nextclade_sort: True
+ minimizer_index: "https://raw.githubusercontent.com/alejandra-gonzalezsanchez/loculus-evs/master/evs_minimizer-index.json"
+ nucleotideSequences:
+ - name: CV-A16
+ nextclade_dataset_name: community/hodcroftlab/enterovirus/enterovirus/linked/CV-A16
+ accepted_sort_matches: ["community/hodcroftlab/enterovirus/cva16", "community/hodcroftlab/enterovirus/enterovirus/linked/CV-A16"]
+ gene_prefix: "CV-A16-"
+ - name: CV-A10
+ nextclade_dataset_name: community/hodcroftlab/enterovirus/enterovirus/linked/CV-A10
+ gene_prefix: "CV-A10-"
+ - name: EV-A71
+ nextclade_dataset_name: community/hodcroftlab/enterovirus/enterovirus/linked/EV-A71
+ gene_prefix: "EV-A71-"
+ - name: EV-D68
+ gene_prefix: "EV-D68-"
+ nextclade_dataset_name: community/hodcroftlab/enterovirus/enterovirus/linked/EV-D68
nextclade_dataset_server: https://raw.githubusercontent.com/alejandra-gonzalezsanchez/nextclade_data/multi-pathogen-evs/data_output
- genes: ["CV-A16-VP4", "CV-A16-VP2", "CV-A16-VP3", "CV-A16-VP1", "CV-A16-2A", "CV-A16-2B", "CV-A16-2C", "CV-A16-3A", "CV-A16-3B", "CV-A16-3C", "CV-A16-3D", "CV-A10-VP4", "CV-A10-VP2", "CV-A10-VP3", "CV-A10-VP1", "CV-A10-2A", "CV-A10-2B", "CV-A10-2C", "CV-A10-3A", "CV-A10-3B", "CV-A10-3C", "CV-A10-3D", "EV-A71-VP4", "EV-A71-VP2", "EV-A71-VP3", "EV-A71-VP1", "EV-A71-2A", "EV-A71-2B", "EV-A71-2C", "EV-A71-3A", "EV-A71-3B", "EV-A71-3C", "EV-A71-3D", "EV-D68-VP4", "EV-D68-VP2", "EV-D68-VP3", "EV-D68-VP1", "EV-D68-2A", "EV-D68-2B", "EV-D68-2C", "EV-D68-3A", "EV-D68-3B", "EV-D68-3C", "EV-D68-3D"]
ingest:
<<: *ingest
configFile:
@@ -2070,7 +2092,7 @@ enforceHTTPS: true
registrationTermsMessage: >
You must agree to the terms of use.
-enaDeposition:
+enaDeposition:
submitToEnaProduction: false
enaDbName: Loculus
enaUniqueSuffix: Loculus
diff --git a/preprocessing/nextclade/src/loculus_preprocessing/config.py b/preprocessing/nextclade/src/loculus_preprocessing/config.py
index d84d5403ca..142ca67e05 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/config.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/config.py
@@ -32,8 +32,21 @@ class AlignmentRequirement(StrEnum):
# Determines whether ALL or ANY segments that a user provides must align.
# ANY: warn if some segments fail and some segments align
# ALL: error if any segment fails even if some segments align
+ # NONE: do not align any segments, just process them as-is
+ # - set if no nextclade dataset is provided
ANY = "ANY"
ALL = "ALL"
+ NONE = "NONE"
+
+
+@dataclass
+class NextcladeSequenceAndDataset:
+ name: str = "main"
+ nextclade_dataset_name: str | None = None
+ nextclade_dataset_tag: str | None = None
+ nextclade_dataset_server: str | None = None
+ accepted_sort_matches: list[str] | None = None
+ gene_prefix: str | None = None
@dataclass
@@ -51,21 +64,20 @@ class Config:
keycloak_token_path: str = "realms/loculus/protocol/openid-connect/token" # noqa: S105
organism: str = "mpox"
+ nucleotideSequences: list[NextcladeSequenceAndDataset] = dataclasses.field( # noqa: N815
+ default_factory=list
+ )
genes: list[str] = dataclasses.field(default_factory=list)
- nucleotideSequences: list[str] = dataclasses.field(default_factory=lambda: ["main"]) # noqa: N815
processing_spec: dict[str, dict[str, Any]] = dataclasses.field(default_factory=dict)
multi_segment: bool = False
alignment_requirement: AlignmentRequirement = AlignmentRequirement.ALL
- nextclade_dataset_name: str | None = None
- nextclade_dataset_name_map: dict[str, str] | None = None
- nextclade_dataset_tag: str | None = None
nextclade_dataset_server: str = "https://data.clades.nextstrain.org/v3"
- nextclade_dataset_server_map: dict[str, str] | None = None
require_nextclade_sort_match: bool = False
minimizer_url: str | None = None
- accepted_dataset_matches: list[str] = dataclasses.field(default_factory=list)
+ classify_with_nextclade_sort: bool = False
+
create_embl_file: bool = False
scientific_name: str = "Orthonairovirus haemorrhagiae"
molecule_type: MoleculeType = MoleculeType.GENOMIC_RNA
@@ -77,6 +89,36 @@ class Config:
)
+def assign_nextclade_sequence_and_dataset(
+ nuc_seq_values: list[dict[str, Any]],
+) -> list[NextcladeSequenceAndDataset]:
+ if not isinstance(nuc_seq_values, list):
+ error_msg = f"nucleotideSequences should be a list of dicts, got: {type(nuc_seq_values)}"
+ logger.error(error_msg)
+ raise ValueError(error_msg)
+ nextclade_sequence_and_dataset_list: list[NextcladeSequenceAndDataset] = []
+ for value in nuc_seq_values:
+ if value is None or not isinstance(value, dict):
+ continue
+ seq_and_dataset = NextcladeSequenceAndDataset()
+ for seq_key, seq_value in value.items():
+ if hasattr(seq_and_dataset, seq_key) and seq_value is not None:
+ setattr(seq_and_dataset, seq_key, seq_value)
+ nextclade_sequence_and_dataset_list.append(seq_and_dataset)
+ return nextclade_sequence_and_dataset_list
+
+
+def set_alignment_requirement(
+ config: Config) -> AlignmentRequirement:
+ need_nextclade_dataset: bool = False
+ for sequence in config.nucleotideSequences:
+ if sequence.nextclade_dataset_name:
+ need_nextclade_dataset = True
+ if not need_nextclade_dataset:
+ return AlignmentRequirement.NONE
+ return config.alignment_requirement
+
+
def load_config_from_yaml(config_file: str, config: Config | None = None) -> Config:
config = Config() if config is None else copy.deepcopy(config)
with open(config_file, encoding="utf-8") as file:
@@ -84,6 +126,9 @@ def load_config_from_yaml(config_file: str, config: Config | None = None) -> Con
logger.debug(f"Loaded config from {config_file}: {yaml_config}")
for key, value in yaml_config.items():
if value is not None and hasattr(config, key):
+ if key == "nucleotideSequences":
+ setattr(config, key, assign_nextclade_sequence_and_dataset(value))
+ continue
attr = getattr(config, key)
if isinstance(attr, StrEnum):
try:
@@ -171,4 +216,6 @@ def get_config(config_file: str | None = None, ignore_args: bool = False) -> Con
if len(config.nucleotideSequences) > 1:
config.multi_segment = True
+ config.alignment_requirement = set_alignment_requirement(config)
+
return config
diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py
index 1148472cbe..c4adba2b4d 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py
@@ -24,7 +24,7 @@
submit_processed_sequences,
upload_embl_file_to_presigned_url,
)
-from .config import AlignmentRequirement, Config
+from .config import AlignmentRequirement, Config, NextcladeSequenceAndDataset
from .datatypes import (
AccessionVersion,
Alerts,
@@ -75,11 +75,12 @@ def parse_nextclade_tsv(
],
result_dir: str,
config: Config,
- segment: SegmentName,
+ sequence_and_dataset: NextcladeSequenceAndDataset,
) -> tuple[
defaultdict[AccessionVersion, defaultdict[GeneName, list[AminoAcidInsertion]]],
defaultdict[AccessionVersion, defaultdict[SegmentName, list[NucleotideInsertion]]],
]:
+ segment = sequence_and_dataset.name
with Path(result_dir + "/nextclade.tsv").open(encoding="utf-8") as nextclade_tsv:
reader = csv.DictReader(nextclade_tsv, delimiter="\t")
for row in reader:
@@ -94,7 +95,12 @@ def parse_nextclade_tsv(
continue
gene, val = ins.split(":", maxsplit=1)
if gene in config.genes:
- amino_acid_insertions[id][gene].append(val)
+ gene_name = (
+ sequence_and_dataset.gene_prefix + gene
+ if sequence_and_dataset.gene_prefix
+ else gene
+ )
+ amino_acid_insertions[id][gene_name].append(val)
else:
logger.debug(
"Note: Nextclade found AA insertion in gene missing from config in gene "
@@ -130,32 +136,20 @@ def parse_nextclade_json(
def run_sort(
- result_file_dir: str,
+ result_file: str,
input_file: str,
- alerts: Alerts,
config: Config,
- segment: SegmentName,
+ nextclade_dataset_server: str,
dataset_dir: str,
-) -> Alerts:
+) -> pd.DataFrame:
"""
Run nextclade
- use config.minimizer_url or default minimizer from nextclade server
- - assert highest score is in config.accepted_dataset_matches
- (default is nextclade_dataset_name)
"""
- nextclade_dataset_name = get_nextclade_dataset_name(config, segment)
- if not config.accepted_dataset_matches and not nextclade_dataset_name:
- logger.warning("No nextclade dataset name or accepted dataset match list found in config")
- return alerts
- nextclade_dataset_server = get_nextclade_dataset_server(config, segment)
-
if config.minimizer_url:
minimizer_file = dataset_dir + "/minimizer/minimizer.json"
- accepted_dataset_names = config.accepted_dataset_matches or [nextclade_dataset_name] # type: ignore
-
- result_file = result_file_dir + "/sort_output.tsv"
- command = [
+ subprocess_args_with_empty_strings = [
"nextclade3",
"sort",
input_file,
@@ -174,14 +168,15 @@ def run_sort(
f"{nextclade_dataset_server}",
]
- logger.debug(f"Running nextclade sort: {command}")
+ subprocess_args = [arg for arg in subprocess_args_with_empty_strings if arg]
+
+ logger.debug(f"Running nextclade sort: {subprocess_args}")
- exit_code = subprocess.run(command, check=False).returncode # noqa: S603
+ exit_code = subprocess.run(subprocess_args, check=False).returncode # noqa: S603
if exit_code != 0:
msg = f"nextclade sort failed with exit code {exit_code}"
raise Exception(msg)
-
- df = pd.read_csv(
+ return pd.read_csv(
result_file,
sep="\t",
dtype={
@@ -192,6 +187,39 @@ def run_sort(
},
)
+
+def check_nextclade_sort_matches( # noqa: PLR0913, PLR0917
+ result_file_dir: str,
+ input_file: str,
+ alerts: Alerts,
+ config: Config,
+ sequence_and_dataset: NextcladeSequenceAndDataset,
+ dataset_dir: str,
+) -> Alerts:
+ """
+ Run nextclade sort
+ - assert highest score is in sequence_and_dataset.accepted_sort_matches
+ (default is nextclade_dataset_name)
+ """
+ nextclade_dataset_name = sequence_and_dataset.nextclade_dataset_name
+ if not sequence_and_dataset.accepted_sort_matches and not nextclade_dataset_name:
+ logger.warning("No nextclade dataset name or accepted dataset match list found in config")
+ return alerts
+ nextclade_dataset_server = (
+ sequence_and_dataset.nextclade_dataset_server or config.nextclade_dataset_server
+ )
+
+ accepted_dataset_names = sequence_and_dataset.accepted_sort_matches or [nextclade_dataset_name] # type: ignore
+
+ result_file = result_file_dir + "/sort_output.tsv"
+ df = run_sort(
+ result_file,
+ input_file,
+ config,
+ nextclade_dataset_server,
+ dataset_dir,
+ )
+
hits = df.dropna(subset=["score"]).sort_values("score", ascending=False)
best_hits = hits.groupby("seqName", as_index=False).first()
@@ -208,7 +236,7 @@ def run_sort(
"Sequence does not appear to match reference, per `nextclade sort`. "
"Double check you are submitting to the correct organism."
),
- ),
+ )
)
for _, row in best_hits.iterrows():
@@ -219,7 +247,7 @@ def run_sort(
ProcessingAnnotationAlignment,
AnnotationSourceType.NUCLEOTIDE_SEQUENCE,
message=(
- f"This sequence best matches {row['dataset']}, "
+ f"Sequence best matches {row['dataset']}, "
"a different organism than the one you are submitting to: "
f"{config.organism}. It is therefore not possible to release. "
"Contact the administrator if you think this message is an error."
@@ -230,12 +258,109 @@ def run_sort(
return alerts
+# TODO: running this for each sequence is inefficient, should be run once per batch
+def classify_with_nextclade_sort(
+ input_unaligned_sequences: dict[str, NucleotideSequence | None],
+ unaligned_nucleotide_sequences: dict[SegmentName, NucleotideSequence | None],
+ errors: list[ProcessingAnnotation],
+ config: Config,
+ dataset_dir: str,
+):
+ """
+ Run nextclade sort to classify each input sequence and assign it to the
+ segment whose accepted_sort_matches (default: its nextclade_dataset_name)
+ contains the best-scoring dataset; record an error otherwise.
+ """
+ nextclade_dataset_server = config.nextclade_dataset_server
+
+ with TemporaryDirectory(delete=not config.keep_tmp_dir) as result_dir:
+ input_file = result_dir + "/input.fasta"
+ os.makedirs(os.path.dirname(input_file), exist_ok=True)
+ with open(input_file, "w", encoding="utf-8") as f:
+ for id, seq in input_unaligned_sequences.items():
+ f.write(f">{id}\n")
+ f.write(f"{seq}\n")
+
+ result_file = result_dir + "/sort_output.tsv"
+ df = run_sort(
+ result_file,
+ input_file,
+ config,
+ nextclade_dataset_server,
+ dataset_dir,
+ )
+
+ no_hits = df[df["score"].isna()]
+ hits = df.dropna(subset=["score"]).sort_values("score", ascending=False)
+ for seq_name in no_hits["seqName"].unique():
+ if seq_name not in hits["seqName"].unique():
+ msg = (
+ f"Sequence {seq_name} does not appear to match any reference for organism: "
+ f"{config.organism} per `nextclade sort`. "
+ f"Double check you are submitting to the correct organism."
+ )
+ # TODO: only error when config.alignment_requirement == "ALL", otherwise warn
+ errors.append(
+ ProcessingAnnotation.from_single(
+ ProcessingAnnotationAlignment,
+ AnnotationSourceType.NUCLEOTIDE_SEQUENCE,
+ message=msg,
+ )
+ )
+
+ best_hits = hits.groupby("seqName", as_index=False).first()
+ logger.info(f"Found hits: {best_hits['seqName'].tolist()}")
+
+ for _, row in best_hits.iterrows():
+ not_found = True
+ for segment in config.nucleotideSequences:
+ default = [segment.nextclade_dataset_name] if segment.nextclade_dataset_name else []
+ accepted_dataset_names = segment.accepted_sort_matches or default
+ if row["dataset"] in accepted_dataset_names:
+ unaligned_nucleotide_sequences[segment.name] = input_unaligned_sequences[
+ row["seqName"]
+ ]
+ not_found = False
+ break
+ if not_found:
+ msg = (
+ f"Sequence {row['seqName']} best matches {row['dataset']}, "
+ "which is currently not an accepted option for organism: "
+ f"{config.organism}. It is therefore not possible to release. "
+ "Contact the administrator if you think this message is an error."
+ )
+ errors.append(
+ ProcessingAnnotation.from_single(
+ ProcessingAnnotationAlignment,
+ AnnotationSourceType.NUCLEOTIDE_SEQUENCE,
+ message=msg,
+ )
+ )
+
+ return (unaligned_nucleotide_sequences, errors)
+
+
def assign_segment(
input_unaligned_sequences: dict[str, NucleotideSequence | None],
unaligned_nucleotide_sequences: dict[SegmentName, NucleotideSequence | None],
errors: list[ProcessingAnnotation],
config: Config,
+ dataset_dir: str | None = None,
):
+ if config.classify_with_nextclade_sort:
+ if not dataset_dir:
+ error_msg = (
+ "Dataset directory must be provided when classify_with_nextclade_sort is True"
+ )
+ logger.error(error_msg)
+ raise ValueError(error_msg)
+ return classify_with_nextclade_sort(
+ input_unaligned_sequences,
+ unaligned_nucleotide_sequences,
+ dataset_dir=dataset_dir,
+ errors=errors,
+ config=config,
+ )
valid_segments = set()
duplicate_segments = set()
if not config.nucleotideSequences:
@@ -265,7 +390,8 @@ def assign_segment(
unaligned_nucleotide_sequences,
errors,
)
- for segment in config.nucleotideSequences:
+ for sequence_and_dataset in config.nucleotideSequences:
+ segment = sequence_and_dataset.name
unaligned_segment = [
data
for data in input_unaligned_sequences
@@ -303,7 +429,7 @@ def assign_segment(
f"{', '.join(remaining_segments)}. "
"Each metadata entry can have multiple corresponding fasta sequence "
"entries with format _ valid segments are: "
- f"{', '.join(config.nucleotideSequences)}."
+ f"{', '.join([sequence_and_dataset.name for sequence_and_dataset in config.nucleotideSequences])}."
),
)
)
@@ -341,8 +467,8 @@ def enrich_with_nextclade( # noqa: C901, PLR0914, PLR0915
id = entry.accessionVersion
input_metadata[id] = entry.data.metadata
input_metadata[id]["submitter"] = entry.data.submitter
- input_metadata[id]["group_id"] = entry.data.group_id
input_metadata[id]["submittedAt"] = entry.data.submittedAt
+ input_metadata[id]["group_id"] = entry.data.group_id
aligned_aminoacid_sequences[id] = {}
unaligned_nucleotide_sequences[id] = {}
aligned_nucleotide_sequences[id] = {}
@@ -356,6 +482,7 @@ def enrich_with_nextclade( # noqa: C901, PLR0914, PLR0915
unaligned_nucleotide_sequences=unaligned_nucleotide_sequences[id],
errors=alerts.errors[id],
config=config,
+ dataset_dir=dataset_dir,
)
nextclade_metadata: defaultdict[
@@ -368,9 +495,12 @@ def enrich_with_nextclade( # noqa: C901, PLR0914, PLR0915
AccessionVersion, defaultdict[GeneName, list[AminoAcidInsertion]]
] = defaultdict(lambda: defaultdict(list))
with TemporaryDirectory(delete=not config.keep_tmp_dir) as result_dir: # noqa: PLR1702
- for segment in config.nucleotideSequences:
- result_dir_seg = result_dir if segment == "main" else result_dir + "/" + segment
- dataset_dir_seg = dataset_dir if segment == "main" else dataset_dir + "/" + segment
+ for sequence_and_dataset in config.nucleotideSequences:
+ segment = sequence_and_dataset.name
+ result_dir_seg = result_dir if not config.multi_segment else result_dir + "/" + segment
+ dataset_dir_seg = (
+ dataset_dir if not config.multi_segment else dataset_dir + "/" + segment
+ )
input_file = result_dir_seg + "/input.fasta"
os.makedirs(os.path.dirname(input_file), exist_ok=True)
is_empty: bool = True
@@ -384,7 +514,9 @@ def enrich_with_nextclade( # noqa: C901, PLR0914, PLR0915
continue
if config.require_nextclade_sort_match:
- alerts = run_sort(result_dir_seg, input_file, alerts, config, segment, dataset_dir)
+ alerts = check_nextclade_sort_matches(
+ result_dir_seg, input_file, alerts, config, sequence_and_dataset, dataset_dir
+ )
command = [
"nextclade3",
@@ -422,7 +554,12 @@ def enrich_with_nextclade( # noqa: C901, PLR0914, PLR0915
masked_sequence = mask_terminal_gaps(
str(aligned_sequence.seq), mask_char="X"
)
- aligned_aminoacid_sequences[sequence_id][gene] = masked_sequence
+ gene_name = (
+ sequence_and_dataset.gene_prefix + gene
+ if sequence_and_dataset.gene_prefix
+ else gene
+ )
+ aligned_aminoacid_sequences[sequence_id][gene_name] = masked_sequence
except FileNotFoundError:
# TODO: Add warning to each sequence
logger.info(
@@ -435,7 +572,11 @@ def enrich_with_nextclade( # noqa: C901, PLR0914, PLR0915
result_dir_seg, nextclade_metadata, segment, unaligned_nucleotide_sequences
) # this includes the "annotation" field
amino_acid_insertions, nucleotide_insertions = parse_nextclade_tsv(
- amino_acid_insertions, nucleotide_insertions, result_dir_seg, config, segment
+ amino_acid_insertions,
+ nucleotide_insertions,
+ result_dir_seg,
+ config,
+ sequence_and_dataset,
)
return {
@@ -555,7 +696,7 @@ def add_input_metadata(
if not unprocessed.nextcladeMetadata[segment]:
message = (
"Nucleotide sequence failed to align"
- if segment == "main"
+ if not config.multi_segment
else f"Nucleotide sequence for {segment} failed to align"
)
annotation = ProcessingAnnotation.from_single(
@@ -722,15 +863,16 @@ def process_single( # noqa: C901
config=config,
)
- for segment in config.nucleotideSequences:
+ for sequence_and_dataset in config.nucleotideSequences:
+ segment = sequence_and_dataset.name
sequence = unprocessed.unalignedNucleotideSequences.get(segment, None)
- key = "length" if segment == "main" else "length_" + segment
+ key = "length" if not config.multi_segment else "length_" + segment
if key in config.processing_spec:
output_metadata[key] = len(sequence) if sequence else 0
for output_field, spec_dict in config.processing_spec.items():
length_fields = [
- "length" if segment == "main" else "length_" + segment
+ "length" if not config.multi_segment else "length_" + segment.name
for segment in config.nucleotideSequences
]
if output_field in length_fields:
@@ -771,7 +913,8 @@ def process_single( # noqa: C901
return processed_entry_no_alignment(id, unprocessed, output_metadata, errors, warnings)
aligned_segments = set()
- for segment in config.nucleotideSequences:
+ for sequence_and_dataset in config.nucleotideSequences:
+ segment = sequence_and_dataset.name
if unprocessed.alignedNucleotideSequences.get(segment, None):
aligned_segments.add(segment)
@@ -788,7 +931,8 @@ def process_single( # noqa: C901
if config.create_embl_file and unprocessed.nextcladeMetadata is not None:
annotations = {}
- for segment in config.nucleotideSequences:
+ for sequence_and_dataset in config.nucleotideSequences:
+ segment = sequence_and_dataset.name
if segment in unprocessed.nextcladeMetadata:
annotations[segment] = None
if unprocessed.nextcladeMetadata[segment]:
@@ -853,7 +997,7 @@ def process_all(
) -> Sequence[SubmissionData]:
processed_results = []
logger.debug(f"Processing {len(unprocessed)} unprocessed sequences")
- if config.nextclade_dataset_name:
+ if config.alignment_requirement != AlignmentRequirement.NONE:
nextclade_results = enrich_with_nextclade(unprocessed, dataset_dir, config)
for id, result in nextclade_results.items():
try:
@@ -874,30 +1018,15 @@ def process_all(
return processed_results
-def get_nextclade_dataset_name(config: Config, segment: SegmentName) -> str | None:
- if config.nextclade_dataset_name_map and segment in config.nextclade_dataset_name_map:
- return config.nextclade_dataset_name_map[segment]
- if not config.nextclade_dataset_name:
- return None
- return (
- config.nextclade_dataset_name
- if segment == "main"
- else config.nextclade_dataset_name + "/" + segment
- )
-
-
-def get_nextclade_dataset_server(config: Config, segment: SegmentName) -> str:
- if config.nextclade_dataset_server_map and segment in config.nextclade_dataset_server_map:
- return config.nextclade_dataset_server_map[segment]
- return config.nextclade_dataset_server
-
-
def download_nextclade_dataset(dataset_dir: str, config: Config) -> None:
- for segment in config.nucleotideSequences:
- nextclade_dataset_name = get_nextclade_dataset_name(config, segment)
- nextclade_dataset_server = get_nextclade_dataset_server(config, segment)
+ for sequence_and_dataset in config.nucleotideSequences:
+ name = sequence_and_dataset.name
+ nextclade_dataset_name = sequence_and_dataset.nextclade_dataset_name
+ nextclade_dataset_server = (
+ sequence_and_dataset.nextclade_dataset_server or config.nextclade_dataset_server
+ )
- dataset_dir_seg = dataset_dir if segment == "main" else dataset_dir + "/" + segment
+ dataset_dir_seg = dataset_dir if not config.multi_segment else dataset_dir + "/" + name
dataset_download_command = [
"nextclade3",
"dataset",
@@ -907,8 +1036,8 @@ def download_nextclade_dataset(dataset_dir: str, config: Config) -> None:
f"--output-dir={dataset_dir_seg}",
]
- if config.nextclade_dataset_tag is not None:
- dataset_download_command.append(f"--tag={config.nextclade_dataset_tag}")
+ if sequence_and_dataset.nextclade_dataset_tag is not None:
+ dataset_download_command.append(f"--tag={sequence_and_dataset.nextclade_dataset_tag}")
logger.info("Downloading Nextclade dataset: %s", dataset_download_command)
if subprocess.run(dataset_download_command, check=False).returncode != 0: # noqa: S603
@@ -952,7 +1081,7 @@ def upload_flatfiles(processed: Sequence[SubmissionData], config: Config) -> Non
def run(config: Config) -> None:
with TemporaryDirectory(delete=not config.keep_tmp_dir) as dataset_dir:
- if config.nextclade_dataset_name:
+ if config.alignment_requirement != AlignmentRequirement.NONE:
download_nextclade_dataset(dataset_dir, config)
if config.minimizer_url and config.require_nextclade_sort_match:
download_minimizer(config.minimizer_url, dataset_dir + "/minimizer/minimizer.json")
diff --git a/preprocessing/nextclade/tests/factory_methods.py b/preprocessing/nextclade/tests/factory_methods.py
index 6911da68fe..23411c99e9 100644
--- a/preprocessing/nextclade/tests/factory_methods.py
+++ b/preprocessing/nextclade/tests/factory_methods.py
@@ -46,7 +46,7 @@ class ProcessingAnnotationHelper:
@dataclass
class ProcessedAlignment:
unalignedNucleotideSequences: dict[str, str | None] = field( # noqa: N815
- default_factory=lambda: {"main": None}
+ default_factory=dict
)
alignedNucleotideSequences: dict[str, str | None] = field( # noqa: N815
default_factory=dict
diff --git a/preprocessing/nextclade/tests/multi_segment_config.yaml b/preprocessing/nextclade/tests/multi_segment_config.yaml
index bfac8b94e7..bf81cb8121 100644
--- a/preprocessing/nextclade/tests/multi_segment_config.yaml
+++ b/preprocessing/nextclade/tests/multi_segment_config.yaml
@@ -6,11 +6,12 @@ genes:
- VP24EbolaZaire
- LEbolaZaire
log_level: DEBUG
-nextclade_dataset_name: ebola-dataset
nextclade_dataset_server: fake_url
nucleotideSequences:
-- ebola-sudan
-- ebola-zaire
+- name: ebola-sudan
+ nextclade_dataset_name: ebola-dataset/ebola-sudan
+- name: ebola-zaire
+ nextclade_dataset_name: ebola-dataset/ebola-zaire
organism: multi-ebola-test
processing_spec:
totalInsertedNucs_ebola-zaire:
diff --git a/preprocessing/nextclade/tests/single_segment_config.yaml b/preprocessing/nextclade/tests/single_segment_config.yaml
index 670665961c..0ae57b7a3a 100644
--- a/preprocessing/nextclade/tests/single_segment_config.yaml
+++ b/preprocessing/nextclade/tests/single_segment_config.yaml
@@ -3,14 +3,14 @@ genes:
- NPEbolaSudan
- VP35EbolaSudan
log_level: DEBUG
-nextclade_dataset_name: ebola-sudan-test-dataset
nextclade_dataset_server: fake_url
scientific_name: "Test Ebola Sudan Virus"
molecule_type: "genomic RNA"
topology: "linear"
db_name: "Loculus"
nucleotideSequences:
-- main
+- name: main
+ nextclade_dataset_name: ebola-sudan-test-dataset
organism: ebola-sudan-test
processing_spec:
completeness:
diff --git a/website/src/config.ts b/website/src/config.ts
index e91a4050da..cf2043db22 100644
--- a/website/src/config.ts
+++ b/website/src/config.ts
@@ -227,7 +227,7 @@ export function getLapisUrl(serviceConfig: ServiceUrls, organism: string): strin
}
export function getReferenceGenome(organism: string): ReferenceGenome {
- return Object.values(getConfig(organism).referenceGenomes)[0];
+ return getConfig(organism).referenceGenome;
}
const getAccession = (n: NamedSequence): ReferenceAccession => {
diff --git a/website/src/types/config.ts b/website/src/types/config.ts
index 68fb966a39..bba32812de 100644
--- a/website/src/types/config.ts
+++ b/website/src/types/config.ts
@@ -1,7 +1,7 @@
import z from 'zod';
import { mutationProportionCount, orderByType } from './lapis.ts';
-import { referenceGenomes } from './referencesGenomes.ts';
+import { referenceGenome } from './referencesGenomes.ts';
// These metadata types need to be kept in sync with the backend config class `MetadataType` in Config.kt
export const metadataPossibleTypes = z.enum([
@@ -156,7 +156,7 @@ export type Schema = z.infer;
export const instanceConfig = z.object({
schema,
- referenceGenomes,
+ referenceGenome,
});
export type InstanceConfig = z.infer;