diff --git a/integration-tests/tests/pages/review.page.ts b/integration-tests/tests/pages/review.page.ts index 8a50b6d63e..1b26860d58 100644 --- a/integration-tests/tests/pages/review.page.ts +++ b/integration-tests/tests/pages/review.page.ts @@ -22,7 +22,7 @@ export class ReviewPage { async waitForZeroProcessing() { await expect(this.page.locator('[data-testid="review-page-control-panel"]')).toContainText( '0 awaiting processing', - { timeout: 60000 }, + { timeout: 200000 }, ); } diff --git a/integration-tests/tests/specs/features/search/lineage-field.spec.ts b/integration-tests/tests/specs/features/search/lineage-field.spec.ts index 6d31683df6..6cef8fab28 100644 --- a/integration-tests/tests/specs/features/search/lineage-field.spec.ts +++ b/integration-tests/tests/specs/features/search/lineage-field.spec.ts @@ -7,7 +7,7 @@ import { v4 as uuidv4 } from 'uuid'; const SEQUENCE = 'ATTGATCTCATCATTT'; test('Lineage field lineage counts', async ({ page, pageWithGroup }) => { - test.setTimeout(95_000); + test.setTimeout(900_000); const uuid = uuidv4(); await page.goto('/'); diff --git a/kubernetes/loculus/templates/_common-metadata.tpl b/kubernetes/loculus/templates/_common-metadata.tpl index 72968acb54..db2ec373e3 100644 --- a/kubernetes/loculus/templates/_common-metadata.tpl +++ b/kubernetes/loculus/templates/_common-metadata.tpl @@ -251,8 +251,8 @@ organisms: {{ end }} {{ .website | toYaml | nindent 6 }} {{- end }} - referenceGenomes: - {{ $instance.referenceGenomes | toYaml | nindent 6 }} + referenceGenome: + {{ include "loculus.mergeReferenceGenomes" $instance.referenceGenomes | nindent 6 }} {{- end }} {{- end }} diff --git a/kubernetes/loculus/templates/loculus-preprocessing-config.yaml b/kubernetes/loculus/templates/loculus-preprocessing-config.yaml index 63118bfc76..c51d881307 100644 --- a/kubernetes/loculus/templates/loculus-preprocessing-config.yaml +++ b/kubernetes/loculus/templates/loculus-preprocessing-config.yaml @@ -2,7 +2,12 @@ {{- $metadata := 
($organismConfig.schema | include "loculus.patchMetadataSchema" | fromYaml).metadata }} {{- $rawNucleotideSequences := (($organismConfig.schema | include "loculus.patchMetadataSchema" | fromYaml).nucleotideSequences) }} {{- $nucleotideSequences := ($rawNucleotideSequences | default "" ) }} -{{- $nucleotideSequencesList := (eq $rawNucleotideSequences nil | ternary (list "main") $rawNucleotideSequences) }} +{{- $referenceGenomes:= include "loculus.mergeReferenceGenomes" $organismConfig.referenceGenomes | fromYaml }} +{{- $genesList := (eq $referenceGenomes.genes nil | ternary (list) $referenceGenomes.genes) }} +{{- $genesDict := dict "genes" (list) -}} +{{- range $g := $genesList }} + {{- $_ := set $genesDict "genes" (append ($genesDict.genes) $g.name) -}} +{{- end }} {{- range $processingIndex, $processingConfig := $organismConfig.preprocessing }} {{- if $processingConfig.configFile }} {{- /* Use the enaDepositionConfig as the base config */}} @@ -17,7 +22,7 @@ data: preprocessing-config.yaml: | organism: {{ $organism }} {{- $preproAndEnaConfigFile | toYaml | nindent 4 }} - {{- (dict "nucleotideSequences" $nucleotideSequencesList) | toYaml | nindent 4 }} + {{- $genesDict | toYaml | nindent 4 }} processing_spec: {{- $args := dict "metadata" $metadata "nucleotideSequences" $nucleotideSequences }} {{- include "loculus.preprocessingSpecs" $args | nindent 6 }} diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml index ec8e04163c..00ef3d6358 100644 --- a/kubernetes/loculus/values.yaml +++ b/kubernetes/loculus/values.yaml @@ -1312,7 +1312,6 @@ defaultOrganismConfig: &defaultOrganismConfig replicas: 2 configFile: &preprocessingConfigFile log_level: DEBUG - genes: [] batch_size: 100 create_embl_file: true ingest: &ingest @@ -1356,16 +1355,18 @@ defaultOrganisms: - <<: *preprocessing version: 1 configFile: - <<: *preprocessingConfigFile - genes: [NP, VP35, VP40, GP, sGP, ssGP, VP30, VP24, L] - nextclade_dataset_name: nextstrain/ebola/sudan + <<: 
*preprocessingConfigFile + nucleotideSequences: + - name: "main" + nextclade_dataset_name: nextstrain/ebola/sudan nextclade_dataset_server: https://raw.githubusercontent.com/nextstrain/nextclade_data/ebola/data_output - <<: *preprocessing version: 2 configFile: <<: *preprocessingConfigFile - genes: [NP, VP35, VP40, GP, sGP, ssGP, VP30, VP24, L] - nextclade_dataset_name: nextstrain/ebola/sudan + nucleotideSequences: + - name: "main" + nextclade_dataset_name: nextstrain/ebola/sudan nextclade_dataset_server: https://raw.githubusercontent.com/nextstrain/nextclade_data/ebola/data_output west-nile: <<: *defaultOrganismConfig @@ -1417,9 +1418,10 @@ defaultOrganisms: version: 1 configFile: <<: *preprocessingConfigFile - nextclade_dataset_name: nextstrain/wnv/all-lineages + nucleotideSequences: + - name: "main" + nextclade_dataset_name: nextstrain/wnv/all-lineages nextclade_dataset_server: https://raw.githubusercontent.com/nextstrain/nextclade_data/wnv/data_output - genes: [capsid, prM, env, NS1, NS2A, NS2B, NS3, NS4A, 2K, NS4B, NS5] ingest: <<: *ingest configFile: @@ -1674,8 +1676,9 @@ defaultOrganisms: - "prepro" configFile: log_level: DEBUG - genes: [] batch_size: 100 + nucleotideSequences: + - name: "main" referenceGenomes: singleReference: nucleotideSequences: @@ -1734,9 +1737,14 @@ defaultOrganisms: configFile: <<: *preprocessingConfigFile log_level: DEBUG - nextclade_dataset_name: nextstrain/cchfv/linked + nucleotideSequences: + - name: L + nextclade_dataset_name: nextstrain/cchfv/linked/L + - name: M + nextclade_dataset_name: nextstrain/cchfv/linked/M + - name: S + nextclade_dataset_name: nextstrain/cchfv/linked/S nextclade_dataset_server: https://raw.githubusercontent.com/nextstrain/nextclade_data/cornelius-cchfv/data_output - genes: [RdRp, GPC, NP] ingest: <<: *ingest configFile: @@ -1772,7 +1780,7 @@ defaultOrganisms: sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/cchf/NP.fasta]]" enteroviruses: <<: *defaultOrganismConfig - enabled: false + 
enabled: true schema: <<: *schema organismName: "Enterovirus" @@ -1848,9 +1856,23 @@ defaultOrganisms: configFile: <<: *preprocessingConfigFile log_level: INFO - nextclade_dataset_name: community/hodcroftlab/enterovirus/enterovirus/linked + classify_with_nextclade_sort: True + minimizer_index: "https://raw.githubusercontent.com/alejandra-gonzalezsanchez/loculus-evs/master/evs_minimizer-index.json" + nucleotideSequences: + - name: CV-A16 + nextclade_dataset_name: community/hodcroftlab/enterovirus/enterovirus/linked/CV-A16 + accepted_sort_matches: ["community/hodcroftlab/enterovirus/cva16", "community/hodcroftlab/enterovirus/enterovirus/linked/CV-A16"] + gene_prefix: "CV-A16-" + - name: CV-A10 + nextclade_dataset_name: community/hodcroftlab/enterovirus/enterovirus/linked/CV-A10 + gene_prefix: "CV-A10-" + - name: EV-A71 + nextclade_dataset_name: community/hodcroftlab/enterovirus/enterovirus/linked/EV-A71 + gene_prefix: "EV-A71-" + - name: EV-D68 + gene_prefix: "EV-D68-" + nextclade_dataset_name: community/hodcroftlab/enterovirus/enterovirus/linked/EV-D68 nextclade_dataset_server: https://raw.githubusercontent.com/alejandra-gonzalezsanchez/nextclade_data/multi-pathogen-evs/data_output - genes: ["CV-A16-VP4", "CV-A16-VP2", "CV-A16-VP3", "CV-A16-VP1", "CV-A16-2A", "CV-A16-2B", "CV-A16-2C", "CV-A16-3A", "CV-A16-3B", "CV-A16-3C", "CV-A16-3D", "CV-A10-VP4", "CV-A10-VP2", "CV-A10-VP3", "CV-A10-VP1", "CV-A10-2A", "CV-A10-2B", "CV-A10-2C", "CV-A10-3A", "CV-A10-3B", "CV-A10-3C", "CV-A10-3D", "EV-A71-VP4", "EV-A71-VP2", "EV-A71-VP3", "EV-A71-VP1", "EV-A71-2A", "EV-A71-2B", "EV-A71-2C", "EV-A71-3A", "EV-A71-3B", "EV-A71-3C", "EV-A71-3D", "EV-D68-VP4", "EV-D68-VP2", "EV-D68-VP3", "EV-D68-VP1", "EV-D68-2A", "EV-D68-2B", "EV-D68-2C", "EV-D68-3A", "EV-D68-3B", "EV-D68-3C", "EV-D68-3D"] ingest: <<: *ingest configFile: @@ -2070,7 +2092,7 @@ enforceHTTPS: true registrationTermsMessage: > You must agree to the terms of use. 
-enaDeposition: +enaDeposition: submitToEnaProduction: false enaDbName: Loculus enaUniqueSuffix: Loculus diff --git a/preprocessing/nextclade/src/loculus_preprocessing/config.py b/preprocessing/nextclade/src/loculus_preprocessing/config.py index d84d5403ca..142ca67e05 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/config.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/config.py @@ -32,8 +32,21 @@ class AlignmentRequirement(StrEnum): # Determines whether ALL or ANY segments that a user provides must align. # ANY: warn if some segments fail and some segments align # ALL: error if any segment fails even if some segments align + # NONE: do not align any segments, just process them as-is + # - set if no nextclade dataset is provided ANY = "ANY" ALL = "ALL" + NONE = "NONE" + + +@dataclass +class NextcladeSequenceAndDataset: + name: str = "main" + nextclade_dataset_name: str | None = None + nextclade_dataset_tag: str | None = None + nextclade_dataset_server: str | None = None + accepted_sort_matches: list[str] | None = None + gene_prefix: str | None = None @dataclass @@ -51,21 +64,20 @@ class Config: keycloak_token_path: str = "realms/loculus/protocol/openid-connect/token" # noqa: S105 organism: str = "mpox" + nucleotideSequences: list[NextcladeSequenceAndDataset] = dataclasses.field( # noqa: N815 + default_factory=list + ) genes: list[str] = dataclasses.field(default_factory=list) - nucleotideSequences: list[str] = dataclasses.field(default_factory=lambda: ["main"]) # noqa: N815 processing_spec: dict[str, dict[str, Any]] = dataclasses.field(default_factory=dict) multi_segment: bool = False alignment_requirement: AlignmentRequirement = AlignmentRequirement.ALL - nextclade_dataset_name: str | None = None - nextclade_dataset_name_map: dict[str, str] | None = None - nextclade_dataset_tag: str | None = None nextclade_dataset_server: str = "https://data.clades.nextstrain.org/v3" - nextclade_dataset_server_map: dict[str, str] | None = None 
require_nextclade_sort_match: bool = False minimizer_url: str | None = None - accepted_dataset_matches: list[str] = dataclasses.field(default_factory=list) + classify_with_nextclade_sort: bool = False + create_embl_file: bool = False scientific_name: str = "Orthonairovirus haemorrhagiae" molecule_type: MoleculeType = MoleculeType.GENOMIC_RNA @@ -77,6 +89,36 @@ class Config: ) +def assign_nextclade_sequence_and_dataset( + nuc_seq_values: list[dict[str, Any]], +) -> list[NextcladeSequenceAndDataset]: + if not isinstance(nuc_seq_values, list): + error_msg = f"nucleotideSequences should be a list of dicts, got: {type(nuc_seq_values)}" + logger.error(error_msg) + raise ValueError(error_msg) + nextclade_sequence_and_dataset_list: list[NextcladeSequenceAndDataset] = [] + for value in nuc_seq_values: + if value is None or not isinstance(value, dict): + continue + seq_and_dataset = NextcladeSequenceAndDataset() + for seq_key, seq_value in value.items(): + if hasattr(seq_and_dataset, seq_key) and seq_value is not None: + setattr(seq_and_dataset, seq_key, seq_value) + nextclade_sequence_and_dataset_list.append(seq_and_dataset) + return nextclade_sequence_and_dataset_list + + +def set_alignment_requirement( + config: Config) -> AlignmentRequirement: + need_nextclade_dataset: bool = False + for sequence in config.nucleotideSequences: + if sequence.nextclade_dataset_name: + need_nextclade_dataset = True + if not need_nextclade_dataset: + return AlignmentRequirement.NONE + return config.alignment_requirement + + def load_config_from_yaml(config_file: str, config: Config | None = None) -> Config: config = Config() if config is None else copy.deepcopy(config) with open(config_file, encoding="utf-8") as file: @@ -84,6 +126,9 @@ def load_config_from_yaml(config_file: str, config: Config | None = None) -> Con logger.debug(f"Loaded config from {config_file}: {yaml_config}") for key, value in yaml_config.items(): if value is not None and hasattr(config, key): + if key == 
"nucleotideSequences": + setattr(config, key, assign_nextclade_sequence_and_dataset(value)) + continue attr = getattr(config, key) if isinstance(attr, StrEnum): try: @@ -171,4 +216,6 @@ def get_config(config_file: str | None = None, ignore_args: bool = False) -> Con if len(config.nucleotideSequences) > 1: config.multi_segment = True + config.alignment_requirement = set_alignment_requirement(config) + return config diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py index 1148472cbe..c4adba2b4d 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py @@ -24,7 +24,7 @@ submit_processed_sequences, upload_embl_file_to_presigned_url, ) -from .config import AlignmentRequirement, Config +from .config import AlignmentRequirement, Config, NextcladeSequenceAndDataset from .datatypes import ( AccessionVersion, Alerts, @@ -75,11 +75,12 @@ def parse_nextclade_tsv( ], result_dir: str, config: Config, - segment: SegmentName, + sequence_and_dataset: NextcladeSequenceAndDataset, ) -> tuple[ defaultdict[AccessionVersion, defaultdict[GeneName, list[AminoAcidInsertion]]], defaultdict[AccessionVersion, defaultdict[SegmentName, list[NucleotideInsertion]]], ]: + segment = sequence_and_dataset.name with Path(result_dir + "/nextclade.tsv").open(encoding="utf-8") as nextclade_tsv: reader = csv.DictReader(nextclade_tsv, delimiter="\t") for row in reader: @@ -94,7 +95,12 @@ def parse_nextclade_tsv( continue gene, val = ins.split(":", maxsplit=1) if gene in config.genes: - amino_acid_insertions[id][gene].append(val) + gene_name = ( + sequence_and_dataset.gene_prefix + gene + if sequence_and_dataset.gene_prefix + else gene + ) + amino_acid_insertions[id][gene_name].append(val) else: logger.debug( "Note: Nextclade found AA insertion in gene missing from config in gene " @@ -130,32 +136,20 @@ def parse_nextclade_json( def run_sort( - 
result_file_dir: str, + result_file: str, input_file: str, - alerts: Alerts, config: Config, - segment: SegmentName, + nextclade_dataset_server: str, dataset_dir: str, -) -> Alerts: +) -> pd.DataFrame: """ Run nextclade - use config.minimizer_url or default minimizer from nextclade server - - assert highest score is in config.accepted_dataset_matches - (default is nextclade_dataset_name) """ - nextclade_dataset_name = get_nextclade_dataset_name(config, segment) - if not config.accepted_dataset_matches and not nextclade_dataset_name: - logger.warning("No nextclade dataset name or accepted dataset match list found in config") - return alerts - nextclade_dataset_server = get_nextclade_dataset_server(config, segment) - if config.minimizer_url: minimizer_file = dataset_dir + "/minimizer/minimizer.json" - accepted_dataset_names = config.accepted_dataset_matches or [nextclade_dataset_name] # type: ignore - - result_file = result_file_dir + "/sort_output.tsv" - command = [ + subprocess_args_with_empty_strings = [ "nextclade3", "sort", input_file, @@ -174,14 +168,15 @@ f"{nextclade_dataset_server}", ] - logger.debug(f"Running nextclade sort: {command}") + subprocess_args = [arg for arg in subprocess_args_with_empty_strings if arg] + + logger.debug(f"Running nextclade sort: {subprocess_args}") - exit_code = subprocess.run(command, check=False).returncode # noqa: S603 + exit_code = subprocess.run(subprocess_args, check=False).returncode # noqa: S603 if exit_code != 0: msg = f"nextclade sort failed with exit code {exit_code}" raise Exception(msg) - - df = pd.read_csv( + return pd.read_csv( result_file, sep="\t", dtype={ @@ -192,6 +187,39 @@ }, ) + +def check_nextclade_sort_matches( # noqa: PLR0913, PLR0917 + result_file_dir: str, + input_file: str, + alerts: Alerts, + config: Config, + sequence_and_dataset: NextcladeSequenceAndDataset, + dataset_dir: str, +) -> Alerts: + """ + Run nextclade sort + - assert highest score is in 
sequence_and_dataset.accepted_sort_matches + (default is nextclade_dataset_name) + """ + nextclade_dataset_name = sequence_and_dataset.nextclade_dataset_name + if not sequence_and_dataset.accepted_sort_matches and not nextclade_dataset_name: + logger.warning("No nextclade dataset name or accepted dataset match list found in config") + return alerts + nextclade_dataset_server = ( + sequence_and_dataset.nextclade_dataset_server or config.nextclade_dataset_server + ) + + accepted_dataset_names = sequence_and_dataset.accepted_sort_matches or [nextclade_dataset_name] # type: ignore + + result_file = result_file_dir + "/sort_output.tsv" + df = run_sort( + result_file, + input_file, + config, + nextclade_dataset_server, + dataset_dir, + ) + hits = df.dropna(subset=["score"]).sort_values("score", ascending=False) best_hits = hits.groupby("seqName", as_index=False).first() @@ -208,7 +236,7 @@ def run_sort( "Sequence does not appear to match reference, per `nextclade sort`. " "Double check you are submitting to the correct organism." ), - ), + ) ) for _, row in best_hits.iterrows(): @@ -219,7 +247,7 @@ def run_sort( ProcessingAnnotationAlignment, AnnotationSourceType.NUCLEOTIDE_SEQUENCE, message=( - f"This sequence best matches {row['dataset']}, " + f"Sequence best matches {row['dataset']}, " "a different organism than the one you are submitting to: " f"{config.organism}. It is therefore not possible to release. " "Contact the administrator if you think this message is an error." 
@@ -230,12 +258,109 @@ def run_sort( return alerts +# TODO: running this for each sequence is inefficient, should be run once per batch +def classify_with_nextclade_sort( + input_unaligned_sequences: dict[str, NucleotideSequence | None], + unaligned_nucleotide_sequences: dict[SegmentName, NucleotideSequence | None], + errors: list[ProcessingAnnotation], + config: Config, + dataset_dir: str, +): + """ + Run nextclade sort + - assert highest score is in sequence_and_dataset.accepted_sort_matches + (default is nextclade_dataset_name) + """ + nextclade_dataset_server = config.nextclade_dataset_server + + with TemporaryDirectory(delete=not config.keep_tmp_dir) as result_dir: + input_file = result_dir + "/input.fasta" + os.makedirs(os.path.dirname(input_file), exist_ok=True) + with open(input_file, "w", encoding="utf-8") as f: + for id, seq in input_unaligned_sequences.items(): + f.write(f">{id}\n") + f.write(f"{seq}\n") + + result_file = result_dir + "/sort_output.tsv" + df = run_sort( + result_file, + input_file, + config, + nextclade_dataset_server, + dataset_dir, + ) + + no_hits = df[df["score"].isna()] + hits = df.dropna(subset=["score"]).sort_values("score", ascending=False) + for seq_name in no_hits["seqName"].unique(): + if seq_name not in hits["seqName"].unique(): + msg = ( + f"Sequence {seq_name} does not appear to match any reference for organism: " + f"{config.organism} per `nextclade sort`. " + f"Double check you are submitting to the correct organism." 
+ ) + # TODO: only error when config.alignment_requirement == "ALL", otherwise warn + errors.append( + ProcessingAnnotation.from_single( + ProcessingAnnotationAlignment, + AnnotationSourceType.NUCLEOTIDE_SEQUENCE, + message=msg, + ) + ) + + best_hits = hits.groupby("seqName", as_index=False).first() + logger.info(f"Found hits: {best_hits['seqName'].tolist()}") + + for _, row in best_hits.iterrows(): + not_found = True + for segment in config.nucleotideSequences: + default = [segment.nextclade_dataset_name] if segment.nextclade_dataset_name else [] + accepted_dataset_names = segment.accepted_sort_matches or default + if row["dataset"] in accepted_dataset_names: + unaligned_nucleotide_sequences[segment.name] = input_unaligned_sequences[ + row["seqName"] + ] + not_found = False + break + if not_found: + msg = ( + f"Sequence {row['seqName']} best matches {row['dataset']}, " + "which is currently not an accepted option for organism: " + f"{config.organism}. It is therefore not possible to release. " + "Contact the administrator if you think this message is an error." 
+ ) + errors.append( + ProcessingAnnotation.from_single( + ProcessingAnnotationAlignment, + AnnotationSourceType.NUCLEOTIDE_SEQUENCE, + message=msg, + ) + ) + + return (unaligned_nucleotide_sequences, errors) + + def assign_segment( input_unaligned_sequences: dict[str, NucleotideSequence | None], unaligned_nucleotide_sequences: dict[SegmentName, NucleotideSequence | None], errors: list[ProcessingAnnotation], config: Config, + dataset_dir: str | None = None, ): + if config.classify_with_nextclade_sort: + if not dataset_dir: + error_msg = ( + "Dataset directory must be provided when classify_with_nextclade_sort is True" + ) + logger.error(error_msg) + raise ValueError(error_msg) + return classify_with_nextclade_sort( + input_unaligned_sequences, + unaligned_nucleotide_sequences, + dataset_dir=dataset_dir, + errors=errors, + config=config, + ) valid_segments = set() duplicate_segments = set() if not config.nucleotideSequences: @@ -265,7 +390,8 @@ def assign_segment( unaligned_nucleotide_sequences, errors, ) - for segment in config.nucleotideSequences: + for sequence_and_dataset in config.nucleotideSequences: + segment = sequence_and_dataset.name unaligned_segment = [ data for data in input_unaligned_sequences @@ -303,7 +429,7 @@ def assign_segment( f"{', '.join(remaining_segments)}. " "Each metadata entry can have multiple corresponding fasta sequence " "entries with format _ valid segments are: " - f"{', '.join(config.nucleotideSequences)}." + f"{', '.join([sequence_and_dataset.name for sequence_and_dataset in config.nucleotideSequences])}." 
), ) ) @@ -341,8 +467,8 @@ def enrich_with_nextclade( # noqa: C901, PLR0914, PLR0915 id = entry.accessionVersion input_metadata[id] = entry.data.metadata input_metadata[id]["submitter"] = entry.data.submitter - input_metadata[id]["group_id"] = entry.data.group_id input_metadata[id]["submittedAt"] = entry.data.submittedAt + input_metadata[id]["group_id"] = entry.data.group_id aligned_aminoacid_sequences[id] = {} unaligned_nucleotide_sequences[id] = {} aligned_nucleotide_sequences[id] = {} @@ -356,6 +482,7 @@ def enrich_with_nextclade( # noqa: C901, PLR0914, PLR0915 unaligned_nucleotide_sequences=unaligned_nucleotide_sequences[id], errors=alerts.errors[id], config=config, + dataset_dir=dataset_dir, ) nextclade_metadata: defaultdict[ @@ -368,9 +495,12 @@ def enrich_with_nextclade( # noqa: C901, PLR0914, PLR0915 AccessionVersion, defaultdict[GeneName, list[AminoAcidInsertion]] ] = defaultdict(lambda: defaultdict(list)) with TemporaryDirectory(delete=not config.keep_tmp_dir) as result_dir: # noqa: PLR1702 - for segment in config.nucleotideSequences: - result_dir_seg = result_dir if segment == "main" else result_dir + "/" + segment - dataset_dir_seg = dataset_dir if segment == "main" else dataset_dir + "/" + segment + for sequence_and_dataset in config.nucleotideSequences: + segment = sequence_and_dataset.name + result_dir_seg = result_dir if not config.multi_segment else result_dir + "/" + segment + dataset_dir_seg = ( + dataset_dir if not config.multi_segment else dataset_dir + "/" + segment + ) input_file = result_dir_seg + "/input.fasta" os.makedirs(os.path.dirname(input_file), exist_ok=True) is_empty: bool = True @@ -384,7 +514,9 @@ def enrich_with_nextclade( # noqa: C901, PLR0914, PLR0915 continue if config.require_nextclade_sort_match: - alerts = run_sort(result_dir_seg, input_file, alerts, config, segment, dataset_dir) + alerts = check_nextclade_sort_matches( + result_dir_seg, input_file, alerts, config, sequence_and_dataset, dataset_dir + ) command = [ 
"nextclade3", @@ -422,7 +554,12 @@ def enrich_with_nextclade( # noqa: C901, PLR0914, PLR0915 masked_sequence = mask_terminal_gaps( str(aligned_sequence.seq), mask_char="X" ) - aligned_aminoacid_sequences[sequence_id][gene] = masked_sequence + gene_name = ( + sequence_and_dataset.gene_prefix + gene + if sequence_and_dataset.gene_prefix + else gene + ) + aligned_aminoacid_sequences[sequence_id][gene_name] = masked_sequence except FileNotFoundError: # TODO: Add warning to each sequence logger.info( @@ -435,7 +572,11 @@ def enrich_with_nextclade( # noqa: C901, PLR0914, PLR0915 result_dir_seg, nextclade_metadata, segment, unaligned_nucleotide_sequences ) # this includes the "annotation" field amino_acid_insertions, nucleotide_insertions = parse_nextclade_tsv( - amino_acid_insertions, nucleotide_insertions, result_dir_seg, config, segment + amino_acid_insertions, + nucleotide_insertions, + result_dir_seg, + config, + sequence_and_dataset, ) return { @@ -555,7 +696,7 @@ def add_input_metadata( if not unprocessed.nextcladeMetadata[segment]: message = ( "Nucleotide sequence failed to align" - if segment == "main" + if not config.multi_segment else f"Nucleotide sequence for {segment} failed to align" ) annotation = ProcessingAnnotation.from_single( @@ -722,15 +863,16 @@ def process_single( # noqa: C901 config=config, ) - for segment in config.nucleotideSequences: + for sequence_and_dataset in config.nucleotideSequences: + segment = sequence_and_dataset.name sequence = unprocessed.unalignedNucleotideSequences.get(segment, None) - key = "length" if segment == "main" else "length_" + segment + key = "length" if not config.multi_segment else "length_" + segment if key in config.processing_spec: output_metadata[key] = len(sequence) if sequence else 0 for output_field, spec_dict in config.processing_spec.items(): length_fields = [ - "length" if segment == "main" else "length_" + segment + "length" if not config.multi_segment else "length_" + segment.name for segment in 
config.nucleotideSequences ] if output_field in length_fields: @@ -771,7 +913,8 @@ def process_single( # noqa: C901 return processed_entry_no_alignment(id, unprocessed, output_metadata, errors, warnings) aligned_segments = set() - for segment in config.nucleotideSequences: + for sequence_and_dataset in config.nucleotideSequences: + segment = sequence_and_dataset.name if unprocessed.alignedNucleotideSequences.get(segment, None): aligned_segments.add(segment) @@ -788,7 +931,8 @@ def process_single( # noqa: C901 if config.create_embl_file and unprocessed.nextcladeMetadata is not None: annotations = {} - for segment in config.nucleotideSequences: + for sequence_and_dataset in config.nucleotideSequences: + segment = sequence_and_dataset.name if segment in unprocessed.nextcladeMetadata: annotations[segment] = None if unprocessed.nextcladeMetadata[segment]: @@ -853,7 +997,7 @@ def process_all( ) -> Sequence[SubmissionData]: processed_results = [] logger.debug(f"Processing {len(unprocessed)} unprocessed sequences") - if config.nextclade_dataset_name: + if config.alignment_requirement != AlignmentRequirement.NONE: nextclade_results = enrich_with_nextclade(unprocessed, dataset_dir, config) for id, result in nextclade_results.items(): try: @@ -874,30 +1018,15 @@ def process_all( return processed_results -def get_nextclade_dataset_name(config: Config, segment: SegmentName) -> str | None: - if config.nextclade_dataset_name_map and segment in config.nextclade_dataset_name_map: - return config.nextclade_dataset_name_map[segment] - if not config.nextclade_dataset_name: - return None - return ( - config.nextclade_dataset_name - if segment == "main" - else config.nextclade_dataset_name + "/" + segment - ) - - -def get_nextclade_dataset_server(config: Config, segment: SegmentName) -> str: - if config.nextclade_dataset_server_map and segment in config.nextclade_dataset_server_map: - return config.nextclade_dataset_server_map[segment] - return config.nextclade_dataset_server - - def 
download_nextclade_dataset(dataset_dir: str, config: Config) -> None: - for segment in config.nucleotideSequences: - nextclade_dataset_name = get_nextclade_dataset_name(config, segment) - nextclade_dataset_server = get_nextclade_dataset_server(config, segment) + for sequence_and_dataset in config.nucleotideSequences: + name = sequence_and_dataset.name + nextclade_dataset_name = sequence_and_dataset.nextclade_dataset_name + nextclade_dataset_server = ( + sequence_and_dataset.nextclade_dataset_server or config.nextclade_dataset_server + ) - dataset_dir_seg = dataset_dir if segment == "main" else dataset_dir + "/" + segment + dataset_dir_seg = dataset_dir if not config.multi_segment else dataset_dir + "/" + name dataset_download_command = [ "nextclade3", "dataset", @@ -907,8 +1036,8 @@ def download_nextclade_dataset(dataset_dir: str, config: Config) -> None: f"--output-dir={dataset_dir_seg}", ] - if config.nextclade_dataset_tag is not None: - dataset_download_command.append(f"--tag={config.nextclade_dataset_tag}") + if sequence_and_dataset.nextclade_dataset_tag is not None: + dataset_download_command.append(f"--tag={sequence_and_dataset.nextclade_dataset_tag}") logger.info("Downloading Nextclade dataset: %s", dataset_download_command) if subprocess.run(dataset_download_command, check=False).returncode != 0: # noqa: S603 @@ -952,7 +1081,7 @@ def upload_flatfiles(processed: Sequence[SubmissionData], config: Config) -> Non def run(config: Config) -> None: with TemporaryDirectory(delete=not config.keep_tmp_dir) as dataset_dir: - if config.nextclade_dataset_name: + if config.alignment_requirement != AlignmentRequirement.NONE: download_nextclade_dataset(dataset_dir, config) if config.minimizer_url and config.require_nextclade_sort_match: download_minimizer(config.minimizer_url, dataset_dir + "/minimizer/minimizer.json") diff --git a/preprocessing/nextclade/tests/factory_methods.py b/preprocessing/nextclade/tests/factory_methods.py index 6911da68fe..23411c99e9 100644 --- 
a/preprocessing/nextclade/tests/factory_methods.py +++ b/preprocessing/nextclade/tests/factory_methods.py @@ -46,7 +46,7 @@ class ProcessingAnnotationHelper: @dataclass class ProcessedAlignment: unalignedNucleotideSequences: dict[str, str | None] = field( # noqa: N815 - default_factory=lambda: {"main": None} + default_factory=dict ) alignedNucleotideSequences: dict[str, str | None] = field( # noqa: N815 default_factory=dict diff --git a/preprocessing/nextclade/tests/multi_segment_config.yaml b/preprocessing/nextclade/tests/multi_segment_config.yaml index bfac8b94e7..bf81cb8121 100644 --- a/preprocessing/nextclade/tests/multi_segment_config.yaml +++ b/preprocessing/nextclade/tests/multi_segment_config.yaml @@ -6,11 +6,12 @@ genes: - VP24EbolaZaire - LEbolaZaire log_level: DEBUG -nextclade_dataset_name: ebola-dataset nextclade_dataset_server: fake_url nucleotideSequences: -- ebola-sudan -- ebola-zaire +- name: ebola-sudan + nextclade_dataset_name: ebola-dataset/ebola-sudan +- name: ebola-zaire + nextclade_dataset_name: ebola-dataset/ebola-zaire organism: multi-ebola-test processing_spec: totalInsertedNucs_ebola-zaire: diff --git a/preprocessing/nextclade/tests/single_segment_config.yaml b/preprocessing/nextclade/tests/single_segment_config.yaml index 670665961c..0ae57b7a3a 100644 --- a/preprocessing/nextclade/tests/single_segment_config.yaml +++ b/preprocessing/nextclade/tests/single_segment_config.yaml @@ -3,14 +3,14 @@ genes: - NPEbolaSudan - VP35EbolaSudan log_level: DEBUG -nextclade_dataset_name: ebola-sudan-test-dataset nextclade_dataset_server: fake_url scientific_name: "Test Ebola Sudan Virus" molecule_type: "genomic RNA" topology: "linear" db_name: "Loculus" nucleotideSequences: -- main +- name: main + nextclade_dataset_name: ebola-sudan-test-dataset organism: ebola-sudan-test processing_spec: completeness: diff --git a/website/src/config.ts b/website/src/config.ts index e91a4050da..cf2043db22 100644 --- a/website/src/config.ts +++ b/website/src/config.ts @@ 
-227,7 +227,7 @@ export function getLapisUrl(serviceConfig: ServiceUrls, organism: string): strin } export function getReferenceGenome(organism: string): ReferenceGenome { - return Object.values(getConfig(organism).referenceGenomes)[0]; + return getConfig(organism).referenceGenome; } const getAccession = (n: NamedSequence): ReferenceAccession => { diff --git a/website/src/types/config.ts b/website/src/types/config.ts index 68fb966a39..bba32812de 100644 --- a/website/src/types/config.ts +++ b/website/src/types/config.ts @@ -1,7 +1,7 @@ import z from 'zod'; import { mutationProportionCount, orderByType } from './lapis.ts'; -import { referenceGenomes } from './referencesGenomes.ts'; +import { referenceGenome } from './referencesGenomes.ts'; // These metadata types need to be kept in sync with the backend config class `MetadataType` in Config.kt export const metadataPossibleTypes = z.enum([ @@ -156,7 +156,7 @@ export type Schema = z.infer; export const instanceConfig = z.object({ schema, - referenceGenomes, + referenceGenome, }); export type InstanceConfig = z.infer;