feat(prepro): add sort code

anna-parker · anna-parker · commit 3d4d2ddfdfc6 · 2025-08-12T09:29:06.000+02:00
diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml
@@ -1248,20 +1248,20 @@ defaultOrganisms:
       - <<: *preprocessing
         version: 1
         configFile:
-          <<: *preprocessingConfigFile      
+          <<: *preprocessingConfigFile
           genes: [NP, VP35, VP40, GP, sGP, ssGP, VP30, VP24, L]
           nucleotideSequences:
-          - name: "main"
-            nextclade_dataset_name: nextstrain/ebola/sudan
+            - name: "main"
+              nextclade_dataset_name: nextstrain/ebola/sudan
           nextclade_dataset_server: https://raw.githubusercontent.com/nextstrain/nextclade_data/ebola/data_output
       - <<: *preprocessing
         version: 2
         configFile:
           <<: *preprocessingConfigFile
           genes: [NP, VP35, VP40, GP, sGP, ssGP, VP30, VP24, L]
           nucleotideSequences:
-          - name: "main"
-            nextclade_dataset_name: nextstrain/ebola/sudan
+            - name: "main"
+              nextclade_dataset_name: nextstrain/ebola/sudan
           nextclade_dataset_server: https://raw.githubusercontent.com/nextstrain/nextclade_data/ebola/data_output
   west-nile:
     <<: *defaultOrganismConfig
@@ -1314,8 +1314,8 @@ defaultOrganisms:
         configFile:
           <<: *preprocessingConfigFile
           nucleotideSequences:
-          - name: "main"
-            nextclade_dataset_name: nextstrain/wnv/all-lineages
+            - name: "main"
+              nextclade_dataset_name: nextstrain/wnv/all-lineages
           nextclade_dataset_server: https://raw.githubusercontent.com/nextstrain/nextclade_data/wnv/data_output
           genes: [capsid, prM, env, NS1, NS2A, NS2B, NS3, NS4A, 2K, NS4B, NS5]
     ingest:
@@ -1757,9 +1757,11 @@ defaultOrganisms:
           <<: *preprocessingConfigFile
           log_level: INFO
           classify_with_nextclade_sort: True
+          minimizer_index: "https://raw.githubusercontent.com/alejandra-gonzalezsanchez/loculus-evs/master/evs_minimizer-index.json"
           nucleotideSequences:
             - name: CV-A16
               nextclade_dataset_name: community/hodcroftlab/enterovirus/enterovirus/linked/CV-A16
+              accepted_sort_matches: ["community/hodcroftlab/enterovirus/cva16", "community/hodcroftlab/enterovirus/enterovirus/linked/CV-A16"]
             - name: CV-A10
               nextclade_dataset_name: community/hodcroftlab/enterovirus/enterovirus/linked/CV-A10
             - name: EV-A71
@@ -1985,7 +1987,7 @@ enforceHTTPS: true
 registrationTermsMessage: >
   You must agree to the <a href="http://main.loculus.org/terms">terms of use</a>.
 
-enaDeposition: 
+enaDeposition:
   submitToEnaProduction: false
   enaDbName: Loculus
   enaUniqueSuffix: Loculus
diff --git a/preprocessing/nextclade/src/loculus_preprocessing/config.py b/preprocessing/nextclade/src/loculus_preprocessing/config.py
@@ -36,11 +36,6 @@ class AlignmentRequirement(StrEnum):
     ALL = "ALL"
 
 
-class AlignmentRequirement(Enum):
-    ANY = "ANY"
-    ALL = "ALL"
-
-
 @dataclass
 class NextcladeSequenceAndDataset:
     name: str = "main"
@@ -66,26 +61,19 @@ class Config:
     keycloak_token_path: str = "realms/loculus/protocol/openid-connect/token"  # noqa: S105
 
     organism: str = "mpox"
+    nucleotideSequences: list[NextcladeSequenceAndDataset] = dataclasses.field(  # noqa: N815
+        default_factory=list
+    )
     genes: list[str] = dataclasses.field(default_factory=list)
-    nucleotideSequences: list[str] = dataclasses.field(default_factory=lambda: ["main"])  # noqa: N815
     processing_spec: dict[str, dict[str, Any]] = dataclasses.field(default_factory=dict)
     multi_segment: bool = False
 
     alignment_requirement: AlignmentRequirement = AlignmentRequirement.ALL
-    nextclade_dataset_name: str | None = None
-    nextclade_dataset_name_map: dict[str, str] | None = None
-    nextclade_dataset_tag: str | None = None
     nextclade_dataset_server: str = "https://data.clades.nextstrain.org/v3"
+
     require_nextclade_sort_match: bool = False
     minimizer_url: str | None = None
-    nucleotideSequences: list[NextcladeSequenceAndDataset] = dataclasses.field(  # noqa: N815
-        default_factory=list
-    )
-    genes: list[str] = dataclasses.field(default_factory=list)
-    multi_segment: bool = False
     classify_with_nextclade_sort: bool = False
-    alignment_requirement: AlignmentRequirement = AlignmentRequirement.ALL
-    require_nextclade_sort_match: bool = False
 
     create_embl_file: bool = False
     scientific_name: str = "Orthonairovirus haemorrhagiae"
@@ -97,8 +85,6 @@ class Config:
         default_factory=EmblInfoMetadataPropertyNames
     )
 
-    processing_spec: dict[str, dict[str, Any]] = dataclasses.field(default_factory=dict)
-
 
 def assign_nextclade_sequence_and_dataset(
     nuc_seq_values: list[dict[str, Any]],
diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py
@@ -134,7 +134,7 @@ def run_sort(
     config: Config,
     nextclade_dataset_server: str,
     dataset_dir: str,
-) -> None:
+) -> pd.DataFrame:
     """
     Run nextclade
     - use config.minimizer_url or default minimizer from nextclade server
@@ -169,8 +169,19 @@ def run_sort(
     if exit_code != 0:
         msg = f"nextclade sort failed with exit code {exit_code}"
         raise Exception(msg)
+    return pd.read_csv(
+        result_file,
+        sep="\t",
+        dtype={
+            "index": "Int64",
+            "score": "float64",
+            "seqName": "string",
+            "dataset": "string",
+        },
+    )
 
 
+# TODO: running this for each sequence is inefficient, should be run once per batch
 def check_nextclade_sort_matches(  # noqa: PLR0913, PLR0917
     result_file_dir: str,
     input_file: str,
@@ -195,25 +206,14 @@ def check_nextclade_sort_matches(  # noqa: PLR0913, PLR0917
     accepted_dataset_names = sequence_and_dataset.accepted_sort_matches or [nextclade_dataset_name]  # type: ignore
 
     result_file = result_file_dir + "/sort_output.tsv"
-    run_sort(
+    df = run_sort(
         result_file,
         input_file,
         config,
         nextclade_dataset_server,
         dataset_dir,
     )
 
-    df = pd.read_csv(
-        result_file,
-        sep="\t",
-        dtype={
-            "index": "Int64",
-            "score": "float64",
-            "seqName": "string",
-            "dataset": "string",
-        },
-    )
-
     hits = df.dropna(subset=["score"]).sort_values("score", ascending=False)
     best_hits = hits.groupby("seqName", as_index=False).first()
 
@@ -250,21 +250,141 @@ def check_nextclade_sort_matches(  # noqa: PLR0913, PLR0917
     return alerts
 
 
-def assign_segment(
+def classify_with_nextclade_sort(
+    input_unaligned_sequences: dict[str, NucleotideSequence | None],
+    unaligned_nucleotide_sequences: dict[SegmentName, NucleotideSequence | None],
+    aligned_nucleotide_sequences: dict[SegmentName, NucleotideSequence | None],
+    errors: list[ProcessingAnnotation],
+    config: Config,
+    dataset_dir: str,
+):
+    """
+    Run nextclade sort on all input sequences and assign each one to a segment
+    - a sequence goes to the segment whose accepted_sort_matches (default is
+    nextclade_dataset_name) contains its best hit; otherwise an error is added
+    """
+    nextclade_dataset_server = config.nextclade_dataset_server
+
+    with TemporaryDirectory(delete=not config.keep_tmp_dir) as result_dir:
+        input_file = result_dir + "/input.fasta"
+        os.makedirs(os.path.dirname(input_file), exist_ok=True)
+        with open(input_file, "w", encoding="utf-8") as f:
+            for id, seq in input_unaligned_sequences.items():
+                f.write(f">{id}\n")
+                f.write(f"{seq}\n")
+
+        result_file = result_dir + "/sort_output.tsv"
+        df = run_sort(
+            result_file,
+            input_file,
+            config,
+            nextclade_dataset_server,
+            dataset_dir,
+        )
+
+        no_hits = df[df["score"].isna()]
+        hits = df.dropna(subset=["score"]).sort_values("score", ascending=False)
+        for seq_name in no_hits["seqName"].unique():
+            if seq_name not in hits["seqName"].unique():
+                msg = (
+                    f"Sequence {seq_name} does not appear to match any reference for organism: "
+                    f"{config.organism} per `nextclade sort`. "
+                    f"Double check you are submitting to the correct organism."
+                )
+                # TODO: only error when config.alignment_requirement == "ALL", otherwise warn
+                errors.append(
+                    sequence_annotation(
+                        name="alignment",
+                        message=msg,
+                    )
+                )
+
+        best_hits = hits.groupby("seqName", as_index=False).first()
+        logger.info(f"Found hits: {best_hits['seqName'].tolist()}")
+
+        for _, row in best_hits.iterrows():
+            not_found = True
+            for segment in config.nucleotideSequences:
+                accepted_dataset_names = segment.accepted_sort_matches or [
+                    segment.nextclade_dataset_name
+                ]
+                if row["dataset"] in accepted_dataset_names:
+                    unaligned_nucleotide_sequences[segment.name] = input_unaligned_sequences[
+                        row["seqName"]
+                    ]
+                    aligned_nucleotide_sequences[segment.name] = None
+                    not_found = False
+                    break
+            if not_found:
+                msg = (
+                    f"Sequence {row['seqName']} best matches {row['dataset']}, "
+                    "which is currently not an accepted option for organism: "
+                    f"{config.organism}. It is therefore not possible to release. "
+                    "Contact the administrator if you think this message is an error."
+                )
+                errors.append(
+                    sequence_annotation(
+                        name="alignment",
+                        message=msg,
+                    )
+                )
+
+    return (unaligned_nucleotide_sequences, aligned_nucleotide_sequences, errors)
+
+
+def assign_segment(  # noqa: PLR0913, PLR0917
     input_unaligned_sequences: dict[str, NucleotideSequence | None],
     unaligned_nucleotide_sequences: dict[SegmentName, NucleotideSequence | None],
     errors: list[ProcessingAnnotation],
     aligned_nucleotide_sequences: dict[SegmentName, NucleotideSequence | None],
     config: Config,
+    dataset_dir: str,
 ):
     if config.classify_with_nextclade_sort:
-        # TODO: add this functionality
-        raise NotImplementedError(
-            "Classify with nextclade sort is not implemented yet. "
-            "Please set classify_with_nextclade_sort to False in the config."
+        return classify_with_nextclade_sort(
+            input_unaligned_sequences,
+            unaligned_nucleotide_sequences,
+            aligned_nucleotide_sequences,
+            dataset_dir=dataset_dir,
+            errors=errors,
+            config=config,
         )
     valid_segments = set()
     duplicate_segments = set()
+    if not config.multi_segment:
+        aligned_nucleotide_sequences["main"] = None
+        if len(input_unaligned_sequences) > 1:
+            errors.append(
+                ProcessingAnnotation(
+                    unprocessedFields=[
+                        AnnotationSource(
+                            name="alignment",
+                            type=AnnotationSourceType.NUCLEOTIDE_SEQUENCE,
+                        ),
+                    ],
+                    processedFields=[
+                        AnnotationSource(
+                            name="alignment",
+                            type=AnnotationSourceType.NUCLEOTIDE_SEQUENCE,
+                        ),
+                    ],
+                    message=(
+                        f"Multiple sequences: {list(input_unaligned_sequences.keys())} found in the"
+                        f" input data, but organism: {config.organism} is single-segmented. "
+                        "Please check that your metadata and sequences are annotated correctly. "
+                        "Each metadata entry should have a single corresponding fasta sequence "
+                        "entry with the same submissionId."
+                    ),
+                )
+            )
+        else:
+            _, value = next(iter(input_unaligned_sequences.items()))
+            unaligned_nucleotide_sequences["main"] = value
+        return (
+            unaligned_nucleotide_sequences,
+            aligned_nucleotide_sequences,
+            errors,
+        )
     for sequence_and_dataset in config.nucleotideSequences:
         segment = sequence_and_dataset.name
         unaligned_segment = [
@@ -342,8 +462,6 @@ def enrich_with_nextclade(  # noqa: C901, PLR0914, PLR0915
         aligned_nucleotide_sequences[id] = {}
         alerts.warnings[id] = []
         alerts.errors[id] = []
-        for gene in config.genes:
-            aligned_aminoacid_sequences[id][gene] = None
         (
             unaligned_nucleotide_sequences[id],
             aligned_nucleotide_sequences[id],
@@ -354,6 +472,7 @@ def enrich_with_nextclade(  # noqa: C901, PLR0914, PLR0915
             errors=alerts.errors[id],
             aligned_nucleotide_sequences=aligned_nucleotide_sequences[id],
             config=config,
+            dataset_dir=dataset_dir,
         )
 
     nextclade_metadata: defaultdict[
@@ -776,7 +895,8 @@ def process_single(  # noqa: C901
         )
 
     aligned_segments = set()
-    for segment in config.nucleotideSequences:
+    for sequence_and_dataset in config.nucleotideSequences:
+        segment = sequence_and_dataset.name
         if unprocessed.alignedNucleotideSequences.get(segment, None):
             aligned_segments.add(segment)
 
@@ -792,7 +912,8 @@ def process_single(  # noqa: C901
 
     if config.create_embl_file and unprocessed.nextcladeMetadata is not None:
         annotations = {}
-        for segment in config.nucleotideSequences:
+        for sequence_and_dataset in config.nucleotideSequences:
+            segment = sequence_and_dataset.name
             if segment in unprocessed.nextcladeMetadata:
                 annotations[segment] = None
                 if unprocessed.nextcladeMetadata[segment]: