Skip to content

Commit 3d4d2dd

Browse files
committed
feat(prepro): add sort code
1 parent 9e528ec commit 3d4d2dd

File tree

3 files changed

+157
-48
lines changed

3 files changed

+157
-48
lines changed

kubernetes/loculus/values.yaml

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1248,20 +1248,20 @@ defaultOrganisms:
12481248
- <<: *preprocessing
12491249
version: 1
12501250
configFile:
1251-
<<: *preprocessingConfigFile
1251+
<<: *preprocessingConfigFile
12521252
genes: [NP, VP35, VP40, GP, sGP, ssGP, VP30, VP24, L]
12531253
nucleotideSequences:
1254-
- name: "main"
1255-
nextclade_dataset_name: nextstrain/ebola/sudan
1254+
- name: "main"
1255+
nextclade_dataset_name: nextstrain/ebola/sudan
12561256
nextclade_dataset_server: https://raw.githubusercontent.com/nextstrain/nextclade_data/ebola/data_output
12571257
- <<: *preprocessing
12581258
version: 2
12591259
configFile:
12601260
<<: *preprocessingConfigFile
12611261
genes: [NP, VP35, VP40, GP, sGP, ssGP, VP30, VP24, L]
12621262
nucleotideSequences:
1263-
- name: "main"
1264-
nextclade_dataset_name: nextstrain/ebola/sudan
1263+
- name: "main"
1264+
nextclade_dataset_name: nextstrain/ebola/sudan
12651265
nextclade_dataset_server: https://raw.githubusercontent.com/nextstrain/nextclade_data/ebola/data_output
12661266
west-nile:
12671267
<<: *defaultOrganismConfig
@@ -1314,8 +1314,8 @@ defaultOrganisms:
13141314
configFile:
13151315
<<: *preprocessingConfigFile
13161316
nucleotideSequences:
1317-
- name: "main"
1318-
nextclade_dataset_name: nextstrain/wnv/all-lineages
1317+
- name: "main"
1318+
nextclade_dataset_name: nextstrain/wnv/all-lineages
13191319
nextclade_dataset_server: https://raw.githubusercontent.com/nextstrain/nextclade_data/wnv/data_output
13201320
genes: [capsid, prM, env, NS1, NS2A, NS2B, NS3, NS4A, 2K, NS4B, NS5]
13211321
ingest:
@@ -1757,9 +1757,11 @@ defaultOrganisms:
17571757
<<: *preprocessingConfigFile
17581758
log_level: INFO
17591759
classify_with_nextclade_sort: True
1760+
minimizer_index: "https://raw.githubusercontent.com/alejandra-gonzalezsanchez/loculus-evs/master/evs_minimizer-index.json"
17601761
nucleotideSequences:
17611762
- name: CV-A16
17621763
nextclade_dataset_name: community/hodcroftlab/enterovirus/enterovirus/linked/CV-A16
1764+
accepted_sort_matches: ["community/hodcroftlab/enterovirus/cva16", "community/hodcroftlab/enterovirus/enterovirus/linked/CV-A16"]
17631765
- name: CV-A10
17641766
nextclade_dataset_name: community/hodcroftlab/enterovirus/enterovirus/linked/CV-A10
17651767
- name: EV-A71
@@ -1985,7 +1987,7 @@ enforceHTTPS: true
19851987
registrationTermsMessage: >
19861988
You must agree to the <a href="http://main.loculus.org/terms">terms of use</a>.
19871989
1988-
enaDeposition:
1990+
enaDeposition:
19891991
submitToEnaProduction: false
19901992
enaDbName: Loculus
19911993
enaUniqueSuffix: Loculus

preprocessing/nextclade/src/loculus_preprocessing/config.py

Lines changed: 4 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -36,11 +36,6 @@ class AlignmentRequirement(StrEnum):
3636
ALL = "ALL"
3737

3838

39-
class AlignmentRequirement(Enum):
40-
ANY = "ANY"
41-
ALL = "ALL"
42-
43-
4439
@dataclass
4540
class NextcladeSequenceAndDataset:
4641
name: str = "main"
@@ -66,26 +61,19 @@ class Config:
6661
keycloak_token_path: str = "realms/loculus/protocol/openid-connect/token" # noqa: S105
6762

6863
organism: str = "mpox"
64+
nucleotideSequences: list[NextcladeSequenceAndDataset] = dataclasses.field( # noqa: N815
65+
default_factory=list
66+
)
6967
genes: list[str] = dataclasses.field(default_factory=list)
70-
nucleotideSequences: list[str] = dataclasses.field(default_factory=lambda: ["main"]) # noqa: N815
7168
processing_spec: dict[str, dict[str, Any]] = dataclasses.field(default_factory=dict)
7269
multi_segment: bool = False
7370

7471
alignment_requirement: AlignmentRequirement = AlignmentRequirement.ALL
75-
nextclade_dataset_name: str | None = None
76-
nextclade_dataset_name_map: dict[str, str] | None = None
77-
nextclade_dataset_tag: str | None = None
7872
nextclade_dataset_server: str = "https://data.clades.nextstrain.org/v3"
73+
7974
require_nextclade_sort_match: bool = False
8075
minimizer_url: str | None = None
81-
nucleotideSequences: list[NextcladeSequenceAndDataset] = dataclasses.field( # noqa: N815
82-
default_factory=list
83-
)
84-
genes: list[str] = dataclasses.field(default_factory=list)
85-
multi_segment: bool = False
8676
classify_with_nextclade_sort: bool = False
87-
alignment_requirement: AlignmentRequirement = AlignmentRequirement.ALL
88-
require_nextclade_sort_match: bool = False
8977

9078
create_embl_file: bool = False
9179
scientific_name: str = "Orthonairovirus haemorrhagiae"
@@ -97,8 +85,6 @@ class Config:
9785
default_factory=EmblInfoMetadataPropertyNames
9886
)
9987

100-
processing_spec: dict[str, dict[str, Any]] = dataclasses.field(default_factory=dict)
101-
10288

10389
def assign_nextclade_sequence_and_dataset(
10490
nuc_seq_values: list[dict[str, Any]],

preprocessing/nextclade/src/loculus_preprocessing/prepro.py

Lines changed: 143 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,7 @@ def run_sort(
134134
config: Config,
135135
nextclade_dataset_server: str,
136136
dataset_dir: str,
137-
) -> None:
137+
) -> pd.DataFrame:
138138
"""
139139
Run nextclade
140140
- use config.minimizer_url or default minimizer from nextclade server
@@ -169,8 +169,19 @@ def run_sort(
169169
if exit_code != 0:
170170
msg = f"nextclade sort failed with exit code {exit_code}"
171171
raise Exception(msg)
172+
return pd.read_csv(
173+
result_file,
174+
sep="\t",
175+
dtype={
176+
"index": "Int64",
177+
"score": "float64",
178+
"seqName": "string",
179+
"dataset": "string",
180+
},
181+
)
172182

173183

184+
# TODO: running this for each sequence is inefficient, should be run once per batch
174185
def check_nextclade_sort_matches( # noqa: PLR0913, PLR0917
175186
result_file_dir: str,
176187
input_file: str,
@@ -195,25 +206,14 @@ def check_nextclade_sort_matches( # noqa: PLR0913, PLR0917
195206
accepted_dataset_names = sequence_and_dataset.accepted_sort_matches or [nextclade_dataset_name] # type: ignore
196207

197208
result_file = result_file_dir + "/sort_output.tsv"
198-
run_sort(
209+
df = run_sort(
199210
result_file,
200211
input_file,
201212
config,
202213
nextclade_dataset_server,
203214
dataset_dir,
204215
)
205216

206-
df = pd.read_csv(
207-
result_file,
208-
sep="\t",
209-
dtype={
210-
"index": "Int64",
211-
"score": "float64",
212-
"seqName": "string",
213-
"dataset": "string",
214-
},
215-
)
216-
217217
hits = df.dropna(subset=["score"]).sort_values("score", ascending=False)
218218
best_hits = hits.groupby("seqName", as_index=False).first()
219219

@@ -250,21 +250,141 @@ def check_nextclade_sort_matches( # noqa: PLR0913, PLR0917
250250
return alerts
251251

252252

253-
def assign_segment(
253+
def classify_with_nextclade_sort(
254+
input_unaligned_sequences: dict[str, NucleotideSequence | None],
255+
unaligned_nucleotide_sequences: dict[SegmentName, NucleotideSequence | None],
256+
aligned_nucleotide_sequences: dict[SegmentName, NucleotideSequence | None],
257+
errors: list[ProcessingAnnotation],
258+
config: Config,
259+
dataset_dir: str,
260+
):
261+
"""
262+
Run nextclade sort
263+
- assert highest score is in sequence_and_dataset.accepted_sort_matches
264+
(default is nextclade_dataset_name)
265+
"""
266+
nextclade_dataset_server = config.nextclade_dataset_server
267+
268+
with TemporaryDirectory(delete=not config.keep_tmp_dir) as result_dir:
269+
input_file = result_dir + "/input.fasta"
270+
os.makedirs(os.path.dirname(input_file), exist_ok=True)
271+
with open(input_file, "w", encoding="utf-8") as f:
272+
for id, seq in input_unaligned_sequences.items():
273+
f.write(f">{id}\n")
274+
f.write(f"{seq}\n")
275+
276+
result_file = result_dir + "/sort_output.tsv"
277+
df = run_sort(
278+
result_file,
279+
input_file,
280+
config,
281+
nextclade_dataset_server,
282+
dataset_dir,
283+
)
284+
285+
no_hits = df[df["score"].isna()]
286+
hits = df.dropna(subset=["score"]).sort_values("score", ascending=False)
287+
for seq_name in no_hits["seqName"].unique():
288+
if seq_name not in hits["seqName"].unique():
289+
msg = (
290+
f"Sequence {seq_name} does not appear to match any reference for organism: "
291+
f"{config.organism} per `nextclade sort`. "
292+
f"Double check you are submitting to the correct organism."
293+
)
294+
# TODO: only error when config.alignment_requirement == "ALL", otherwise warn
295+
errors.append(
296+
sequence_annotation(
297+
name="alignment",
298+
message=msg,
299+
)
300+
)
301+
302+
best_hits = hits.groupby("seqName", as_index=False).first()
303+
logger.info(f"Found hits: {best_hits['seqName'].tolist()}")
304+
305+
for _, row in best_hits.iterrows():
306+
not_found = True
307+
for segment in config.nucleotideSequences:
308+
accepted_dataset_names = segment.accepted_sort_matches or [
309+
segment.nextclade_dataset_name
310+
]
311+
if row["dataset"] in accepted_dataset_names:
312+
unaligned_nucleotide_sequences[segment.name] = input_unaligned_sequences[
313+
row["seqName"]
314+
]
315+
aligned_nucleotide_sequences[segment.name] = None
316+
not_found = False
317+
break
318+
if not_found:
319+
msg = (
320+
f"Sequence {row['seqName']} best matches {row['dataset']}, "
321+
"which is currently not an accepted option for organism: "
322+
f"{config.organism}. It is therefore not possible to release. "
323+
"Contact the administrator if you think this message is an error."
324+
)
325+
errors.append(
326+
sequence_annotation(
327+
name="alignment",
328+
message=msg,
329+
)
330+
)
331+
332+
return (unaligned_nucleotide_sequences, aligned_nucleotide_sequences, errors)
333+
334+
335+
def assign_segment( # noqa: PLR0913, PLR0917
254336
input_unaligned_sequences: dict[str, NucleotideSequence | None],
255337
unaligned_nucleotide_sequences: dict[SegmentName, NucleotideSequence | None],
256338
errors: list[ProcessingAnnotation],
257339
aligned_nucleotide_sequences: dict[SegmentName, NucleotideSequence | None],
258340
config: Config,
341+
dataset_dir: str,
259342
):
260343
if config.classify_with_nextclade_sort:
261-
# TODO: add this functionality
262-
raise NotImplementedError(
263-
"Classify with nextclade sort is not implemented yet. "
264-
"Please set classify_with_nextclade_sort to False in the config."
344+
return classify_with_nextclade_sort(
345+
input_unaligned_sequences,
346+
unaligned_nucleotide_sequences,
347+
aligned_nucleotide_sequences,
348+
dataset_dir=dataset_dir,
349+
errors=errors,
350+
config=config,
265351
)
266352
valid_segments = set()
267353
duplicate_segments = set()
354+
if not config.multi_segment:
355+
aligned_nucleotide_sequences["main"] = None
356+
if len(input_unaligned_sequences) > 1:
357+
errors.append(
358+
ProcessingAnnotation(
359+
unprocessedFields=[
360+
AnnotationSource(
361+
name="alignment",
362+
type=AnnotationSourceType.NUCLEOTIDE_SEQUENCE,
363+
),
364+
],
365+
processedFields=[
366+
AnnotationSource(
367+
name="alignment",
368+
type=AnnotationSourceType.NUCLEOTIDE_SEQUENCE,
369+
),
370+
],
371+
message=(
372+
f"Multiple sequences: {list(input_unaligned_sequences.keys())} found in the"
373+
f" input data, but organism: {config.organism} is single-segmented. "
374+
"Please check that your metadata and sequences are annotated correctly."
375+
"Each metadata entry should have a single corresponding fasta sequence "
376+
"entry with the same submissionId."
377+
),
378+
)
379+
)
380+
else:
381+
_, value = next(iter(input_unaligned_sequences.items()))
382+
unaligned_nucleotide_sequences["main"] = value
383+
return (
384+
unaligned_nucleotide_sequences,
385+
aligned_nucleotide_sequences,
386+
errors,
387+
)
268388
for sequence_and_dataset in config.nucleotideSequences:
269389
segment = sequence_and_dataset.name
270390
unaligned_segment = [
@@ -342,8 +462,6 @@ def enrich_with_nextclade( # noqa: C901, PLR0914, PLR0915
342462
aligned_nucleotide_sequences[id] = {}
343463
alerts.warnings[id] = []
344464
alerts.errors[id] = []
345-
for gene in config.genes:
346-
aligned_aminoacid_sequences[id][gene] = None
347465
(
348466
unaligned_nucleotide_sequences[id],
349467
aligned_nucleotide_sequences[id],
@@ -354,6 +472,7 @@ def enrich_with_nextclade( # noqa: C901, PLR0914, PLR0915
354472
errors=alerts.errors[id],
355473
aligned_nucleotide_sequences=aligned_nucleotide_sequences[id],
356474
config=config,
475+
dataset_dir=dataset_dir,
357476
)
358477

359478
nextclade_metadata: defaultdict[
@@ -776,7 +895,8 @@ def process_single( # noqa: C901
776895
)
777896

778897
aligned_segments = set()
779-
for segment in config.nucleotideSequences:
898+
for sequence_and_dataset in config.nucleotideSequences:
899+
segment = sequence_and_dataset.name
780900
if unprocessed.alignedNucleotideSequences.get(segment, None):
781901
aligned_segments.add(segment)
782902

@@ -792,7 +912,8 @@ def process_single( # noqa: C901
792912

793913
if config.create_embl_file and unprocessed.nextcladeMetadata is not None:
794914
annotations = {}
795-
for segment in config.nucleotideSequences:
915+
for sequence_and_dataset in config.nucleotideSequences:
916+
segment = sequence_and_dataset.name
796917
if segment in unprocessed.nextcladeMetadata:
797918
annotations[segment] = None
798919
if unprocessed.nextcladeMetadata[segment]:

0 commit comments

Comments
 (0)