@@ -134,7 +134,7 @@ def run_sort(
134134 config : Config ,
135135 nextclade_dataset_server : str ,
136136 dataset_dir : str ,
137- ) -> None :
137+ ) -> pd . DataFrame :
138138 """
139139 Run nextclade
140140 - use config.minimizer_url or default minimizer from nextclade server
@@ -169,8 +169,19 @@ def run_sort(
169169 if exit_code != 0 :
170170 msg = f"nextclade sort failed with exit code { exit_code } "
171171 raise Exception (msg )
172+ return pd .read_csv (
173+ result_file ,
174+ sep = "\t " ,
175+ dtype = {
176+ "index" : "Int64" ,
177+ "score" : "float64" ,
178+ "seqName" : "string" ,
179+ "dataset" : "string" ,
180+ },
181+ )
172182
173183
184+ # TODO: running this for each sequence is inefficient, should be run once per batch
174185def check_nextclade_sort_matches ( # noqa: PLR0913, PLR0917
175186 result_file_dir : str ,
176187 input_file : str ,
@@ -195,25 +206,14 @@ def check_nextclade_sort_matches( # noqa: PLR0913, PLR0917
195206 accepted_dataset_names = sequence_and_dataset .accepted_sort_matches or [nextclade_dataset_name ] # type: ignore
196207
197208 result_file = result_file_dir + "/sort_output.tsv"
198- run_sort (
209+ df = run_sort (
199210 result_file ,
200211 input_file ,
201212 config ,
202213 nextclade_dataset_server ,
203214 dataset_dir ,
204215 )
205216
206- df = pd .read_csv (
207- result_file ,
208- sep = "\t " ,
209- dtype = {
210- "index" : "Int64" ,
211- "score" : "float64" ,
212- "seqName" : "string" ,
213- "dataset" : "string" ,
214- },
215- )
216-
217217 hits = df .dropna (subset = ["score" ]).sort_values ("score" , ascending = False )
218218 best_hits = hits .groupby ("seqName" , as_index = False ).first ()
219219
@@ -250,21 +250,141 @@ def check_nextclade_sort_matches( # noqa: PLR0913, PLR0917
250250 return alerts
251251
252252
253- def assign_segment (
253+ def classify_with_nextclade_sort (
254+ input_unaligned_sequences : dict [str , NucleotideSequence | None ],
255+ unaligned_nucleotide_sequences : dict [SegmentName , NucleotideSequence | None ],
256+ aligned_nucleotide_sequences : dict [SegmentName , NucleotideSequence | None ],
257+ errors : list [ProcessingAnnotation ],
258+ config : Config ,
259+ dataset_dir : str ,
260+ ):
261+ """
262+ Run nextclade sort
263+ - assert highest score is in sequence_and_dataset.accepted_sort_matches
264+ (default is nextclade_dataset_name)
265+ """
266+ nextclade_dataset_server = config .nextclade_dataset_server
267+
268+ with TemporaryDirectory (delete = not config .keep_tmp_dir ) as result_dir :
269+ input_file = result_dir + "/input.fasta"
270+ os .makedirs (os .path .dirname (input_file ), exist_ok = True )
271+ with open (input_file , "w" , encoding = "utf-8" ) as f :
272+ for id , seq in input_unaligned_sequences .items ():
273+ f .write (f">{ id } \n " )
274+ f .write (f"{ seq } \n " )
275+
276+ result_file = result_dir + "/sort_output.tsv"
277+ df = run_sort (
278+ result_file ,
279+ input_file ,
280+ config ,
281+ nextclade_dataset_server ,
282+ dataset_dir ,
283+ )
284+
285+ no_hits = df [df ["score" ].isna ()]
286+ hits = df .dropna (subset = ["score" ]).sort_values ("score" , ascending = False )
287+ for seq_name in no_hits ["seqName" ].unique ():
288+ if seq_name not in hits ["seqName" ].unique ():
289+ msg = (
290+ f"Sequence { seq_name } does not appear to match any reference for organism: "
291+ f"{ config .organism } per `nextclade sort`. "
292+ f"Double check you are submitting to the correct organism."
293+ )
294+ # TODO: only error when config.alignment_requirement == "ALL", otherwise warn
295+ errors .append (
296+ sequence_annotation (
297+ name = "alignment" ,
298+ message = msg ,
299+ )
300+ )
301+
302+ best_hits = hits .groupby ("seqName" , as_index = False ).first ()
303+ logger .info (f"Found hits: { best_hits ['seqName' ].tolist ()} " )
304+
305+ for _ , row in best_hits .iterrows ():
306+ not_found = True
307+ for segment in config .nucleotideSequences :
308+ accepted_dataset_names = segment .accepted_sort_matches or [
309+ segment .nextclade_dataset_name
310+ ]
311+ if row ["dataset" ] in accepted_dataset_names :
312+ unaligned_nucleotide_sequences [segment .name ] = input_unaligned_sequences [
313+ row ["seqName" ]
314+ ]
315+ aligned_nucleotide_sequences [segment .name ] = None
316+ not_found = False
317+ break
318+ if not_found :
319+ msg = (
320+ f"Sequence { row ['seqName' ]} best matches { row ['dataset' ]} , "
321+ "which is currently not an accepted option for organism: "
322+ f"{ config .organism } . It is therefore not possible to release. "
323+ "Contact the administrator if you think this message is an error."
324+ )
325+ errors .append (
326+ sequence_annotation (
327+ name = "alignment" ,
328+ message = msg ,
329+ )
330+ )
331+
332+ return (unaligned_nucleotide_sequences , aligned_nucleotide_sequences , errors )
333+
334+
335+ def assign_segment ( # noqa: PLR0913, PLR0917
254336 input_unaligned_sequences : dict [str , NucleotideSequence | None ],
255337 unaligned_nucleotide_sequences : dict [SegmentName , NucleotideSequence | None ],
256338 errors : list [ProcessingAnnotation ],
257339 aligned_nucleotide_sequences : dict [SegmentName , NucleotideSequence | None ],
258340 config : Config ,
341+ dataset_dir : str ,
259342):
260343 if config .classify_with_nextclade_sort :
261- # TODO: add this functionality
262- raise NotImplementedError (
263- "Classify with nextclade sort is not implemented yet. "
264- "Please set classify_with_nextclade_sort to False in the config."
344+ return classify_with_nextclade_sort (
345+ input_unaligned_sequences ,
346+ unaligned_nucleotide_sequences ,
347+ aligned_nucleotide_sequences ,
348+ dataset_dir = dataset_dir ,
349+ errors = errors ,
350+ config = config ,
265351 )
266352 valid_segments = set ()
267353 duplicate_segments = set ()
354+ if not config .multi_segment :
355+ aligned_nucleotide_sequences ["main" ] = None
356+ if len (input_unaligned_sequences ) > 1 :
357+ errors .append (
358+ ProcessingAnnotation (
359+ unprocessedFields = [
360+ AnnotationSource (
361+ name = "alignment" ,
362+ type = AnnotationSourceType .NUCLEOTIDE_SEQUENCE ,
363+ ),
364+ ],
365+ processedFields = [
366+ AnnotationSource (
367+ name = "alignment" ,
368+ type = AnnotationSourceType .NUCLEOTIDE_SEQUENCE ,
369+ ),
370+ ],
371+ message = (
372+ f"Multiple sequences: { list (input_unaligned_sequences .keys ())} found in the"
373+ f" input data, but organism: { config .organism } is single-segmented. "
374+ "Please check that your metadata and sequences are annotated correctly."
375+ "Each metadata entry should have a single corresponding fasta sequence "
376+ "entry with the same submissionId."
377+ ),
378+ )
379+ )
380+ else :
381+ _ , value = next (iter (input_unaligned_sequences .items ()))
382+ unaligned_nucleotide_sequences ["main" ] = value
383+ return (
384+ unaligned_nucleotide_sequences ,
385+ aligned_nucleotide_sequences ,
386+ errors ,
387+ )
268388 for sequence_and_dataset in config .nucleotideSequences :
269389 segment = sequence_and_dataset .name
270390 unaligned_segment = [
@@ -342,8 +462,6 @@ def enrich_with_nextclade( # noqa: C901, PLR0914, PLR0915
342462 aligned_nucleotide_sequences [id ] = {}
343463 alerts .warnings [id ] = []
344464 alerts .errors [id ] = []
345- for gene in config .genes :
346- aligned_aminoacid_sequences [id ][gene ] = None
347465 (
348466 unaligned_nucleotide_sequences [id ],
349467 aligned_nucleotide_sequences [id ],
@@ -354,6 +472,7 @@ def enrich_with_nextclade( # noqa: C901, PLR0914, PLR0915
354472 errors = alerts .errors [id ],
355473 aligned_nucleotide_sequences = aligned_nucleotide_sequences [id ],
356474 config = config ,
475+ dataset_dir = dataset_dir ,
357476 )
358477
359478 nextclade_metadata : defaultdict [
@@ -776,7 +895,8 @@ def process_single( # noqa: C901
776895 )
777896
778897 aligned_segments = set ()
779- for segment in config .nucleotideSequences :
898+ for sequence_and_dataset in config .nucleotideSequences :
899+ segment = sequence_and_dataset .name
780900 if unprocessed .alignedNucleotideSequences .get (segment , None ):
781901 aligned_segments .add (segment )
782902
@@ -792,7 +912,8 @@ def process_single( # noqa: C901
792912
793913 if config .create_embl_file and unprocessed .nextcladeMetadata is not None :
794914 annotations = {}
795- for segment in config .nucleotideSequences :
915+ for sequence_and_dataset in config .nucleotideSequences :
916+ segment = sequence_and_dataset .name
796917 if segment in unprocessed .nextcladeMetadata :
797918 annotations [segment ] = None
798919 if unprocessed .nextcladeMetadata [segment ]:
0 commit comments