diff --git a/backend/src/main/kotlin/org/loculus/backend/api/SubmissionTypes.kt b/backend/src/main/kotlin/org/loculus/backend/api/SubmissionTypes.kt index b3cc15b2ae..8af325417a 100644 --- a/backend/src/main/kotlin/org/loculus/backend/api/SubmissionTypes.kt +++ b/backend/src/main/kotlin/org/loculus/backend/api/SubmissionTypes.kt @@ -171,6 +171,11 @@ data class ProcessedData( - example = """{"raw_reads": [{"fileId": "s0m3-uUiDd", "name": "data.fastaq"}], "sequencing_logs": []}""", - description = "The key is the file category name, the value is a list of files, with ID and name.", + example = """{"segment1": "fastaHeader1", "segment2": "fastaHeader2"}""", + description = "The key is the segment name, the value is the fastaHeader of the original Data", ) + val sequenceNameToFastaHeaderMap: Map = emptyMap(), + @Schema( + example = """{"raw_reads": [{"fileId": "s0m3-uUiDd", "name": "data.fastaq"}], "sequencing_logs": []}""", + description = "The key is the file category name, the value is a list of files, with ID and name.", + ) val files: FileCategoryFilesMap?, ) diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/CompressionService.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/CompressionService.kt index 3a82303e67..9e612e4927 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/CompressionService.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/CompressionService.kt @@ -118,6 +118,7 @@ class CompressionService( } }, processedData.aminoAcidInsertions, + processedData.sequenceNameToFastaHeaderMap, processedData.files, ) @@ -144,6 +145,7 @@ class CompressionService( } }, processedData.aminoAcidInsertions, + processedData.sequenceNameToFastaHeaderMap, processedData.files, ) diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/EmptyProcessedDataProvider.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/EmptyProcessedDataProvider.kt index 959093fe80..f715554f08 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/EmptyProcessedDataProvider.kt +++
b/backend/src/main/kotlin/org/loculus/backend/service/submission/EmptyProcessedDataProvider.kt @@ -20,6 +20,7 @@ class EmptyProcessedDataProvider(private val backendConfig: BackendConfig) { alignedAminoAcidSequences = referenceGenome.genes.map { it.name }.associateWith { null }, nucleotideInsertions = referenceGenome.nucleotideSequences.map { it.name }.associateWith { emptyList() }, aminoAcidInsertions = referenceGenome.genes.map { it.name }.associateWith { emptyList() }, + sequenceNameToFastaHeaderMap = referenceGenome.nucleotideSequences.map { it.name }.associateWith { "" }, files = null, ) } diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/ProcessedSequenceEntryValidator.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/ProcessedSequenceEntryValidator.kt index 61a69e10e5..4024d5f16d 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/ProcessedSequenceEntryValidator.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/ProcessedSequenceEntryValidator.kt @@ -232,6 +232,11 @@ class ProcessedSequenceEntryValidator(private val schema: Schema, private val re "alignedNucleotideSequences", ) + validateNoUnknownSegment( + processedData.sequenceNameToFastaHeaderMap, + "sequenceNameToFastaHeaderMap", + ) + validateNoUnknownSegment( processedData.unalignedNucleotideSequences, "unalignedNucleotideSequences", diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/SubmissionDatabaseService.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/SubmissionDatabaseService.kt index d4c6a464c4..17998aa1fc 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/SubmissionDatabaseService.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/SubmissionDatabaseService.kt @@ -460,6 +460,7 @@ class SubmissionDatabaseService( aminoAcidInsertions = processedData.aminoAcidInsertions.mapValues { (_, it) -> it.map { insertion -> 
insertion.copy(sequence = insertion.sequence.uppercase(Locale.US)) } }, + sequenceNameToFastaHeaderMap = processedData.sequenceNameToFastaHeaderMap, ) private fun validateExternalMetadata( diff --git a/backend/src/test/kotlin/org/loculus/backend/controller/submission/PreparedProcessedData.kt b/backend/src/test/kotlin/org/loculus/backend/controller/submission/PreparedProcessedData.kt index 9a874a5ab5..d5523ac3af 100644 --- a/backend/src/test/kotlin/org/loculus/backend/controller/submission/PreparedProcessedData.kt +++ b/backend/src/test/kotlin/org/loculus/backend/controller/submission/PreparedProcessedData.kt @@ -62,6 +62,7 @@ val defaultProcessedData = ProcessedData( Insertion(123, "RN"), ), ), + sequenceNameToFastaHeaderMap = mapOf(MAIN_SEGMENT to "header"), files = null, ) @@ -101,6 +102,7 @@ val defaultProcessedDataMultiSegmented = ProcessedData( Insertion(123, "RN"), ), ), + sequenceNameToFastaHeaderMap = mapOf("notOnlySegment" to "header1", "secondSegment" to "header2"), files = null, ) @@ -117,6 +119,7 @@ val defaultProcessedDataWithoutSequences = ProcessedData( nucleotideInsertions = emptyMap(), alignedAminoAcidSequences = emptyMap(), aminoAcidInsertions = emptyMap(), + sequenceNameToFastaHeaderMap = emptyMap(), files = null, ) diff --git a/backend/src/test/kotlin/org/loculus/backend/service/ProcessedMetadataPostprocessorTest.kt b/backend/src/test/kotlin/org/loculus/backend/service/ProcessedMetadataPostprocessorTest.kt index 2a5d304275..6ecbb08e79 100644 --- a/backend/src/test/kotlin/org/loculus/backend/service/ProcessedMetadataPostprocessorTest.kt +++ b/backend/src/test/kotlin/org/loculus/backend/service/ProcessedMetadataPostprocessorTest.kt @@ -43,6 +43,7 @@ class ProcessedMetadataPostprocessorTest( nucleotideInsertions = emptyMap(), alignedAminoAcidSequences = emptyMap(), aminoAcidInsertions = emptyMap(), + sequenceNameToFastaHeaderMap = emptyMap(), files = null, ) diff --git 
a/backend/src/test/kotlin/org/loculus/backend/service/ProcessedSequencesPostprocessorTest.kt b/backend/src/test/kotlin/org/loculus/backend/service/ProcessedSequencesPostprocessorTest.kt index fcf7873912..51e6db7353 100644 --- a/backend/src/test/kotlin/org/loculus/backend/service/ProcessedSequencesPostprocessorTest.kt +++ b/backend/src/test/kotlin/org/loculus/backend/service/ProcessedSequencesPostprocessorTest.kt @@ -91,6 +91,10 @@ class ProcessedSequencesPostprocessorTest( unconfiguredPresentGene to listOf(Insertion(13, "TT")), unconfiguredNullGene to emptyList(), ), + sequenceNameToFastaHeaderMap = mapOf( + "configuredPresentSeg" to "header1", + "unconfiguredPresentSeg" to "header2", + ), files = null, ) diff --git a/backend/src/test/kotlin/org/loculus/backend/utils/EarliestReleaseDateFinderTest.kt b/backend/src/test/kotlin/org/loculus/backend/utils/EarliestReleaseDateFinderTest.kt index 03d4b781da..43e89af3a4 100644 --- a/backend/src/test/kotlin/org/loculus/backend/utils/EarliestReleaseDateFinderTest.kt +++ b/backend/src/test/kotlin/org/loculus/backend/utils/EarliestReleaseDateFinderTest.kt @@ -62,6 +62,7 @@ fun row( nucleotideInsertions = emptyMap(), alignedAminoAcidSequences = emptyMap(), aminoAcidInsertions = emptyMap(), + sequenceNameToFastaHeaderMap = emptyMap(), files = null, ), isRevocation = false, diff --git a/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py b/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py index 5159a20613..4ede4970a4 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py @@ -83,7 +83,7 @@ class UnprocessedEntry: @dataclass class SegmentAssignment: unalignedNucleotideSequences: dict[SegmentName, NucleotideSequence | None] # noqa: N815 - segmentNameToFastaHeaders: dict[SegmentName, str] # noqa: N815 + sequenceNameToFastaHeaderMap: dict[SegmentName, str] # noqa: N815 errors: list[ProcessingAnnotation] warnings: 
list[ProcessingAnnotation] @@ -107,6 +107,7 @@ class UnprocessedAfterNextclade: nucleotideInsertions: dict[SegmentName, list[NucleotideInsertion]] # noqa: N815 alignedAminoAcidSequences: dict[GeneName, AminoAcidSequence | None] # noqa: N815 aminoAcidInsertions: dict[GeneName, list[AminoAcidInsertion]] # noqa: N815 + sequenceNameToFastaHeaderMap: dict[SegmentName, str] # noqa: N815 errors: list[ProcessingAnnotation] warnings: list[ProcessingAnnotation] @@ -125,6 +126,7 @@ class ProcessedData: nucleotideInsertions: dict[SegmentName, Any] # noqa: N815 alignedAminoAcidSequences: dict[GeneName, Any] # noqa: N815 aminoAcidInsertions: dict[GeneName, Any] # noqa: N815 + sequenceNameToFastaHeaderMap: dict[SegmentName, str] # noqa: N815 files: dict[str, list[FileIdAndName]] | None = None diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py index a8f7d872ee..31afb147fa 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py @@ -365,7 +365,7 @@ def assign_segment_with_nextclade_sort( message=msg, ) ) - segmentNameToFastaHeaders: dict[SegmentName, str] = {} + sequenceNameToFastaHeaderMap: dict[SegmentName, str] = {} for segment_name, headers in sort_results_map.items(): if len(headers) > 1: msg = ( @@ -381,7 +381,7 @@ def assign_segment_with_nextclade_sort( ) ) continue - segmentNameToFastaHeaders[segment_name] = headers[0] + sequenceNameToFastaHeaderMap[segment_name] = headers[0] unaligned_nucleotide_sequences[segment_name] = input_unaligned_sequences[headers[0]] if ( @@ -399,7 +399,7 @@ def assign_segment_with_nextclade_sort( return SegmentAssignment( unalignedNucleotideSequences=unaligned_nucleotide_sequences, - segmentNameToFastaHeaders=segmentNameToFastaHeaders, + sequenceNameToFastaHeaderMap=sequenceNameToFastaHeaderMap, errors=errors, warnings=warnings, ) @@ -412,7 +412,7 @@ def assign_single_segment( errors: 
list[ProcessingAnnotation] = [] warnings: list[ProcessingAnnotation] = [] unaligned_nucleotide_sequences: dict[SegmentName, NucleotideSequence | None] = {} - fastaHeader = "" + sequenceNameToFastaHeaderMap: dict[SegmentName, str] = {} if len(input_unaligned_sequences) > 1: errors.append( ProcessingAnnotation.from_single( @@ -429,28 +429,29 @@ def assign_single_segment( ) else: fastaHeader, value = next(iter(input_unaligned_sequences.items())) + sequenceNameToFastaHeaderMap["main"] = fastaHeader unaligned_nucleotide_sequences["main"] = value return SegmentAssignment( unalignedNucleotideSequences=unaligned_nucleotide_sequences, - segmentNameToFastaHeaders={"main": fastaHeader}, + sequenceNameToFastaHeaderMap=sequenceNameToFastaHeaderMap, errors=errors, warnings=warnings, ) -def assign_segment_with_header( +def assign_segment_using_header( input_unaligned_sequences: dict[str, NucleotideSequence | None], config: Config, ) -> SegmentAssignment: errors: list[ProcessingAnnotation] = [] warnings: list[ProcessingAnnotation] = [] unaligned_nucleotide_sequences: dict[SegmentName, NucleotideSequence | None] = {} - segmentNameToFastaHeaders: dict[SegmentName, str] = {} + sequenceNameToFastaHeaderMap: dict[SegmentName, str] = {} duplicate_segments = set() if not config.nucleotideSequences: return SegmentAssignment( unalignedNucleotideSequences={}, - segmentNameToFastaHeaders={}, + sequenceNameToFastaHeaderMap={}, errors=errors, warnings=warnings, ) @@ -480,13 +481,13 @@ def assign_segment_with_header( ) ) elif len(unaligned_segment) == 1: - segmentNameToFastaHeaders[segment] = unaligned_segment[0] + sequenceNameToFastaHeaderMap[segment] = unaligned_segment[0] unaligned_nucleotide_sequences[segment] = input_unaligned_sequences[ unaligned_segment[0] ] remaining_segments = ( set(input_unaligned_sequences.keys()) - - set(segmentNameToFastaHeaders.values()) + - set(sequenceNameToFastaHeaderMap.values()) - duplicate_segments ) if len(remaining_segments) > 0: @@ -513,7 +514,7 @@ def 
assign_segment_with_header( ) return SegmentAssignment( unalignedNucleotideSequences=unaligned_nucleotide_sequences, - segmentNameToFastaHeaders=segmentNameToFastaHeaders, + sequenceNameToFastaHeaderMap=sequenceNameToFastaHeaderMap, errors=errors, warnings=warnings, ) @@ -533,11 +534,13 @@ def enrich_with_nextclade( # noqa: C901, PLR0914, PLR0915 nucleotideInsertions: dict[SegmentName, list[NucleotideInsertion]] alignedAminoAcidSequences: dict[GeneName, AminoAcidSequence | None] aminoAcidInsertions: dict[GeneName, list[AminoAcidInsertion]] + sequenceNameToFastaHeaderMap: dict[SegmentName, str] )` object. """ unaligned_nucleotide_sequences: dict[ AccessionVersion, dict[SegmentName, NucleotideSequence | None] ] = {} + segment_assignment_map: dict[AccessionVersion, dict[SegmentName, str]] = {} alerts: Alerts = Alerts() input_metadata: dict[AccessionVersion, dict[str, Any]] = {} aligned_aminoacid_sequences: dict[ @@ -554,6 +557,7 @@ def enrich_with_nextclade( # noqa: C901, PLR0914, PLR0915 input_metadata[id]["group_id"] = entry.data.group_id aligned_aminoacid_sequences[id] = {} aligned_nucleotide_sequences[id] = {} + segment_assignment_map[id] = {} if not config.multi_segment: segment_assignment = assign_single_segment( input_unaligned_sequences=entry.data.unalignedNucleotideSequences, @@ -568,6 +572,7 @@ def enrich_with_nextclade( # noqa: C901, PLR0914, PLR0915 unaligned_nucleotide_sequences[id] = segment_assignment.unalignedNucleotideSequences alerts.errors[id] = segment_assignment.errors alerts.warnings[id] = segment_assignment.warnings + segment_assignment_map[id] = segment_assignment.sequenceNameToFastaHeaderMap nextclade_metadata: defaultdict[ AccessionVersion, defaultdict[SegmentName, dict[str, Any] | None] @@ -672,6 +677,7 @@ def enrich_with_nextclade( # noqa: C901, PLR0914, PLR0915 nucleotideInsertions=nucleotide_insertions[id], alignedAminoAcidSequences=aligned_aminoacid_sequences[id], aminoAcidInsertions=amino_acid_insertions[id], + 
sequenceNameToFastaHeaderMap=segment_assignment_map[id], errors=alerts.errors[id], warnings=alerts.warnings[id], ) @@ -884,6 +890,7 @@ def processed_entry_no_alignment( output_metadata: ProcessedMetadata, errors: list[ProcessingAnnotation], warnings: list[ProcessingAnnotation], + sequenceNameToFastaHeaderMap: dict[SegmentName, str], ) -> SubmissionData: """Process a single sequence without alignment""" @@ -903,6 +910,7 @@ def processed_entry_no_alignment( nucleotideInsertions=nucleotide_insertions, alignedAminoAcidSequences=aligned_aminoacid_sequences, aminoAcidInsertions=amino_acid_insertions, + sequenceNameToFastaHeaderMap=sequenceNameToFastaHeaderMap, ), errors=errors, warnings=warnings, @@ -932,7 +940,7 @@ def process_single( # noqa: C901 else: submitter = unprocessed.submitter group_id = unprocessed.group_id - segment_assignment = assign_segment_with_header( + segment_assignment = assign_segment_using_header( input_unaligned_sequences=unprocessed.unalignedNucleotideSequences, config=config, ) @@ -987,7 +995,14 @@ def process_single( # noqa: C901 logger.debug(f"Processed {id}: {output_metadata}") if isinstance(unprocessed, UnprocessedData): - return processed_entry_no_alignment(id, unprocessed, output_metadata, errors, warnings) + return processed_entry_no_alignment( + id, + unprocessed, + output_metadata, + errors, + warnings, + segment_assignment.sequenceNameToFastaHeaderMap, + ) aligned_segments = set() for sequence_and_dataset in config.nucleotideSequences: @@ -1031,6 +1046,7 @@ def process_single( # noqa: C901 nucleotideInsertions=unprocessed.nucleotideInsertions, alignedAminoAcidSequences=unprocessed.alignedAminoAcidSequences, aminoAcidInsertions=unprocessed.aminoAcidInsertions, + sequenceNameToFastaHeaderMap=unprocessed.sequenceNameToFastaHeaderMap, ), errors=list(set(errors)), warnings=list(set(warnings)), @@ -1056,6 +1072,7 @@ def processed_entry_with_errors(id) -> SubmissionData: nucleotideInsertions=defaultdict(dict[str, Any]), 
alignedAminoAcidSequences=defaultdict(dict[str, Any]), aminoAcidInsertions=defaultdict(dict[str, Any]), + sequenceNameToFastaHeaderMap=defaultdict(str), ), errors=[ ProcessingAnnotation.from_single( diff --git a/preprocessing/nextclade/tests/factory_methods.py b/preprocessing/nextclade/tests/factory_methods.py index 23411c99e9..c15629910d 100644 --- a/preprocessing/nextclade/tests/factory_methods.py +++ b/preprocessing/nextclade/tests/factory_methods.py @@ -54,6 +54,9 @@ class ProcessedAlignment: nucleotideInsertions: dict[str, list[str]] = field(default_factory=dict) # noqa: N815 alignedAminoAcidSequences: dict[str, str | None] = field(default_factory=dict) # noqa: N815 aminoAcidInsertions: dict[str, list[str]] = field(default_factory=dict) # noqa: N815 + sequenceNameToFastaHeaderMap: dict[str, str] = field( # noqa: N815 + default_factory=dict + ) @dataclass @@ -140,6 +143,7 @@ def create_processed_entry( nucleotideInsertions=processed_alignment.nucleotideInsertions, alignedAminoAcidSequences=processed_alignment.alignedAminoAcidSequences, aminoAcidInsertions=processed_alignment.aminoAcidInsertions, + sequenceNameToFastaHeaderMap=processed_alignment.sequenceNameToFastaHeaderMap, ), errors=errors, warnings=warnings, @@ -242,3 +246,7 @@ def verify_processed_entry( f"{test_name}: amino acid insertions '{actual.aminoAcidInsertions}' do not " f"match expectation '{expected.aminoAcidInsertions}'." ) + assert actual.sequenceNameToFastaHeaderMap == expected.sequenceNameToFastaHeaderMap, ( + f"{test_name}: sequence name to fasta header map '{actual.sequenceNameToFastaHeaderMap}' do not " + f"match expectation '{expected.sequenceNameToFastaHeaderMap}'." 
+ ) diff --git a/preprocessing/nextclade/tests/test_nextclade_preprocessing.py b/preprocessing/nextclade/tests/test_nextclade_preprocessing.py index f973cfd5ab..bbb4bfc5f8 100644 --- a/preprocessing/nextclade/tests/test_nextclade_preprocessing.py +++ b/preprocessing/nextclade/tests/test_nextclade_preprocessing.py @@ -115,7 +115,7 @@ def invalid_sequence() -> str: Case( name="with mutation", input_metadata={}, - input_sequence={"main": sequence_with_mutation("single")}, + input_sequence={"fastaHeader": sequence_with_mutation("single")}, accession_id="1", expected_metadata={ "completeness": 1.0, @@ -135,12 +135,13 @@ def invalid_sequence() -> str: "VP35EbolaSudan": ebola_sudan_aa(sequence_with_mutation("single"), "VP35"), }, aminoAcidInsertions={}, + sequenceNameToFastaHeaderMap={"main": "fastaHeader"}, ), ), Case( name="with insertion", input_metadata={}, - input_sequence={"main": sequence_with_insertion("single")}, + input_sequence={"fastaHeader": sequence_with_insertion("single")}, accession_id="1", expected_metadata={ "completeness": 1.0, @@ -160,12 +161,13 @@ def invalid_sequence() -> str: "VP35EbolaSudan": ebola_sudan_aa(consensus_sequence("single"), "VP35"), }, aminoAcidInsertions={"NPEbolaSudan": ["738:D"]}, + sequenceNameToFastaHeaderMap={"main": "fastaHeader"}, ), ), Case( name="with deletion", input_metadata={}, - input_sequence={"main": sequence_with_deletion("single")}, + input_sequence={"fastaHeader": sequence_with_deletion("single")}, accession_id="1", expected_metadata={ "completeness": 1.0, @@ -189,12 +191,13 @@ def invalid_sequence() -> str: ), }, aminoAcidInsertions={}, + sequenceNameToFastaHeaderMap={"main": "fastaHeader"}, ), ), Case( name="with failed alignment", input_metadata={}, - input_sequence={"main": invalid_sequence()}, + input_sequence={"fastaHeader": invalid_sequence()}, accession_id="1", expected_metadata={ "completeness": None, @@ -218,6 +221,7 @@ def invalid_sequence() -> str: nucleotideInsertions={}, alignedAminoAcidSequences={}, 
aminoAcidInsertions={}, + sequenceNameToFastaHeaderMap={"main": "fastaHeader"}, ), ), ] @@ -227,8 +231,8 @@ def invalid_sequence() -> str: name="with mutation", input_metadata={}, input_sequence={ - "ebola-sudan": sequence_with_mutation("ebola-sudan"), - "ebola-zaire": sequence_with_mutation("ebola-zaire"), + "fastaHeader1": sequence_with_mutation("ebola-sudan"), + "fastaHeader2": sequence_with_mutation("ebola-zaire"), }, accession_id="1", expected_metadata={ @@ -260,14 +264,18 @@ def invalid_sequence() -> str: "LEbolaZaire": ebola_zaire_aa(sequence_with_mutation("ebola-zaire"), "L"), }, aminoAcidInsertions={}, + sequenceNameToFastaHeaderMap={ + "ebola-sudan": "fastaHeader1", + "ebola-zaire": "fastaHeader2", + }, ), ), Case( name="with insertion", input_metadata={}, input_sequence={ - "ebola-sudan": sequence_with_insertion("ebola-sudan"), - "ebola-zaire": sequence_with_insertion("ebola-zaire"), + "fastaHeader1": sequence_with_insertion("ebola-sudan"), + "fastaHeader2": sequence_with_insertion("ebola-zaire"), }, accession_id="1", expected_metadata={ @@ -299,14 +307,18 @@ def invalid_sequence() -> str: "LEbolaZaire": ebola_zaire_aa(consensus_sequence("ebola-zaire"), "L"), }, aminoAcidInsertions={"NPEbolaSudan": ["738:D"], "VP24EbolaZaire": ["251:D"]}, + sequenceNameToFastaHeaderMap={ + "ebola-sudan": "fastaHeader1", + "ebola-zaire": "fastaHeader2", + }, ), ), Case( name="with deletion", input_metadata={}, input_sequence={ - "ebola-sudan": sequence_with_deletion("ebola-sudan"), - "ebola-zaire": sequence_with_deletion("ebola-zaire"), + "fastaHeader1": sequence_with_deletion("ebola-sudan"), + "fastaHeader2": sequence_with_deletion("ebola-zaire"), }, accession_id="1", expected_metadata={ @@ -348,13 +360,17 @@ def invalid_sequence() -> str: ), }, aminoAcidInsertions={}, + sequenceNameToFastaHeaderMap={ + "ebola-sudan": "fastaHeader1", + "ebola-zaire": "fastaHeader2", + }, ), ), Case( name="with one succeeded and one not uploaded", input_metadata={}, input_sequence={ - 
"ebola-zaire": sequence_with_mutation("ebola-zaire"), + "fastaHeader2": sequence_with_mutation("ebola-zaire"), }, accession_id="1", expected_metadata={ @@ -382,6 +398,7 @@ def invalid_sequence() -> str: "LEbolaZaire": ebola_zaire_aa(sequence_with_mutation("ebola-zaire"), "L"), }, aminoAcidInsertions={}, + sequenceNameToFastaHeaderMap={"ebola-zaire": "fastaHeader2"}, ), ), ] @@ -390,7 +407,7 @@ def invalid_sequence() -> str: Case( name="with one failed alignment, one not uploaded", input_metadata={}, - input_sequence={"ebola-sudan": invalid_sequence()}, + input_sequence={"fastaHeader1": invalid_sequence()}, accession_id="1", expected_metadata={ "totalInsertedNucs_ebola-sudan": None, @@ -406,7 +423,7 @@ def invalid_sequence() -> str: ProcessingAnnotationHelper( ["alignment"], ["alignment"], - "Sequence with fasta header ebola-sudan does not appear to match any reference for organism: multi-ebola-test per `nextclade sort`. Double check you are submitting to the correct organism.", + "Sequence with fasta header fastaHeader1 does not appear to match any reference for organism: multi-ebola-test per `nextclade sort`. Double check you are submitting to the correct organism.", AnnotationSourceType.NUCLEOTIDE_SEQUENCE, ), ProcessingAnnotationHelper( @@ -423,14 +440,15 @@ def invalid_sequence() -> str: nucleotideInsertions={}, alignedAminoAcidSequences={}, aminoAcidInsertions={}, + sequenceNameToFastaHeaderMap={}, ), ), Case( name="with one failed alignment, one succeeded", input_metadata={}, input_sequence={ - "ebola-sudan": invalid_sequence(), - "ebola-zaire": sequence_with_mutation("ebola-zaire"), + "fastaHeader1": invalid_sequence(), + "fastaHeader2": sequence_with_mutation("ebola-zaire"), }, accession_id="1", expected_metadata={ @@ -447,7 +465,7 @@ def invalid_sequence() -> str: ProcessingAnnotationHelper( ["alignment"], ["alignment"], - "Sequence with fasta header ebola-sudan does not appear to match any reference for organism: multi-ebola-test per `nextclade sort`. 
Double check you are submitting to the correct organism.", + "Sequence with fasta header fastaHeader1 does not appear to match any reference for organism: multi-ebola-test per `nextclade sort`. Double check you are submitting to the correct organism.", AnnotationSourceType.NUCLEOTIDE_SEQUENCE, ), ], @@ -465,6 +483,7 @@ def invalid_sequence() -> str: "LEbolaZaire": ebola_zaire_aa(sequence_with_mutation("ebola-zaire"), "L"), }, aminoAcidInsertions={}, + sequenceNameToFastaHeaderMap={"ebola-zaire": "fastaHeader2"}, ), ), ] @@ -473,7 +492,7 @@ def invalid_sequence() -> str: Case( name="with one failed alignment, one not uploaded", input_metadata={}, - input_sequence={"ebola-sudan": invalid_sequence()}, + input_sequence={"fastaHeader1": invalid_sequence()}, accession_id="1", expected_metadata={ "totalInsertedNucs_ebola-sudan": None, @@ -497,7 +516,7 @@ def invalid_sequence() -> str: ProcessingAnnotationHelper( ["alignment"], ["alignment"], - "Sequence with fasta header ebola-sudan does not appear to match any reference for organism: multi-ebola-test per `nextclade sort`. Double check you are submitting to the correct organism.", + "Sequence with fasta header fastaHeader1 does not appear to match any reference for organism: multi-ebola-test per `nextclade sort`. 
Double check you are submitting to the correct organism.", AnnotationSourceType.NUCLEOTIDE_SEQUENCE, ) ], @@ -507,14 +526,15 @@ def invalid_sequence() -> str: nucleotideInsertions={}, alignedAminoAcidSequences={}, aminoAcidInsertions={}, + sequenceNameToFastaHeaderMap={}, ), ), Case( name="with one failed alignment, one succeeded", input_metadata={}, input_sequence={ - "ebola-sudan": invalid_sequence(), - "ebola-zaire": sequence_with_mutation("ebola-zaire"), + "fastaHeader1": invalid_sequence(), + "fastaHeader2": sequence_with_mutation("ebola-zaire"), }, accession_id="1", expected_metadata={ @@ -532,7 +552,7 @@ def invalid_sequence() -> str: ProcessingAnnotationHelper( ["alignment"], ["alignment"], - "Sequence with fasta header ebola-sudan does not appear to match any reference for organism: multi-ebola-test per `nextclade sort`. Double check you are submitting to the correct organism.", + "Sequence with fasta header fastaHeader1 does not appear to match any reference for organism: multi-ebola-test per `nextclade sort`. 
Double check you are submitting to the correct organism.", AnnotationSourceType.NUCLEOTIDE_SEQUENCE, ) ], @@ -549,42 +569,18 @@ def invalid_sequence() -> str: "LEbolaZaire": ebola_zaire_aa(sequence_with_mutation("ebola-zaire"), "L"), }, aminoAcidInsertions={}, + sequenceNameToFastaHeaderMap={"ebola-zaire": "fastaHeader2"}, ), ), ] segment_validation_tests_single_segment = [ - Case( - name="accept any fastaHeader for single segment", - input_metadata={}, - input_sequence={"randomKey": sequence_with_mutation("single")}, - accession_id="1", - expected_metadata={ - "completeness": 1.0, - "totalInsertedNucs": 0, - "totalSnps": 1, - "totalDeletedNucs": 0, - "length": len(consensus_sequence("single")), - }, - expected_errors=[], - expected_warnings=[], - expected_processed_alignment=ProcessedAlignment( - unalignedNucleotideSequences={"main": sequence_with_mutation("single")}, - alignedNucleotideSequences={"main": sequence_with_mutation("single")}, - nucleotideInsertions={}, - alignedAminoAcidSequences={ - "NPEbolaSudan": ebola_sudan_aa(sequence_with_mutation("single"), "NP"), - "VP35EbolaSudan": ebola_sudan_aa(sequence_with_mutation("single"), "VP35"), - }, - aminoAcidInsertions={}, - ), - ), Case( name="do not accept multiple segments for single segment", input_metadata={}, input_sequence={ - "main": sequence_with_mutation("single"), - "randomKey": sequence_with_mutation("single"), + "fastaHeader1": sequence_with_mutation("single"), + "fastaHeader2": sequence_with_mutation("single"), }, accession_id="2", expected_metadata={"length": 0}, @@ -592,7 +588,7 @@ def invalid_sequence() -> str: ProcessingAnnotationHelper( [ProcessingAnnotationAlignment], [ProcessingAnnotationAlignment], - "Multiple sequences: ['main', 'randomKey'] found in the" + "Multiple sequences: ['fastaHeader1', 'fastaHeader2'] found in the" " input data, but organism: ebola-sudan-test is single-segmented. " "Please check that your metadata and sequences are annotated correctly." 
"Each metadata entry should have a single corresponding fasta sequence " @@ -607,50 +603,12 @@ def invalid_sequence() -> str: nucleotideInsertions={}, alignedAminoAcidSequences={}, aminoAcidInsertions={}, + sequenceNameToFastaHeaderMap={}, ), ), ] segment_validation_tests_multi_segments = [ - Case( - name="accept any fastaHeader for multi-segment", - input_metadata={}, - input_sequence={ - "randomFastaHeader1": sequence_with_mutation("ebola-sudan"), - "otherFastaHeader2": sequence_with_mutation("ebola-zaire"), - }, - accession_id="1", - expected_metadata={ - "totalInsertedNucs_ebola-sudan": 0, - "totalSnps_ebola-sudan": 1, - "totalDeletedNucs_ebola-sudan": 0, - "length_ebola-sudan": len(consensus_sequence("ebola-sudan")), - "totalInsertedNucs_ebola-zaire": 0, - "totalSnps_ebola-zaire": 1, - "totalDeletedNucs_ebola-zaire": 0, - "length_ebola-zaire": len(consensus_sequence("ebola-zaire")), - }, - expected_errors=[], - expected_warnings=[], - expected_processed_alignment=ProcessedAlignment( - unalignedNucleotideSequences={ - "ebola-sudan": sequence_with_mutation("ebola-sudan"), - "ebola-zaire": sequence_with_mutation("ebola-zaire"), - }, - alignedNucleotideSequences={ - "ebola-sudan": sequence_with_mutation("ebola-sudan"), - "ebola-zaire": sequence_with_mutation("ebola-zaire"), - }, - nucleotideInsertions={}, - alignedAminoAcidSequences={ - "NPEbolaSudan": ebola_sudan_aa(sequence_with_mutation("single"), "NP"), - "VP35EbolaSudan": ebola_sudan_aa(sequence_with_mutation("single"), "VP35"), - "VP24EbolaZaire": ebola_zaire_aa(sequence_with_mutation("ebola-zaire"), "VP24"), - "LEbolaZaire": ebola_zaire_aa(sequence_with_mutation("ebola-zaire"), "L"), - }, - aminoAcidInsertions={}, - ), - ), Case( name="don't allow duplicated of the same segment", input_metadata={}, @@ -712,6 +670,10 @@ def invalid_sequence() -> str: nucleotideInsertions={}, alignedAminoAcidSequences={}, aminoAcidInsertions={}, + sequenceNameToFastaHeaderMap={ + "ebola-sudan": "prefix_ebola-sudan", + 
"ebola-zaire": "other_prefix_ebola-zaire", + }, ), ), Case( @@ -743,6 +705,7 @@ def invalid_sequence() -> str: nucleotideInsertions={}, alignedAminoAcidSequences={}, aminoAcidInsertions={}, + sequenceNameToFastaHeaderMap={}, ), ), Case( @@ -775,6 +738,7 @@ def invalid_sequence() -> str: nucleotideInsertions={}, alignedAminoAcidSequences={}, aminoAcidInsertions={}, + sequenceNameToFastaHeaderMap={"ebola-sudan": "ebola-sudan"}, ), ), ] diff --git a/preprocessing/specification.md b/preprocessing/specification.md index 00d4ffb645..07532a2f9a 100644 --- a/preprocessing/specification.md +++ b/preprocessing/specification.md @@ -92,6 +92,7 @@ In the NDJSON, each row contains a sequence entry version and a list of errors a nucleotideInsertions, alignedAminoAcidSequences, aminoAcidInsertions, + sequenceNameToFastaHeaderMap, //TODO: use in backend files } } diff --git a/website/src/components/Edit/SequencesForm.spec.tsx b/website/src/components/Edit/SequencesForm.spec.tsx index 9c12d48c09..39db17542a 100644 --- a/website/src/components/Edit/SequencesForm.spec.tsx +++ b/website/src/components/Edit/SequencesForm.spec.tsx @@ -205,7 +205,7 @@ describe('SequencesForm', () => { expect(editableSequences.rows).toEqual([ { - label: 'originalFastaHeader', + label: 'originalFastaHeader (mapped to unalignedProcessedSequenceName)', fastaHeader: 'originalFastaHeader', value: 'originalUnalignedNucleotideSequencesValue', initialValue: 'originalUnalignedNucleotideSequencesValue', diff --git a/website/src/components/Edit/SequencesForm.tsx b/website/src/components/Edit/SequencesForm.tsx index de9fa2c27b..919a9741c4 100644 --- a/website/src/components/Edit/SequencesForm.tsx +++ b/website/src/components/Edit/SequencesForm.tsx @@ -73,6 +73,18 @@ export class EditableSequences { return this.maxNumberOfRows > 1; } + static invertRecordMulti(obj: Record): Record { + const inverted: Record = {}; + + for (const key in obj) { + const value = obj[key]; + if (value === null) continue; + (inverted[value] 
??= []).push(key); + } + + return inverted; + } + /** * @param initialData The sequence entry to edit, from which the initial sequence data is taken. * @param referenceGenomeLightweightSchema @@ -82,16 +94,27 @@ export class EditableSequences { referenceGenomeLightweightSchema: ReferenceGenomesLightweightSchema, ): EditableSequences { const maxNumberRows = this.getMaxNumberOfRows(referenceGenomeLightweightSchema); + const fastaHeaderMap = EditableSequences.invertRecordMulti( + initialData.processedData.sequenceNameToFastaHeaderMap, + ); const existingDataRows = Object.entries(initialData.originalData.unalignedNucleotideSequences).map( - ([key, value]) => ({ - // TODO: older entries will still have the segmentName and not the fastaHeader as a key - label: key, // TODO: In future prepro will map the fastaHeader to the segment (will be added to the label) - fastaHeader: maxNumberRows > 1 ? key : initialData.submissionId, - value: value, - initialValue: value, - key: EditableSequences.getNextKey(), - }), + ([key, value]) => { + const mapped = fastaHeaderMap[key]?.join(', ') ?? ''; + const label = !mapped + ? `${key} (could not be classified)` + : mapped === key + ? key + : `${key} (mapped to ${mapped})`; + return { + label, + fastaHeader: maxNumberRows > 1 ? key : initialData.submissionId, + value: value, + initialValue: value, + key: EditableSequences.getNextKey(), + }; + }, ); + return new EditableSequences(existingDataRows, maxNumberRows); } @@ -149,7 +172,7 @@ export class EditableSequences { getFastaIds(submissionId: string): string { const filledRows = this.rows.filter((row) => row.value !== null); return this.isMultiSegmented() - ? filledRows.map((sequence) => sequence.label.replace(/\s+/g, '')).join(', ') + ?
filledRows.map((sequence) => sequence.fastaHeader).join(',') : submissionId; } diff --git a/website/src/components/ReviewPage/SequencesDialog.spec.tsx b/website/src/components/ReviewPage/SequencesDialog.spec.tsx index 5f106af900..98abed902a 100644 --- a/website/src/components/ReviewPage/SequencesDialog.spec.tsx +++ b/website/src/components/ReviewPage/SequencesDialog.spec.tsx @@ -93,6 +93,10 @@ const dataToView: SequenceEntryToEdit = { }, nucleotideInsertions: {}, aminoAcidInsertions: {}, + sequenceNameToFastaHeaderMap: { + [sequence1]: 'header1', + [sequence2]: 'header2', + }, files: null, }, status: processedStatus, diff --git a/website/src/types/backend.ts b/website/src/types/backend.ts index 20b69f7c89..21310175e6 100644 --- a/website/src/types/backend.ts +++ b/website/src/types/backend.ts @@ -234,6 +234,7 @@ export const sequenceEntryToEdit = accessionVersion.merge( nucleotideInsertions: z.record(z.array(z.string())), alignedAminoAcidSequences: z.record(z.string().nullable()), aminoAcidInsertions: z.record(z.array(z.string())), + sequenceNameToFastaHeaderMap: z.record(z.string().nullable()), files: filesByCategory.nullable(), }), }), diff --git a/website/vitest.setup.ts b/website/vitest.setup.ts index 9818f7ce59..36d669d707 100755 --- a/website/vitest.setup.ts +++ b/website/vitest.setup.ts @@ -119,6 +119,9 @@ export const defaultReviewData: SequenceEntryToEdit = { aminoAcidInsertions: { processedInsertionGeneName: ['aminoAcidInsertion1', 'aminoAcidInsertion2'], }, + sequenceNameToFastaHeaderMap: { + unalignedProcessedSequenceName: 'originalFastaHeader' + }, files: null, }, submissionId: 'defaultSubmitter',