Skip to content
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,11 @@ data class ProcessedData<SequenceType>(
example = """{"segment1": "fastaHeader1", "segment2": "fastaHeader2"}""",
description = "The key is the segment name, the value is the fastaHeader of the original Data",
)
// Segment name -> FASTA header of the submitted sequence assigned to that segment.
// Defaults to an empty map when not supplied.
val sequenceNameToFastaHeaderMap: Map<SegmentName, String> = emptyMap(),
// File category name -> list of files (ID and name) in that category; nullable.
@Schema(
example = """{"raw_reads": [{"fileId": "s0m3-uUiDd", "name": "data.fastaq"}], "sequencing_logs": []}""",
description = "The key is the file category name, the value is a list of files, with ID and name.",
)
val files: FileCategoryFilesMap?,
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ class CompressionService(
}
},
processedData.aminoAcidInsertions,
processedData.sequenceNameToFastaHeaderMap,
processedData.files,
)

Expand All @@ -144,6 +145,7 @@ class CompressionService(
}
},
processedData.aminoAcidInsertions,
processedData.sequenceNameToFastaHeaderMap,
processedData.files,
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ class EmptyProcessedDataProvider(private val backendConfig: BackendConfig) {
alignedAminoAcidSequences = referenceGenome.genes.map { it.name }.associateWith { null },
nucleotideInsertions = referenceGenome.nucleotideSequences.map { it.name }.associateWith { emptyList() },
aminoAcidInsertions = referenceGenome.genes.map { it.name }.associateWith { emptyList() },
sequenceNameToFastaHeaderMap = referenceGenome.nucleotideSequences.map { it.name }.associateWith { "" },
files = null,
)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,11 @@ class ProcessedSequenceEntryValidator(private val schema: Schema, private val re
"alignedNucleotideSequences",
)

validateNoUnknownSegment(
processedData.sequenceNameToFastaHeaderMap,
"sequenceNameToFastaHeaderMap",
)

validateNoUnknownSegment(
processedData.unalignedNucleotideSequences,
"unalignedNucleotideSequences",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -460,6 +460,7 @@ class SubmissionDatabaseService(
aminoAcidInsertions = processedData.aminoAcidInsertions.mapValues { (_, it) ->
it.map { insertion -> insertion.copy(sequence = insertion.sequence.uppercase(Locale.US)) }
},
sequenceNameToFastaHeaderMap = processedData.sequenceNameToFastaHeaderMap,
)

private fun validateExternalMetadata(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ val defaultProcessedData = ProcessedData(
Insertion(123, "RN"),
),
),
sequenceNameToFastaHeaderMap = mapOf(MAIN_SEGMENT to "header"),
files = null,
)

Expand Down Expand Up @@ -101,6 +102,7 @@ val defaultProcessedDataMultiSegmented = ProcessedData(
Insertion(123, "RN"),
),
),
sequenceNameToFastaHeaderMap = mapOf("notOnlySegment" to "header1", "secondSegment" to "header2"),
files = null,
)

Expand All @@ -117,6 +119,7 @@ val defaultProcessedDataWithoutSequences = ProcessedData<GeneticSequence>(
nucleotideInsertions = emptyMap(),
alignedAminoAcidSequences = emptyMap(),
aminoAcidInsertions = emptyMap(),
sequenceNameToFastaHeaderMap = emptyMap(),
files = null,
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ class ProcessedMetadataPostprocessorTest(
nucleotideInsertions = emptyMap(),
alignedAminoAcidSequences = emptyMap(),
aminoAcidInsertions = emptyMap(),
sequenceNameToFastaHeaderMap = emptyMap(),
files = null,
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,10 @@ class ProcessedSequencesPostprocessorTest(
unconfiguredPresentGene to listOf(Insertion(13, "TT")),
unconfiguredNullGene to emptyList(),
),
sequenceNameToFastaHeaderMap = mapOf(
"configuredPresentSeg" to "header1",
"unconfiguredPresentSeg" to "header2",
),
files = null,
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ fun row(
nucleotideInsertions = emptyMap(),
alignedAminoAcidSequences = emptyMap(),
aminoAcidInsertions = emptyMap(),
sequenceNameToFastaHeaderMap = emptyMap(),
files = null,
),
isRevocation = false,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ class UnprocessedEntry:
@dataclass
class SegmentAssignment:
unalignedNucleotideSequences: dict[SegmentName, NucleotideSequence | None] # noqa: N815
segmentNameToFastaHeaders: dict[SegmentName, str] # noqa: N815
sequenceNameToFastaHeaderMap: dict[SegmentName, str] # noqa: N815
errors: list[ProcessingAnnotation]
warnings: list[ProcessingAnnotation]

Expand All @@ -107,6 +107,7 @@ class UnprocessedAfterNextclade:
nucleotideInsertions: dict[SegmentName, list[NucleotideInsertion]] # noqa: N815
alignedAminoAcidSequences: dict[GeneName, AminoAcidSequence | None] # noqa: N815
aminoAcidInsertions: dict[GeneName, list[AminoAcidInsertion]] # noqa: N815
sequenceNameToFastaHeaderMap: dict[SegmentName, str] # noqa: N815
errors: list[ProcessingAnnotation]
warnings: list[ProcessingAnnotation]

Expand All @@ -125,6 +126,7 @@ class ProcessedData:
nucleotideInsertions: dict[SegmentName, Any] # noqa: N815
alignedAminoAcidSequences: dict[GeneName, Any] # noqa: N815
aminoAcidInsertions: dict[GeneName, Any] # noqa: N815
sequenceNameToFastaHeaderMap: dict[SegmentName, str] # noqa: N815
files: dict[str, list[FileIdAndName]] | None = None


Expand Down
43 changes: 30 additions & 13 deletions preprocessing/nextclade/src/loculus_preprocessing/prepro.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,7 +365,7 @@ def assign_segment_with_nextclade_sort(
message=msg,
)
)
segmentNameToFastaHeaders: dict[SegmentName, str] = {}
sequenceNameToFastaHeaderMap: dict[SegmentName, str] = {}
for segment_name, headers in sort_results_map.items():
if len(headers) > 1:
msg = (
Expand All @@ -381,7 +381,7 @@ def assign_segment_with_nextclade_sort(
)
)
continue
segmentNameToFastaHeaders[segment_name] = headers[0]
sequenceNameToFastaHeaderMap[segment_name] = headers[0]
unaligned_nucleotide_sequences[segment_name] = input_unaligned_sequences[headers[0]]

if (
Expand All @@ -399,7 +399,7 @@ def assign_segment_with_nextclade_sort(

return SegmentAssignment(
unalignedNucleotideSequences=unaligned_nucleotide_sequences,
segmentNameToFastaHeaders=segmentNameToFastaHeaders,
sequenceNameToFastaHeaderMap=sequenceNameToFastaHeaderMap,
errors=errors,
warnings=warnings,
)
Expand All @@ -412,7 +412,7 @@ def assign_single_segment(
errors: list[ProcessingAnnotation] = []
warnings: list[ProcessingAnnotation] = []
unaligned_nucleotide_sequences: dict[SegmentName, NucleotideSequence | None] = {}
fastaHeader = ""
sequenceNameToFastaHeaderMap: dict[SegmentName, str] = {}
if len(input_unaligned_sequences) > 1:
errors.append(
ProcessingAnnotation.from_single(
Expand All @@ -429,28 +429,29 @@ def assign_single_segment(
)
else:
fastaHeader, value = next(iter(input_unaligned_sequences.items()))
sequenceNameToFastaHeaderMap["main"] = fastaHeader
unaligned_nucleotide_sequences["main"] = value
return SegmentAssignment(
unalignedNucleotideSequences=unaligned_nucleotide_sequences,
segmentNameToFastaHeaders={"main": fastaHeader},
sequenceNameToFastaHeaderMap=sequenceNameToFastaHeaderMap,
errors=errors,
warnings=warnings,
)


def assign_segment_with_header(
def assign_segment_using_header(
input_unaligned_sequences: dict[str, NucleotideSequence | None],
config: Config,
) -> SegmentAssignment:
errors: list[ProcessingAnnotation] = []
warnings: list[ProcessingAnnotation] = []
unaligned_nucleotide_sequences: dict[SegmentName, NucleotideSequence | None] = {}
segmentNameToFastaHeaders: dict[SegmentName, str] = {}
sequenceNameToFastaHeaderMap: dict[SegmentName, str] = {}
duplicate_segments = set()
if not config.nucleotideSequences:
return SegmentAssignment(
unalignedNucleotideSequences={},
segmentNameToFastaHeaders={},
sequenceNameToFastaHeaderMap={},
errors=errors,
warnings=warnings,
)
Expand Down Expand Up @@ -480,13 +481,13 @@ def assign_segment_with_header(
)
)
elif len(unaligned_segment) == 1:
segmentNameToFastaHeaders[segment] = unaligned_segment[0]
sequenceNameToFastaHeaderMap[segment] = unaligned_segment[0]
unaligned_nucleotide_sequences[segment] = input_unaligned_sequences[
unaligned_segment[0]
]
remaining_segments = (
set(input_unaligned_sequences.keys())
- set(segmentNameToFastaHeaders.values())
- set(sequenceNameToFastaHeaderMap.values())
- duplicate_segments
)
if len(remaining_segments) > 0:
Expand All @@ -513,7 +514,7 @@ def assign_segment_with_header(
)
return SegmentAssignment(
unalignedNucleotideSequences=unaligned_nucleotide_sequences,
segmentNameToFastaHeaders=segmentNameToFastaHeaders,
sequenceNameToFastaHeaderMap=sequenceNameToFastaHeaderMap,
errors=errors,
warnings=warnings,
)
Expand All @@ -533,11 +534,13 @@ def enrich_with_nextclade( # noqa: C901, PLR0914, PLR0915
nucleotideInsertions: dict[SegmentName, list[NucleotideInsertion]]
alignedAminoAcidSequences: dict[GeneName, AminoAcidSequence | None]
aminoAcidInsertions: dict[GeneName, list[AminoAcidInsertion]]
sequenceNameToFastaHeaderMap: dict[SegmentName, str]
)` object.
"""
unaligned_nucleotide_sequences: dict[
AccessionVersion, dict[SegmentName, NucleotideSequence | None]
] = {}
segment_assignment_map: dict[AccessionVersion, dict[SegmentName, str]] = {}
alerts: Alerts = Alerts()
input_metadata: dict[AccessionVersion, dict[str, Any]] = {}
aligned_aminoacid_sequences: dict[
Expand All @@ -554,6 +557,7 @@ def enrich_with_nextclade( # noqa: C901, PLR0914, PLR0915
input_metadata[id]["group_id"] = entry.data.group_id
aligned_aminoacid_sequences[id] = {}
aligned_nucleotide_sequences[id] = {}
segment_assignment_map[id] = {}
if not config.multi_segment:
segment_assignment = assign_single_segment(
input_unaligned_sequences=entry.data.unalignedNucleotideSequences,
Expand All @@ -568,6 +572,7 @@ def enrich_with_nextclade( # noqa: C901, PLR0914, PLR0915
unaligned_nucleotide_sequences[id] = segment_assignment.unalignedNucleotideSequences
alerts.errors[id] = segment_assignment.errors
alerts.warnings[id] = segment_assignment.warnings
segment_assignment_map[id] = segment_assignment.sequenceNameToFastaHeaderMap

nextclade_metadata: defaultdict[
AccessionVersion, defaultdict[SegmentName, dict[str, Any] | None]
Expand Down Expand Up @@ -672,6 +677,7 @@ def enrich_with_nextclade( # noqa: C901, PLR0914, PLR0915
nucleotideInsertions=nucleotide_insertions[id],
alignedAminoAcidSequences=aligned_aminoacid_sequences[id],
aminoAcidInsertions=amino_acid_insertions[id],
sequenceNameToFastaHeaderMap=segment_assignment_map[id],
errors=alerts.errors[id],
warnings=alerts.warnings[id],
)
Expand Down Expand Up @@ -884,6 +890,7 @@ def processed_entry_no_alignment(
output_metadata: ProcessedMetadata,
errors: list[ProcessingAnnotation],
warnings: list[ProcessingAnnotation],
sequenceNameToFastaHeaderMap: dict[SegmentName, str],
) -> SubmissionData:
"""Process a single sequence without alignment"""

Expand All @@ -903,6 +910,7 @@ def processed_entry_no_alignment(
nucleotideInsertions=nucleotide_insertions,
alignedAminoAcidSequences=aligned_aminoacid_sequences,
aminoAcidInsertions=amino_acid_insertions,
sequenceNameToFastaHeaderMap=sequenceNameToFastaHeaderMap,
),
errors=errors,
warnings=warnings,
Expand Down Expand Up @@ -932,7 +940,7 @@ def process_single( # noqa: C901
else:
submitter = unprocessed.submitter
group_id = unprocessed.group_id
segment_assignment = assign_segment_with_header(
segment_assignment = assign_segment_using_header(
input_unaligned_sequences=unprocessed.unalignedNucleotideSequences,
config=config,
)
Expand Down Expand Up @@ -987,7 +995,14 @@ def process_single( # noqa: C901
logger.debug(f"Processed {id}: {output_metadata}")

if isinstance(unprocessed, UnprocessedData):
return processed_entry_no_alignment(id, unprocessed, output_metadata, errors, warnings)
return processed_entry_no_alignment(
id,
unprocessed,
output_metadata,
errors,
warnings,
segment_assignment.sequenceNameToFastaHeaderMap,
)

aligned_segments = set()
for sequence_and_dataset in config.nucleotideSequences:
Expand Down Expand Up @@ -1031,6 +1046,7 @@ def process_single( # noqa: C901
nucleotideInsertions=unprocessed.nucleotideInsertions,
alignedAminoAcidSequences=unprocessed.alignedAminoAcidSequences,
aminoAcidInsertions=unprocessed.aminoAcidInsertions,
sequenceNameToFastaHeaderMap=unprocessed.sequenceNameToFastaHeaderMap,
),
errors=list(set(errors)),
warnings=list(set(warnings)),
Expand All @@ -1056,6 +1072,7 @@ def processed_entry_with_errors(id) -> SubmissionData:
nucleotideInsertions=defaultdict(dict[str, Any]),
alignedAminoAcidSequences=defaultdict(dict[str, Any]),
aminoAcidInsertions=defaultdict(dict[str, Any]),
sequenceNameToFastaHeaderMap=defaultdict(str),
),
errors=[
ProcessingAnnotation.from_single(
Expand Down
8 changes: 8 additions & 0 deletions preprocessing/nextclade/tests/factory_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@ class ProcessedAlignment:
nucleotideInsertions: dict[str, list[str]] = field(default_factory=dict) # noqa: N815
alignedAminoAcidSequences: dict[str, str | None] = field(default_factory=dict) # noqa: N815
aminoAcidInsertions: dict[str, list[str]] = field(default_factory=dict) # noqa: N815
sequenceNameToFastaHeaderMap: dict[str, str] = field( # noqa: N815
default_factory=dict
)


@dataclass
Expand Down Expand Up @@ -140,6 +143,7 @@ def create_processed_entry(
nucleotideInsertions=processed_alignment.nucleotideInsertions,
alignedAminoAcidSequences=processed_alignment.alignedAminoAcidSequences,
aminoAcidInsertions=processed_alignment.aminoAcidInsertions,
sequenceNameToFastaHeaderMap=processed_alignment.sequenceNameToFastaHeaderMap,
),
errors=errors,
warnings=warnings,
Expand Down Expand Up @@ -242,3 +246,7 @@ def verify_processed_entry(
f"{test_name}: amino acid insertions '{actual.aminoAcidInsertions}' do not "
f"match expectation '{expected.aminoAcidInsertions}'."
)
assert actual.sequenceNameToFastaHeaderMap == expected.sequenceNameToFastaHeaderMap, (
f"{test_name}: sequence name to fasta header map '{actual.sequenceNameToFastaHeaderMap}' do not "
f"match expectation '{expected.sequenceNameToFastaHeaderMap}'."
)
Loading
Loading