Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit. Hold shift + click to select a range.
0aa0149
feat(backend): remove fasta header validation from backend and refact…
anna-parker Aug 6, 2025
32ab96b
feat(prepro): move segment validation to prepro
anna-parker Aug 6, 2025
8ab5d14
feat(prepro): fix tests
anna-parker Aug 6, 2025
6313f6b
Update schema documentation based on migration changes
actions-user Aug 6, 2025
05c8a6c
format
anna-parker Aug 6, 2025
9af1826
fix prepro
anna-parker Aug 6, 2025
5cb0014
retry
anna-parker Aug 6, 2025
01418ef
again
anna-parker Aug 6, 2025
c9c4b67
feat(prepro,backend): fix merge conflicts
anna-parker Aug 11, 2025
48cbc47
feat(backend): fix merge conflict
anna-parker Aug 11, 2025
458ce5f
feat(website): try to fix revisions
anna-parker Aug 11, 2025
2a4db34
feat(prepro): clean up more
anna-parker Aug 11, 2025
2883291
feat(prepro): add tests
anna-parker Aug 11, 2025
fbee233
feat(prepro): increase timeout
anna-parker Aug 11, 2025
28c4b46
feat(prepro): add more tests
anna-parker Aug 11, 2025
5fd6242
feat(backend): improve migration
anna-parker Aug 11, 2025
af93e19
Update schema documentation based on migration changes
actions-user Aug 11, 2025
d182c08
feat(kotlin): correctly define fields
anna-parker Aug 11, 2025
4a950cf
feat(backend): add tests for edge case
anna-parker Aug 11, 2025
b76d1aa
double timeout while I investigate
anna-parker Aug 11, 2025
477ffa3
feat(backend): only change for multi-segmented organisms
anna-parker Aug 11, 2025
0c3dff2
for kicks
anna-parker Aug 11, 2025
4914069
feat(prepro): update specification docs
anna-parker Aug 12, 2025
e160c4d
feat(prepro): try to fix integration tests
anna-parker Aug 12, 2025
278d085
feat: revert changes that are not required
anna-parker Aug 12, 2025
c46e22b
try again
anna-parker Aug 12, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions backend/docs/db/schema.sql
Original file line number Diff line number Diff line change
Expand Up @@ -527,9 +527,9 @@ ALTER VIEW public.sequence_entries_view OWNER TO postgres;

CREATE TABLE public.sequence_upload_aux_table (
upload_id text NOT NULL,
submission_id text NOT NULL,
segment_name text NOT NULL,
compressed_sequence_data text NOT NULL
compressed_sequence_data text NOT NULL,
sequence_submission_id text NOT NULL,
metadata_submission_id text
);


Expand Down Expand Up @@ -718,7 +718,7 @@ ALTER TABLE ONLY public.sequence_entries_preprocessed_data
--

ALTER TABLE ONLY public.sequence_upload_aux_table
ADD CONSTRAINT sequence_upload_aux_table_pkey PRIMARY KEY (upload_id, submission_id, segment_name);
ADD CONSTRAINT sequence_upload_aux_table_pkey PRIMARY KEY (upload_id, sequence_submission_id);


--
Expand Down
100 changes: 88 additions & 12 deletions backend/src/main/kotlin/org/loculus/backend/model/SubmitModel.kt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ import mu.KotlinLogging
import org.apache.commons.compress.archivers.zip.ZipFile
import org.apache.commons.compress.compressors.CompressorStreamFactory
import org.jetbrains.exposed.exceptions.ExposedSQLException
import org.jetbrains.exposed.sql.transactions.transaction
import org.jetbrains.exposed.sql.update
import org.loculus.backend.api.DataUseTerms
import org.loculus.backend.api.Organism
import org.loculus.backend.api.SubmissionIdFilesMap
Expand All @@ -21,6 +23,7 @@ import org.loculus.backend.service.groupmanagement.GroupManagementPreconditionVa
import org.loculus.backend.service.submission.CompressionAlgorithm
import org.loculus.backend.service.submission.MetadataUploadAuxTable
import org.loculus.backend.service.submission.SequenceUploadAuxTable
import org.loculus.backend.service.submission.SequenceUploadAuxTable.metadataSubmissionIdColumn
import org.loculus.backend.service.submission.SubmissionIdFilesMappingPreconditionValidator
import org.loculus.backend.service.submission.UploadDatabaseService
import org.loculus.backend.utils.DateProvider
Expand Down Expand Up @@ -132,7 +135,7 @@ class SubmitModel(
if (requiresConsensusSequenceFile(submissionParams.organism)) {
log.debug { "Validating submission with uploadId $uploadId" }
val sequenceSubmissionIds = uploadDatabaseService.getSequenceUploadSubmissionIds(uploadId).toSet()
validateSubmissionIdSetsForConsensusSequences(metadataSubmissionIds, sequenceSubmissionIds)
mapMetadataKeysToSequenceKeys(metadataSubmissionIds, sequenceSubmissionIds, submissionParams.organism)
}

if (submissionParams is SubmissionParams.RevisionSubmissionParams) {
Expand Down Expand Up @@ -348,27 +351,100 @@ class SubmitModel(
)
}

private fun validateSubmissionIdSetsForConsensusSequences(
/**
 * Strips the trailing `_<suffix>` component from a submission id,
 * e.g. `"sample1_seg2"` -> `"sample1"`.
 *
 * Only the LAST underscore is treated as the delimiter, so ids that themselves
 * contain underscores lose just their final component. Returns the receiver
 * unchanged when it contains no `_` at all — `substringBeforeLast` defaults to
 * the whole string in the missing-delimiter case, matching the previous
 * `lastIndexOf`/`substring` logic exactly.
 */
private fun SubmissionId.removeSuffixPattern(): SubmissionId = substringBeforeLast("_")

@Transactional
private fun mapMetadataKeysToSequenceKeys(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This class is already quite large. What do you think about moving parts of this method to a separate class/function? I think everything except the transaction might be a good candidate for refactoring (i.e. the for loop and the if statement with the validation).

metadataKeysSet: Set<SubmissionId>,
sequenceKeysSet: Set<SubmissionId>,
organism: Organism,
) {
val metadataKeysNotInSequences = metadataKeysSet.subtract(sequenceKeysSet)
val sequenceKeysNotInMetadata = sequenceKeysSet.subtract(metadataKeysSet)
val metadataKeyToSequences = mutableMapOf<SubmissionId, MutableList<SubmissionId>>()
val unmatchedSequenceKeys = mutableSetOf<SubmissionId>()
val ambiguousSequenceKeys = mutableMapOf<SubmissionId, List<SubmissionId>>()

val referenceGenome = backendConfig.getInstanceConfig(organism).referenceGenome

for (seqKey in sequenceKeysSet) {
val matchedMetadataKey = if (referenceGenome.nucleotideSequences.size == 1) {
val seqKeyInMeta = metadataKeysSet.contains(seqKey)
when {
seqKeyInMeta -> seqKey
else -> null
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Another refactoring suggestion:

Suggested change
else -> null
else -> {
unmatchedSequenceKeys.add(seqKey)
continue
}

and similar for the other else case below. Then we would not need the if (matchedMetadataKey != null) { below.
IMO that straightens the control flow of this loop and makes it easier to understand.

}
} else {
val baseKey = seqKey.removeSuffixPattern()
val seqKeyInMeta = metadataKeysSet.contains(seqKey)
val baseKeyInMeta = metadataKeysSet.contains(baseKey)
if ((seqKey != baseKey) && seqKeyInMeta && baseKeyInMeta) {
ambiguousSequenceKeys[seqKey] = listOf(seqKey, baseKey)
}
when {
seqKeyInMeta -> seqKey
baseKeyInMeta -> baseKey
else -> null
}
}

if (matchedMetadataKey != null) {
metadataKeyToSequences.computeIfAbsent(matchedMetadataKey) { mutableListOf() }.add(seqKey)
} else {
unmatchedSequenceKeys.add(seqKey)
}
}

val metadataKeysWithoutSequences = metadataKeysSet.filterNot { metadataKeyToSequences.containsKey(it) }
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this does the same but it's more concise:

Suggested change
val metadataKeysWithoutSequences = metadataKeysSet.filterNot { metadataKeyToSequences.containsKey(it) }
val metadataKeysWithoutSequences = metadataKeysSet.subtract(metadataKeyToSequences.keys)


if (metadataKeysNotInSequences.isNotEmpty() || sequenceKeysNotInMetadata.isNotEmpty()) {
val metadataNotPresentErrorText = if (metadataKeysNotInSequences.isNotEmpty()) {
"Metadata file contains ${metadataKeysNotInSequences.size} ids that are not present " +
"in the sequence file: " + metadataKeysNotInSequences.toList().joinToString(limit = 10) + "; "
if (unmatchedSequenceKeys.isNotEmpty() || metadataKeysWithoutSequences.isNotEmpty() ||
ambiguousSequenceKeys.isNotEmpty()
) {
val unmatchedSeqText = if (unmatchedSequenceKeys.isNotEmpty()) {
"Sequence file contains ${unmatchedSequenceKeys.size} ids that are not present in the metadata file: ${
unmatchedSequenceKeys.joinToString(limit = 10)
}; "
} else {
""
}
val unmatchedMetadataText = if (metadataKeysWithoutSequences.isNotEmpty()) {
"Metadata file contains ${metadataKeysWithoutSequences.size} ids that are not present in " +
"the sequence file: ${metadataKeysWithoutSequences.joinToString(limit = 10)};"
} else {
""
}
val sequenceNotPresentErrorText = if (sequenceKeysNotInMetadata.isNotEmpty()) {
"Sequence file contains ${sequenceKeysNotInMetadata.size} ids that are not present " +
"in the metadata file: " + sequenceKeysNotInMetadata.toList().joinToString(limit = 10)
val ambiguousSequenceText = if (ambiguousSequenceKeys.isNotEmpty()) {
"Sequence file contains ${ambiguousSequenceKeys.size} ids that could be matched to multiple metadata " +
"keys, e.g. ${
ambiguousSequenceKeys.entries.joinToString(limit = 3) { (key, value) ->
"Sequence key: $key matches $value"
}
} " +
"- to avoid future issues we recommend not using the separator `_` in your metadata submissionIds;"
} else {
""
}
throw UnprocessableEntityException(metadataNotPresentErrorText + sequenceNotPresentErrorText)
throw UnprocessableEntityException(unmatchedSeqText + unmatchedMetadataText + ambiguousSequenceText)
}

transaction {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we really need transaction in a @Transactional method? I thought that the annotation already wraps the whole method in a transaction?

for ((metadataSubmissionId, sequenceSubmissionIds) in metadataKeyToSequences) {
for (sequenceSubmissionId in sequenceSubmissionIds) {
SequenceUploadAuxTable.update(
{
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nitpick to improve readability:

Suggested change
{
where = {

SequenceUploadAuxTable.sequenceSubmissionIdColumn eq
sequenceSubmissionId
},
) {
it[metadataSubmissionIdColumn] = metadataSubmissionId
}
}
}
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@ const val SEQUENCE_UPLOAD_AUX_TABLE_NAME = "sequence_upload_aux_table"

object SequenceUploadAuxTable : Table(SEQUENCE_UPLOAD_AUX_TABLE_NAME) {
val sequenceUploadIdColumn = varchar("upload_id", 255)
val sequenceSubmissionIdColumn = varchar("submission_id", 255)
val segmentNameColumn = varchar("segment_name", 255)
val sequenceSubmissionIdColumn = varchar("sequence_submission_id", 255)
val metadataSubmissionIdColumn = varchar("metadata_submission_id", 255).nullable()
val compressedSequenceDataColumn = jacksonSerializableJsonb<CompressedSequence>("compressed_sequence_data")

override val primaryKey = PrimaryKey(sequenceUploadIdColumn, sequenceSubmissionIdColumn, segmentNameColumn)
override val primaryKey = PrimaryKey(sequenceUploadIdColumn, sequenceSubmissionIdColumn)
}
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,12 @@ import org.loculus.backend.service.submission.MetadataUploadAuxTable.submitterCo
import org.loculus.backend.service.submission.MetadataUploadAuxTable.uploadIdColumn
import org.loculus.backend.service.submission.MetadataUploadAuxTable.uploadedAtColumn
import org.loculus.backend.service.submission.SequenceUploadAuxTable.compressedSequenceDataColumn
import org.loculus.backend.service.submission.SequenceUploadAuxTable.segmentNameColumn
import org.loculus.backend.service.submission.SequenceUploadAuxTable.metadataSubmissionIdColumn
import org.loculus.backend.service.submission.SequenceUploadAuxTable.sequenceSubmissionIdColumn
import org.loculus.backend.service.submission.SequenceUploadAuxTable.sequenceUploadIdColumn
import org.loculus.backend.utils.DatabaseConstants
import org.loculus.backend.utils.FastaEntry
import org.loculus.backend.utils.MetadataEntry
import org.loculus.backend.utils.ParseFastaHeader
import org.loculus.backend.utils.RevisionEntry
import org.loculus.backend.utils.chunkedForDatabase
import org.loculus.backend.utils.getNextSequenceNumbers
Expand All @@ -54,7 +53,6 @@ private const val METADATA_BATCH_SIZE = DatabaseConstants.POSTGRESQL_PARAMETER_L
@Service
@Transactional
class UploadDatabaseService(
private val parseFastaHeader: ParseFastaHeader,
private val compressor: CompressionService,
private val accessionPreconditionValidator: AccessionPreconditionValidator,
private val dataUseTermsDatabaseService: DataUseTermsDatabaseService,
Expand Down Expand Up @@ -114,13 +112,12 @@ class UploadDatabaseService(
) {
uploadedSequencesBatch.chunkedForDatabase({ batch ->
SequenceUploadAuxTable.batchInsert(batch) {
val (submissionId, segmentName) = parseFastaHeader.parse(it.sampleName, submittedOrganism)
this[sequenceSubmissionIdColumn] = submissionId
this[segmentNameColumn] = segmentName
this[sequenceSubmissionIdColumn] = it.sampleName
this[metadataSubmissionIdColumn] = null
this[sequenceUploadIdColumn] = uploadId
this[compressedSequenceDataColumn] = compressor.compressNucleotideSequence(
it.sequence,
segmentName,
it.sampleName,
submittedOrganism,
)
}
Expand Down Expand Up @@ -176,9 +173,9 @@ class UploadDatabaseService(
'unalignedNucleotideSequences',
COALESCE(
jsonb_object_agg(
sequence_upload_aux_table.segment_name,
sequence_upload_aux_table.sequence_submission_id,
sequence_upload_aux_table.compressed_sequence_data::jsonb
) FILTER (WHERE sequence_upload_aux_table.segment_name IS NOT NULL),
) FILTER (WHERE sequence_upload_aux_table.sequence_submission_id IS NOT NULL),
'{}'::jsonb
)
)
Expand All @@ -187,7 +184,7 @@ class UploadDatabaseService(
LEFT JOIN
sequence_upload_aux_table
ON metadata_upload_aux_table.upload_id = sequence_upload_aux_table.upload_id
AND metadata_upload_aux_table.submission_id = sequence_upload_aux_table.submission_id
AND metadata_upload_aux_table.submission_id = sequence_upload_aux_table.metadata_submission_id
WHERE metadata_upload_aux_table.upload_id = ?
GROUP BY
metadata_upload_aux_table.upload_id,
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
-- Migration: split sequence_upload_aux_table's single submission_id into
--   * sequence_submission_id — the full per-sequence key (id plus optional
--     "_<segment_name>" suffix), and
--   * metadata_submission_id — the id used to join against
--     metadata_upload_aux_table.
-- Statement order matters: the old PK must go before its columns are dropped,
-- and both new columns must be backfilled before the new PK is added.

-- The old primary key covered (upload_id, submission_id, segment_name).
alter table sequence_upload_aux_table drop CONSTRAINT sequence_upload_aux_table_pkey;

alter table sequence_upload_aux_table add column sequence_submission_id text;
alter table sequence_upload_aux_table add column metadata_submission_id text;

-- Existing rows were keyed by submission_id, so it doubles as the metadata key.
-- (The WHERE clause makes the statement idempotent on re-run.)
update sequence_upload_aux_table
set metadata_submission_id = submission_id
where metadata_submission_id is null;

-- Rebuild the per-sequence key as "<submission_id>_<segment_name>" for
-- segmented rows; rows with an empty/NULL segment keep the plain id.
update sequence_upload_aux_table
set sequence_submission_id = case
when segment_name is NULL or segment_name = '' then submission_id
else submission_id || '_' || segment_name
end
where sequence_submission_id is NULL;

alter table sequence_upload_aux_table drop column segment_name;
alter table sequence_upload_aux_table drop column submission_id;

-- Adding the PK also marks both columns NOT NULL; safe because every row was
-- backfilled above.
alter table sequence_upload_aux_table add PRIMARY KEY (upload_id, sequence_submission_id);
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ val defaultOriginalData = OriginalData(
"country" to "Switzerland",
"division" to "Bern",
),
mapOf("main" to "ACTG"),
mapOf("custom0" to "ACTG"),
)

val emptyOriginalData = OriginalData<GeneticSequence>(metadata = emptyMap(), unalignedNucleotideSequences = emptyMap())
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ class SubmitEndpointSingleSegmentedTest(
.data
.unalignedNucleotideSequences

assertThat(unalignedNucleotideSequences, hasEntry(DEFAULT_SEQUENCE_NAME, "AC"))
assertThat(unalignedNucleotideSequences, hasEntry("header1", "AC"))
}

@Test
Expand Down
Loading
Loading