-
Notifications
You must be signed in to change notification settings - Fork 9
feat!(backend): remove fasta header validation from backend and refactor how fasta and metadata are merged #4821
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
0aa0149
32ab96b
8ab5d14
6313f6b
05c8a6c
9af1826
5cb0014
01418ef
c9c4b67
48cbc47
458ce5f
2a4db34
2883291
fbee233
28c4b46
5fd6242
af93e19
d182c08
4a950cf
b76d1aa
477ffa3
0c3dff2
4914069
e160c4d
278d085
c46e22b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -4,6 +4,8 @@ import mu.KotlinLogging | |||||||||||
| import org.apache.commons.compress.archivers.zip.ZipFile | ||||||||||||
| import org.apache.commons.compress.compressors.CompressorStreamFactory | ||||||||||||
| import org.jetbrains.exposed.exceptions.ExposedSQLException | ||||||||||||
| import org.jetbrains.exposed.sql.transactions.transaction | ||||||||||||
| import org.jetbrains.exposed.sql.update | ||||||||||||
| import org.loculus.backend.api.DataUseTerms | ||||||||||||
| import org.loculus.backend.api.Organism | ||||||||||||
| import org.loculus.backend.api.SubmissionIdFilesMap | ||||||||||||
|
|
@@ -21,6 +23,7 @@ import org.loculus.backend.service.groupmanagement.GroupManagementPreconditionVa | |||||||||||
| import org.loculus.backend.service.submission.CompressionAlgorithm | ||||||||||||
| import org.loculus.backend.service.submission.MetadataUploadAuxTable | ||||||||||||
| import org.loculus.backend.service.submission.SequenceUploadAuxTable | ||||||||||||
| import org.loculus.backend.service.submission.SequenceUploadAuxTable.metadataSubmissionIdColumn | ||||||||||||
| import org.loculus.backend.service.submission.SubmissionIdFilesMappingPreconditionValidator | ||||||||||||
| import org.loculus.backend.service.submission.UploadDatabaseService | ||||||||||||
| import org.loculus.backend.utils.DateProvider | ||||||||||||
|
|
@@ -132,7 +135,7 @@ class SubmitModel( | |||||||||||
| if (requiresConsensusSequenceFile(submissionParams.organism)) { | ||||||||||||
| log.debug { "Validating submission with uploadId $uploadId" } | ||||||||||||
| val sequenceSubmissionIds = uploadDatabaseService.getSequenceUploadSubmissionIds(uploadId).toSet() | ||||||||||||
| validateSubmissionIdSetsForConsensusSequences(metadataSubmissionIds, sequenceSubmissionIds) | ||||||||||||
| mapMetadataKeysToSequenceKeys(metadataSubmissionIds, sequenceSubmissionIds, submissionParams.organism) | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| if (submissionParams is SubmissionParams.RevisionSubmissionParams) { | ||||||||||||
|
|
@@ -348,27 +351,100 @@ class SubmitModel( | |||||||||||
| ) | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| private fun validateSubmissionIdSetsForConsensusSequences( | ||||||||||||
| private fun SubmissionId.removeSuffixPattern(): SubmissionId { | ||||||||||||
| val lastDelimiter = this.lastIndexOf("_") | ||||||||||||
| if (lastDelimiter == -1) { | ||||||||||||
| return this | ||||||||||||
| } | ||||||||||||
| val cleaned = this.substring(0, lastDelimiter) | ||||||||||||
| return cleaned | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| @Transactional | ||||||||||||
| private fun mapMetadataKeysToSequenceKeys( | ||||||||||||
| metadataKeysSet: Set<SubmissionId>, | ||||||||||||
| sequenceKeysSet: Set<SubmissionId>, | ||||||||||||
| organism: Organism, | ||||||||||||
| ) { | ||||||||||||
| val metadataKeysNotInSequences = metadataKeysSet.subtract(sequenceKeysSet) | ||||||||||||
| val sequenceKeysNotInMetadata = sequenceKeysSet.subtract(metadataKeysSet) | ||||||||||||
| val metadataKeyToSequences = mutableMapOf<SubmissionId, MutableList<SubmissionId>>() | ||||||||||||
| val unmatchedSequenceKeys = mutableSetOf<SubmissionId>() | ||||||||||||
| val ambiguousSequenceKeys = mutableMapOf<SubmissionId, List<SubmissionId>>() | ||||||||||||
|
|
||||||||||||
| val referenceGenome = backendConfig.getInstanceConfig(organism).referenceGenome | ||||||||||||
|
|
||||||||||||
| for (seqKey in sequenceKeysSet) { | ||||||||||||
anna-parker marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||||||
| val matchedMetadataKey = if (referenceGenome.nucleotideSequences.size == 1) { | ||||||||||||
| val seqKeyInMeta = metadataKeysSet.contains(seqKey) | ||||||||||||
| when { | ||||||||||||
| seqKeyInMeta -> seqKey | ||||||||||||
| else -> null | ||||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Another refactoring suggestion:
Suggested change
and similar for the other |
||||||||||||
| } | ||||||||||||
| } else { | ||||||||||||
| val baseKey = seqKey.removeSuffixPattern() | ||||||||||||
| val seqKeyInMeta = metadataKeysSet.contains(seqKey) | ||||||||||||
| val baseKeyInMeta = metadataKeysSet.contains(baseKey) | ||||||||||||
| if ((seqKey != baseKey) && seqKeyInMeta && baseKeyInMeta) { | ||||||||||||
| ambiguousSequenceKeys[seqKey] = listOf(seqKey, baseKey) | ||||||||||||
| } | ||||||||||||
| when { | ||||||||||||
| seqKeyInMeta -> seqKey | ||||||||||||
| baseKeyInMeta -> baseKey | ||||||||||||
| else -> null | ||||||||||||
| } | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| if (matchedMetadataKey != null) { | ||||||||||||
| metadataKeyToSequences.computeIfAbsent(matchedMetadataKey) { mutableListOf() }.add(seqKey) | ||||||||||||
| } else { | ||||||||||||
| unmatchedSequenceKeys.add(seqKey) | ||||||||||||
| } | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| val metadataKeysWithoutSequences = metadataKeysSet.filterNot { metadataKeyToSequences.containsKey(it) } | ||||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this does the same but it's more concise:
Suggested change
|
||||||||||||
|
|
||||||||||||
| if (metadataKeysNotInSequences.isNotEmpty() || sequenceKeysNotInMetadata.isNotEmpty()) { | ||||||||||||
| val metadataNotPresentErrorText = if (metadataKeysNotInSequences.isNotEmpty()) { | ||||||||||||
| "Metadata file contains ${metadataKeysNotInSequences.size} ids that are not present " + | ||||||||||||
| "in the sequence file: " + metadataKeysNotInSequences.toList().joinToString(limit = 10) + "; " | ||||||||||||
| if (unmatchedSequenceKeys.isNotEmpty() || metadataKeysWithoutSequences.isNotEmpty() || | ||||||||||||
| ambiguousSequenceKeys.isNotEmpty() | ||||||||||||
| ) { | ||||||||||||
| val unmatchedSeqText = if (unmatchedSequenceKeys.isNotEmpty()) { | ||||||||||||
| "Sequence file contains ${unmatchedSequenceKeys.size} ids that are not present in the metadata file: ${ | ||||||||||||
| unmatchedSequenceKeys.joinToString(limit = 10) | ||||||||||||
| }; " | ||||||||||||
| } else { | ||||||||||||
| "" | ||||||||||||
| } | ||||||||||||
| val unmatchedMetadataText = if (metadataKeysWithoutSequences.isNotEmpty()) { | ||||||||||||
| "Metadata file contains ${metadataKeysWithoutSequences.size} ids that are not present in " + | ||||||||||||
| "the sequence file: ${metadataKeysWithoutSequences.joinToString(limit = 10)};" | ||||||||||||
| } else { | ||||||||||||
| "" | ||||||||||||
| } | ||||||||||||
| val sequenceNotPresentErrorText = if (sequenceKeysNotInMetadata.isNotEmpty()) { | ||||||||||||
| "Sequence file contains ${sequenceKeysNotInMetadata.size} ids that are not present " + | ||||||||||||
| "in the metadata file: " + sequenceKeysNotInMetadata.toList().joinToString(limit = 10) | ||||||||||||
| val ambiguousSequenceText = if (ambiguousSequenceKeys.isNotEmpty()) { | ||||||||||||
| "Sequence file contains ${ambiguousSequenceKeys.size} ids that could be matched to multiple metadata " + | ||||||||||||
| "keys, e.g. ${ | ||||||||||||
| ambiguousSequenceKeys.entries.joinToString(limit = 3) { (key, value) -> | ||||||||||||
| "Sequence key: $key matches $value" | ||||||||||||
| } | ||||||||||||
| } " + | ||||||||||||
| "- to avoid future issues we recommend not using the separator `_` in your metadata submissionIds;" | ||||||||||||
| } else { | ||||||||||||
| "" | ||||||||||||
| } | ||||||||||||
| throw UnprocessableEntityException(metadataNotPresentErrorText + sequenceNotPresentErrorText) | ||||||||||||
| throw UnprocessableEntityException(unmatchedSeqText + unmatchedMetadataText + ambiguousSequenceText) | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| transaction { | ||||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we really need |
||||||||||||
| for ((metadataSubmissionId, sequenceSubmissionIds) in metadataKeyToSequences) { | ||||||||||||
| for (sequenceSubmissionId in sequenceSubmissionIds) { | ||||||||||||
| SequenceUploadAuxTable.update( | ||||||||||||
| { | ||||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nitpick to improve readability:
Suggested change
|
||||||||||||
| SequenceUploadAuxTable.sequenceSubmissionIdColumn eq | ||||||||||||
| sequenceSubmissionId | ||||||||||||
| }, | ||||||||||||
| ) { | ||||||||||||
| it[metadataSubmissionIdColumn] = metadataSubmissionId | ||||||||||||
| } | ||||||||||||
| } | ||||||||||||
| } | ||||||||||||
| } | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
|
|
||||||||||||
This file was deleted.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,20 @@ | ||
| alter table sequence_upload_aux_table drop CONSTRAINT sequence_upload_aux_table_pkey; | ||
|
|
||
| alter table sequence_upload_aux_table add column sequence_submission_id text; | ||
| alter table sequence_upload_aux_table add column metadata_submission_id text; | ||
|
|
||
| update sequence_upload_aux_table | ||
| set metadata_submission_id = submission_id | ||
| where metadata_submission_id is null; | ||
|
|
||
| update sequence_upload_aux_table | ||
| set sequence_submission_id = case | ||
| when segment_name is NULL or segment_name = '' then submission_id | ||
| else submission_id || '_' || segment_name | ||
| end | ||
| where sequence_submission_id is NULL; | ||
|
|
||
| alter table sequence_upload_aux_table drop column segment_name; | ||
| alter table sequence_upload_aux_table drop column submission_id; | ||
|
|
||
| alter table sequence_upload_aux_table add PRIMARY KEY (upload_id, sequence_submission_id); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This class is already quite large. What do you think about moving parts of this method to a separate class/function? I think everything except the transaction might be a good candidate for refactoring (i.e. the for loop and the if statement with the validation).