Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
488270f
feat(prepro): increase timeout
anna-parker Aug 11, 2025
7bddce1
double timeout while I investigate
anna-parker Aug 11, 2025
c71127d
higher timeout
anna-parker Aug 11, 2025
3ef8786
for kicks
anna-parker Aug 11, 2025
38ddc83
retry
anna-parker Aug 11, 2025
2a09e87
feat(prepro): move segment validation to prepro
anna-parker Aug 6, 2025
60e87b1
feat(prepro): fix tests
anna-parker Aug 6, 2025
b8e6771
feat(backend): add missing sequences in the unaligned and aligned ret…
anna-parker Aug 4, 2025
d4fd169
feat(prepro): start config
anna-parker Jul 31, 2025
9ab7332
feat(prepro): add sort code
anna-parker Aug 4, 2025
fe700cb
fix merge issue
anna-parker Aug 12, 2025
b7645ed
rebase fix
anna-parker Aug 12, 2025
9af680e
fix merge conflict
anna-parker Aug 12, 2025
15dd80d
feat(website): use mergedConfig to improve website experience for now
anna-parker Aug 12, 2025
1aec5a6
fix
anna-parker Aug 12, 2025
aa1ef09
feat(prepro): make config options even clearer
anna-parker Aug 12, 2025
48318d1
fix type
anna-parker Aug 12, 2025
8022a7f
try to fix weird error
anna-parker Aug 12, 2025
4c3b317
bad patch
anna-parker Aug 12, 2025
ccfde44
see where this happens
anna-parker Aug 12, 2025
4d51b9b
fix
anna-parker Aug 12, 2025
ce8824a
fix up
anna-parker Aug 12, 2025
145bfd5
fix dummy
anna-parker Aug 13, 2025
90a76c4
wupps
anna-parker Aug 13, 2025
9d7ee6b
fix
anna-parker Aug 13, 2025
6ac8e46
try again
anna-parker Aug 13, 2025
0a43242
Update kubernetes/loculus/values.yaml
anna-parker Aug 13, 2025
fa2cd75
feat: add gene_prefix option
anna-parker Sep 11, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion integration-tests/tests/pages/review.page.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ export class ReviewPage {
async waitForZeroProcessing() {
await expect(this.page.locator('[data-testid="review-page-control-panel"]')).toContainText(
'0 awaiting processing',
{ timeout: 60000 },
{ timeout: 200000 },
);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import { v4 as uuidv4 } from 'uuid';
const SEQUENCE = 'ATTGATCTCATCATTT';

test('Lineage field lineage counts', async ({ page, pageWithGroup }) => {
test.setTimeout(95_000);
test.setTimeout(900_000);
const uuid = uuidv4();

await page.goto('/');
Expand Down
4 changes: 2 additions & 2 deletions kubernetes/loculus/templates/_common-metadata.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -251,8 +251,8 @@ organisms:
{{ end }}
{{ .website | toYaml | nindent 6 }}
{{- end }}
referenceGenomes:
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The website changes are just for visualization purposes and should be reverted before merging. Currently the website cannot handle the unmerged reference genomes config.

Copy link
Contributor Author

@anna-parker anna-parker Aug 13, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The integration test failures are actually due to this change (the mergedReferenceGenomes object does not contain the insdc ReferenceAccessions). I am going to leave this as is for now, since fixing it is a bit tedious and these changes will have to be reverted anyway.

{{ $instance.referenceGenomes | toYaml | nindent 6 }}
referenceGenome:
{{ include "loculus.mergeReferenceGenomes" $instance.referenceGenomes | nindent 6 }}
{{- end }}
{{- end }}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,12 @@
{{- $metadata := ($organismConfig.schema | include "loculus.patchMetadataSchema" | fromYaml).metadata }}
{{- $rawNucleotideSequences := (($organismConfig.schema | include "loculus.patchMetadataSchema" | fromYaml).nucleotideSequences) }}
{{- $nucleotideSequences := ($rawNucleotideSequences | default "" ) }}
{{- $nucleotideSequencesList := (eq $rawNucleotideSequences nil | ternary (list "main") $rawNucleotideSequences) }}
{{- $referenceGenomes:= include "loculus.mergeReferenceGenomes" $organismConfig.referenceGenomes | fromYaml }}
{{- $genesList := (eq $referenceGenomes.genes nil | ternary (list) $referenceGenomes.genes) }}
{{- $genesDict := dict "genes" (list) -}}
{{- range $g := $genesList }}
{{- $_ := set $genesDict "genes" (append ($genesDict.genes) $g.name) -}}
{{- end }}
{{- range $processingIndex, $processingConfig := $organismConfig.preprocessing }}
{{- if $processingConfig.configFile }}
{{- /* Use the enaDepositionConfig as the base config */}}
Expand All @@ -17,7 +22,7 @@ data:
preprocessing-config.yaml: |
organism: {{ $organism }}
{{- $preproAndEnaConfigFile | toYaml | nindent 4 }}
{{- (dict "nucleotideSequences" $nucleotideSequencesList) | toYaml | nindent 4 }}
{{- $genesDict | toYaml | nindent 4 }}
processing_spec:
{{- $args := dict "metadata" $metadata "nucleotideSequences" $nucleotideSequences }}
{{- include "loculus.preprocessingSpecs" $args | nindent 6 }}
Expand Down
52 changes: 37 additions & 15 deletions kubernetes/loculus/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1312,7 +1312,6 @@ defaultOrganismConfig: &defaultOrganismConfig
replicas: 2
configFile: &preprocessingConfigFile
log_level: DEBUG
genes: []
batch_size: 100
create_embl_file: true
ingest: &ingest
Expand Down Expand Up @@ -1356,16 +1355,18 @@ defaultOrganisms:
- <<: *preprocessing
version: 1
configFile:
<<: *preprocessingConfigFile
genes: [NP, VP35, VP40, GP, sGP, ssGP, VP30, VP24, L]
nextclade_dataset_name: nextstrain/ebola/sudan
<<: *preprocessingConfigFile
nucleotideSequences:
- name: "main"
nextclade_dataset_name: nextstrain/ebola/sudan
nextclade_dataset_server: https://raw.githubusercontent.com/nextstrain/nextclade_data/ebola/data_output
- <<: *preprocessing
version: 2
configFile:
<<: *preprocessingConfigFile
genes: [NP, VP35, VP40, GP, sGP, ssGP, VP30, VP24, L]
nextclade_dataset_name: nextstrain/ebola/sudan
nucleotideSequences:
- name: "main"
nextclade_dataset_name: nextstrain/ebola/sudan
nextclade_dataset_server: https://raw.githubusercontent.com/nextstrain/nextclade_data/ebola/data_output
west-nile:
<<: *defaultOrganismConfig
Expand Down Expand Up @@ -1417,9 +1418,10 @@ defaultOrganisms:
version: 1
configFile:
<<: *preprocessingConfigFile
nextclade_dataset_name: nextstrain/wnv/all-lineages
nucleotideSequences:
- name: "main"
nextclade_dataset_name: nextstrain/wnv/all-lineages
nextclade_dataset_server: https://raw.githubusercontent.com/nextstrain/nextclade_data/wnv/data_output
genes: [capsid, prM, env, NS1, NS2A, NS2B, NS3, NS4A, 2K, NS4B, NS5]
ingest:
<<: *ingest
configFile:
Expand Down Expand Up @@ -1674,8 +1676,9 @@ defaultOrganisms:
- "prepro"
configFile:
log_level: DEBUG
genes: []
batch_size: 100
nucleotideSequences:
- name: "main"
referenceGenomes:
singleReference:
nucleotideSequences:
Expand Down Expand Up @@ -1734,9 +1737,14 @@ defaultOrganisms:
configFile:
<<: *preprocessingConfigFile
log_level: DEBUG
nextclade_dataset_name: nextstrain/cchfv/linked
nucleotideSequences:
- name: L
nextclade_dataset_name: nextstrain/cchfv/linked/L
- name: M
nextclade_dataset_name: nextstrain/cchfv/linked/M
- name: S
nextclade_dataset_name: nextstrain/cchfv/linked/S
nextclade_dataset_server: https://raw.githubusercontent.com/nextstrain/nextclade_data/cornelius-cchfv/data_output
genes: [RdRp, GPC, NP]
ingest:
<<: *ingest
configFile:
Expand Down Expand Up @@ -1772,7 +1780,7 @@ defaultOrganisms:
sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/cchf/NP.fasta]]"
enteroviruses:
<<: *defaultOrganismConfig
enabled: false
enabled: true
schema:
<<: *schema
organismName: "Enterovirus"
Expand Down Expand Up @@ -1848,9 +1856,23 @@ defaultOrganisms:
configFile:
<<: *preprocessingConfigFile
log_level: INFO
nextclade_dataset_name: community/hodcroftlab/enterovirus/enterovirus/linked
classify_with_nextclade_sort: True
minimizer_index: "https://raw.githubusercontent.com/alejandra-gonzalezsanchez/loculus-evs/master/evs_minimizer-index.json"
nucleotideSequences:
- name: CV-A16
nextclade_dataset_name: community/hodcroftlab/enterovirus/enterovirus/linked/CV-A16
accepted_sort_matches: ["community/hodcroftlab/enterovirus/cva16", "community/hodcroftlab/enterovirus/enterovirus/linked/CV-A16"]
gene_prefix: "CV-A16-"
- name: CV-A10
nextclade_dataset_name: community/hodcroftlab/enterovirus/enterovirus/linked/CV-A10
gene_prefix: "CV-A10-"
- name: EV-A71
nextclade_dataset_name: community/hodcroftlab/enterovirus/enterovirus/linked/EV-A71
gene_prefix: "EV-A71-"
- name: EV-D68
gene_prefix: "EV-D68-"
nextclade_dataset_name: community/hodcroftlab/enterovirus/enterovirus/linked/EV-D68
nextclade_dataset_server: https://raw.githubusercontent.com/alejandra-gonzalezsanchez/nextclade_data/multi-pathogen-evs/data_output
genes: ["CV-A16-VP4", "CV-A16-VP2", "CV-A16-VP3", "CV-A16-VP1", "CV-A16-2A", "CV-A16-2B", "CV-A16-2C", "CV-A16-3A", "CV-A16-3B", "CV-A16-3C", "CV-A16-3D", "CV-A10-VP4", "CV-A10-VP2", "CV-A10-VP3", "CV-A10-VP1", "CV-A10-2A", "CV-A10-2B", "CV-A10-2C", "CV-A10-3A", "CV-A10-3B", "CV-A10-3C", "CV-A10-3D", "EV-A71-VP4", "EV-A71-VP2", "EV-A71-VP3", "EV-A71-VP1", "EV-A71-2A", "EV-A71-2B", "EV-A71-2C", "EV-A71-3A", "EV-A71-3B", "EV-A71-3C", "EV-A71-3D", "EV-D68-VP4", "EV-D68-VP2", "EV-D68-VP3", "EV-D68-VP1", "EV-D68-2A", "EV-D68-2B", "EV-D68-2C", "EV-D68-3A", "EV-D68-3B", "EV-D68-3C", "EV-D68-3D"]
ingest:
<<: *ingest
configFile:
Expand Down Expand Up @@ -2070,7 +2092,7 @@ enforceHTTPS: true
registrationTermsMessage: >
You must agree to the <a href="http://main.loculus.org/terms">terms of use</a>.

enaDeposition:
enaDeposition:
submitToEnaProduction: false
enaDbName: Loculus
enaUniqueSuffix: Loculus
Expand Down
59 changes: 53 additions & 6 deletions preprocessing/nextclade/src/loculus_preprocessing/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,21 @@ class AlignmentRequirement(StrEnum):
# Determines whether ALL or ANY segments that a user provides must align.
# ANY: warn if some segments fail and some segments align
# ALL: error if any segment fails even if some segments align
# NONE: do not align any segments, just process them as-is
# - set if no nextclade dataset is provided
ANY = "ANY"
ALL = "ALL"
NONE = "NONE"


@dataclass
class NextcladeSequenceAndDataset:
    """One entry of the ``nucleotideSequences`` list in the preprocessing config.

    Every field has a default, so an entry only needs to specify the keys it
    wants to override.
    """

    # Name of the sequence; "main" when the entry does not provide one.
    name: str = "main"
    # Nextclade dataset settings for this sequence; None when not configured.
    nextclade_dataset_name: str | None = None
    nextclade_dataset_tag: str | None = None
    nextclade_dataset_server: str | None = None
    # NOTE(review): presumably the nextclade sort results accepted as a match
    # for this sequence - confirm against the code that consumes it.
    accepted_sort_matches: list[str] | None = None
    # NOTE(review): presumably a prefix combined with gene names of this
    # sequence (e.g. "CV-A16-") - confirm against the code that consumes it.
    gene_prefix: str | None = None


@dataclass
Expand All @@ -51,21 +64,20 @@ class Config:
keycloak_token_path: str = "realms/loculus/protocol/openid-connect/token" # noqa: S105

organism: str = "mpox"
nucleotideSequences: list[NextcladeSequenceAndDataset] = dataclasses.field( # noqa: N815
default_factory=list
)
genes: list[str] = dataclasses.field(default_factory=list)
nucleotideSequences: list[str] = dataclasses.field(default_factory=lambda: ["main"]) # noqa: N815
processing_spec: dict[str, dict[str, Any]] = dataclasses.field(default_factory=dict)
multi_segment: bool = False

alignment_requirement: AlignmentRequirement = AlignmentRequirement.ALL
nextclade_dataset_name: str | None = None
nextclade_dataset_name_map: dict[str, str] | None = None
nextclade_dataset_tag: str | None = None
nextclade_dataset_server: str = "https://data.clades.nextstrain.org/v3"
nextclade_dataset_server_map: dict[str, str] | None = None

require_nextclade_sort_match: bool = False
minimizer_url: str | None = None
accepted_dataset_matches: list[str] = dataclasses.field(default_factory=list)
classify_with_nextclade_sort: bool = False

create_embl_file: bool = False
scientific_name: str = "Orthonairovirus haemorrhagiae"
molecule_type: MoleculeType = MoleculeType.GENOMIC_RNA
Expand All @@ -77,13 +89,46 @@ class Config:
)


def assign_nextclade_sequence_and_dataset(
    nuc_seq_values: list[dict[str, Any]],
) -> list[NextcladeSequenceAndDataset]:
    """Convert the raw ``nucleotideSequences`` config value into typed entries.

    Args:
        nuc_seq_values: Parsed config value; expected to be a list of mappings
            keyed by ``NextcladeSequenceAndDataset`` field names.

    Returns:
        One ``NextcladeSequenceAndDataset`` per mapping in the input, in input
        order. Keys that are not declared fields, and keys whose value is
        ``None``, are ignored so the dataclass defaults apply.

    Raises:
        ValueError: If ``nuc_seq_values`` is not a list.
    """
    if not isinstance(nuc_seq_values, list):
        error_msg = f"nucleotideSequences should be a list of dicts, got: {type(nuc_seq_values)}"
        logger.error(error_msg)
        raise ValueError(error_msg)

    # Only declared dataclass fields may be set. A bare hasattr() check on the
    # instance would also accept attributes such as "__class__" or "__doc__".
    field_names = {field.name for field in dataclasses.fields(NextcladeSequenceAndDataset)}

    sequences: list[NextcladeSequenceAndDataset] = []
    for value in nuc_seq_values:
        if not isinstance(value, dict):
            # Best-effort: malformed entries (including None) are skipped, but
            # no longer silently.
            logger.warning("Skipping nucleotideSequences entry that is not a mapping: %r", value)
            continue
        overrides = {
            key: item
            for key, item in value.items()
            if key in field_names and item is not None
        }
        sequences.append(NextcladeSequenceAndDataset(**overrides))
    return sequences


def set_alignment_requirement(config: Config) -> AlignmentRequirement:
    """Return the alignment requirement that applies to ``config``.

    Despite the name, ``config`` is not modified; the result is returned for
    the caller to assign.

    Args:
        config: Configuration whose ``nucleotideSequences`` entries are checked.

    Returns:
        ``AlignmentRequirement.NONE`` when no entry in
        ``config.nucleotideSequences`` has a ``nextclade_dataset_name``
        (including when the list is empty); otherwise the unchanged
        ``config.alignment_requirement``.
    """
    # any() stops at the first sequence that has a dataset configured.
    has_dataset = any(
        sequence.nextclade_dataset_name for sequence in config.nucleotideSequences
    )
    if not has_dataset:
        return AlignmentRequirement.NONE
    return config.alignment_requirement


def load_config_from_yaml(config_file: str, config: Config | None = None) -> Config:
config = Config() if config is None else copy.deepcopy(config)
with open(config_file, encoding="utf-8") as file:
yaml_config = yaml.safe_load(file)
logger.debug(f"Loaded config from {config_file}: {yaml_config}")
for key, value in yaml_config.items():
if value is not None and hasattr(config, key):
if key == "nucleotideSequences":
setattr(config, key, assign_nextclade_sequence_and_dataset(value))
continue
attr = getattr(config, key)
if isinstance(attr, StrEnum):
try:
Expand Down Expand Up @@ -171,4 +216,6 @@ def get_config(config_file: str | None = None, ignore_args: bool = False) -> Con
if len(config.nucleotideSequences) > 1:
config.multi_segment = True

config.alignment_requirement = set_alignment_requirement(config)

return config
Loading
Loading