Skip to content

Commit 70a17d0

Browse files
authored
Merge pull request #5052 from broadinstitute/sample-validator-cleanup
Loading sample validator cleanup
2 parents 5ba6a49 + 785d570 commit 70a17d0

File tree

5 files changed: +112 additions, -104 deletions

clickhouse_search/search.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -667,6 +667,11 @@ def delete_clickhouse_project(project, dataset_type, sample_type=None):
667667
return f'Deleted all {dataset_type} search data for project {project.name}'
668668

669669

670+
def delete_clickhouse_family(project, family_guid, dataset_type, sample_type=None):
671+
dataset_type = _clickhouse_dataset_type(dataset_type, sample_type)
672+
return f'Clickhouse does not support deleting individual families from project. Manually delete {dataset_type} data for {family_guid} in project {project.guid}'
673+
674+
670675
SV_DATASET_TYPES = {
671676
Sample.SAMPLE_TYPE_WGS: Sample.DATASET_TYPE_SV_CALLS,
672677
Sample.SAMPLE_TYPE_WES: 'GCNV',

seqr/utils/search/add_data_utils.py

Lines changed: 17 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
from django.db.models import F
44
import json
55
import requests
6-
from typing import Callable
76

87
from reference_data.models import GeneInfo, GENOME_VERSION_LOOKUP
98
from seqr.models import Sample, Individual, Project
@@ -197,67 +196,20 @@ def _get_pedigree_path(genome_version: str, sample_type: str, dataset_type: str)
197196
return f'{LOADING_DATASETS_DIR}/{GENOME_VERSION_LOOKUP[genome_version]}/{loading_dataset_type}/pedigrees/{sample_type}'
198197

199198

200-
def get_loading_samples_validator(vcf_samples: list[str], loaded_individual_ids: list[int], sample_source: str,
201-
missing_family_samples_error: str, loaded_sample_types: list[str] = None,
202-
fetch_missing_loaded_samples: Callable = None, fetch_missing_vcf_samples: Callable = None) -> Callable:
203-
204-
def validate_expected_samples(record_family_ids, previous_loaded_individuals, sample_type):
205-
errors = []
206-
207-
if loaded_sample_types is not None:
208-
if sample_type:
209-
loaded_sample_types.append(sample_type)
210-
else:
211-
errors.append('New data cannot be added to this project until the previously requested data is loaded')
212-
213-
families = set(record_family_ids.values())
214-
missing_samples_by_family = defaultdict(set)
215-
expected_sample_set = record_family_ids if fetch_missing_loaded_samples else vcf_samples
216-
for loaded_individual in previous_loaded_individuals:
217-
individual_id = loaded_individual[JsonConstants.INDIVIDUAL_ID_COLUMN]
218-
family_id = loaded_individual[JsonConstants.FAMILY_ID_COLUMN]
219-
if family_id in families and individual_id not in expected_sample_set:
220-
missing_samples_by_family[family_id].add(individual_id)
221-
222-
loading_samples = set(record_family_ids.keys())
223-
if missing_samples_by_family and fetch_missing_loaded_samples:
224-
try:
225-
additional_loaded_samples = fetch_missing_loaded_samples()
226-
for missing_samples in missing_samples_by_family.values():
227-
loading_samples.update(missing_samples.intersection(additional_loaded_samples))
228-
missing_samples -= additional_loaded_samples
229-
missing_samples_by_family = {
230-
family_id: samples for family_id, samples in missing_samples_by_family.items() if samples
231-
}
232-
except ValueError as e:
233-
errors.append(str(e))
234-
235-
if missing_samples_by_family:
236-
missing_family_sample_messages = [
237-
f'Family {family_id}: {", ".join(sorted(individual_ids))}'
238-
for family_id, individual_ids in missing_samples_by_family.items()
239-
]
240-
errors.append(
241-
missing_family_samples_error + '\n'.join(sorted(missing_family_sample_messages))
242-
)
243-
244-
missing_vcf_samples = [] if vcf_samples is None else set(loading_samples - set(vcf_samples))
245-
if missing_vcf_samples and fetch_missing_vcf_samples:
246-
try:
247-
additional_vcf_samples = fetch_missing_vcf_samples(missing_vcf_samples)
248-
missing_vcf_samples -= set(additional_vcf_samples)
249-
except ValueError as e:
250-
errors.append(str(e))
251-
if missing_vcf_samples:
252-
errors.insert(
253-
0, f'The following samples are included in {sample_source} but are missing from the VCF: {", ".join(sorted(missing_vcf_samples))}',
254-
)
255-
256-
nonlocal loaded_individual_ids
257-
loaded_individual_ids += [
258-
i['individual_id'] for i in previous_loaded_individuals if i[JsonConstants.FAMILY_ID_COLUMN] in families
259-
]
260-
261-
return errors
262-
263-
return validate_expected_samples
199+
def get_missing_family_samples(expected_sample_set, record_family_ids, previous_loaded_individuals):
200+
families = set(record_family_ids.values())
201+
missing_samples_by_family = defaultdict(set)
202+
for loaded_individual in previous_loaded_individuals:
203+
individual_id = loaded_individual[JsonConstants.INDIVIDUAL_ID_COLUMN]
204+
family_id = loaded_individual[JsonConstants.FAMILY_ID_COLUMN]
205+
if family_id in families and individual_id not in expected_sample_set:
206+
missing_samples_by_family[family_id].add(individual_id)
207+
208+
return missing_samples_by_family
209+
210+
211+
def get_loaded_individual_ids(record_family_ids, previous_loaded_individuals):
212+
families = set(record_family_ids.values())
213+
return [
214+
i['individual_id'] for i in previous_loaded_individuals if i[JsonConstants.FAMILY_ID_COLUMN] in families
215+
]

seqr/views/apis/anvil_workspace_api.py

Lines changed: 33 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
from seqr.views.utils.individual_utils import add_or_update_individuals_and_families
2323
from seqr.utils.communication_utils import send_html_email
2424
from seqr.utils.file_utils import list_files
25-
from seqr.utils.search.add_data_utils import get_loading_samples_validator, trigger_data_loading
25+
from seqr.utils.search.add_data_utils import get_missing_family_samples, get_loaded_individual_ids, trigger_data_loading
2626
from seqr.utils.vcf_utils import validate_vcf_and_get_samples, get_vcf_list
2727
from seqr.utils.logging_utils import SeqrLogger
2828
from seqr.utils.middleware import ErrorsWarningsException
@@ -244,10 +244,8 @@ def add_workspace_data(request, project_guid):
244244
def _parse_uploaded_pedigree(request_json, project=None, search_dataset_type=None):
245245
loaded_sample_types = [] if search_dataset_type else None
246246
loaded_individual_ids = []
247-
validate_expected_samples = get_loading_samples_validator(
248-
request_json['vcfSamples'], loaded_individual_ids, loaded_sample_types=loaded_sample_types, sample_source='the pedigree file',
249-
missing_family_samples_error='In order to load data for families with previously loaded data, new family samples must be joint called in a single VCF with all previously loaded samples. The following samples were previously loaded in this project but are missing from the VCF:\n',
250-
)
247+
def validate_expected_samples(*args):
248+
return _validate_expected_samples(request_json['vcfSamples'], loaded_sample_types, loaded_individual_ids, *args)
251249

252250
json_records = load_uploaded_file(request_json['uploadedFileId'])
253251
pedigree_records = parse_basic_pedigree_table(
@@ -258,6 +256,36 @@ def _parse_uploaded_pedigree(request_json, project=None, search_dataset_type=Non
258256
return pedigree_records, loaded_individual_ids, loaded_sample_types[0] if loaded_sample_types else None
259257

260258

259+
def _validate_expected_samples(vcf_samples, loaded_sample_types, loaded_individual_ids, record_family_ids, previous_loaded_individuals, sample_type):
260+
errors = []
261+
262+
if loaded_sample_types is not None:
263+
if sample_type:
264+
loaded_sample_types.append(sample_type)
265+
else:
266+
errors.append('New data cannot be added to this project until the previously requested data is loaded')
267+
268+
missing_vcf_samples = set(record_family_ids.keys()) - set(vcf_samples)
269+
if missing_vcf_samples:
270+
errors.append(
271+
f'The following samples are included in the pedigree file but are missing from the VCF: {", ".join(sorted(missing_vcf_samples))}',
272+
)
273+
274+
missing_samples_by_family = get_missing_family_samples(vcf_samples, record_family_ids, previous_loaded_individuals)
275+
if missing_samples_by_family:
276+
missing_family_sample_messages = [
277+
f'Family {family_id}: {", ".join(sorted(individual_ids))}'
278+
for family_id, individual_ids in missing_samples_by_family.items()
279+
]
280+
errors.append('\n'.join([
281+
'In order to load data for families with previously loaded data, new family samples must be joint called in a single VCF with all previously loaded samples. The following samples were previously loaded in this project but are missing from the VCF:',
282+
] + sorted(missing_family_sample_messages)))
283+
284+
loaded_individual_ids += get_loaded_individual_ids(record_family_ids, previous_loaded_individuals)
285+
286+
return errors
287+
288+
261289
def _trigger_add_workspace_data(project, pedigree_records, user, data_path, sample_type, previous_loaded_ids=None, get_pedigree_json=False):
262290
# add families and individuals according to the uploaded individual records
263291
pedigree_json, individual_ids = add_or_update_individuals_and_families(

seqr/views/apis/data_manager_api.py

Lines changed: 56 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616
from clickhouse_search.search import delete_clickhouse_project
1717
from seqr.utils.communication_utils import send_project_notification
18-
from seqr.utils.search.add_data_utils import trigger_data_loading, get_loading_samples_validator, trigger_delete_families_search
18+
from seqr.utils.search.add_data_utils import trigger_data_loading, get_missing_family_samples, get_loaded_individual_ids, trigger_delete_families_search
1919
from seqr.utils.search.elasticsearch.es_utils import get_elasticsearch_status, delete_es_index
2020
from seqr.utils.search.utils import clickhouse_only, es_only, InvalidSearchException
2121
from seqr.utils.file_utils import file_iter
@@ -372,21 +372,54 @@ def _get_valid_search_individuals(project, airtable_samples, vcf_samples, datase
372372
)
373373
}
374374

375+
if airtable_samples:
376+
missing_airtable_samples = {sample_id for sample_id in airtable_samples if sample_id not in search_individuals_by_id}
377+
if missing_airtable_samples:
378+
errors.append(
379+
f'The following samples are included in airtable for {project.name} but are missing from seqr: {", ".join(missing_airtable_samples)}')
380+
381+
previous_loaded_individuals, record_family_ids, _ = get_validated_related_individuals(
382+
project, search_individuals_by_id, errors, search_dataset_type=dataset_type, search_sample_type=sample_type,
383+
add_missing_parents=False,
384+
)
385+
386+
expected_sample_set = record_family_ids if airtable_samples else vcf_samples
387+
missing_samples_by_family = get_missing_family_samples(expected_sample_set, record_family_ids, previous_loaded_individuals.values())
388+
loading_samples = set(record_family_ids.keys())
389+
get_sample_kwargs = {
390+
'user': user, 'dataset_type': dataset_type, 'sample_type': sample_type, 'project_guid': project.guid,
391+
}
392+
if missing_samples_by_family and airtable_samples:
393+
try:
394+
additional_loaded_samples = {
395+
sample['sample_id'] for sample in _get_dataset_type_samples_for_matched_pdos(
396+
AVAILABLE_PDO_STATUSES, **get_sample_kwargs,
397+
)
398+
}
399+
for missing_samples in missing_samples_by_family.values():
400+
loading_samples.update(missing_samples.intersection(additional_loaded_samples))
401+
missing_samples -= additional_loaded_samples
402+
missing_samples_by_family = {
403+
family_id: samples for family_id, samples in missing_samples_by_family.items() if samples
404+
}
405+
except ValueError as e:
406+
errors.append(str(e))
407+
408+
sample_source = 'airtable' if airtable_samples else 'the vcf'
409+
if missing_samples_by_family:
410+
missing_family_sample_messages = [
411+
f'Family {family_id}: {", ".join(sorted(individual_ids))}'
412+
for family_id, individual_ids in missing_samples_by_family.items()
413+
]
414+
errors.append('\n'.join(
415+
[f'The following families have previously loaded samples absent from {sample_source}'] +
416+
sorted(missing_family_sample_messages)
417+
))
418+
375419
vcf_sample_id_map = {}
376-
if not airtable_samples:
377-
fetch_missing_loaded_samples = None
378-
fetch_missing_vcf_samples = None
379-
sample_source = 'the vcf'
380-
else:
381-
get_sample_kwargs = {
382-
'user': user, 'dataset_type': dataset_type, 'sample_type': sample_type, 'project_guid': project.guid,
383-
}
384-
fetch_missing_loaded_samples = lambda: {
385-
sample['sample_id'] for sample in _get_dataset_type_samples_for_matched_pdos(
386-
AVAILABLE_PDO_STATUSES, **get_sample_kwargs,
387-
)
388-
}
389-
def fetch_missing_vcf_samples(missing_vcf_samples):
420+
missing_vcf_samples = [] if vcf_samples is None else set(loading_samples - set(vcf_samples))
421+
if missing_vcf_samples and airtable_samples:
422+
try:
390423
samples = _get_dataset_type_samples_for_matched_pdos(
391424
LOADABLE_PDO_STATUSES + AVAILABLE_PDO_STATUSES, **get_sample_kwargs, sample_fields=['VCFIDWithMismatch'],
392425
additional_sample_filters={'SeqrIDWithMismatch': sorted(missing_vcf_samples)},
@@ -395,25 +428,15 @@ def fetch_missing_vcf_samples(missing_vcf_samples):
395428
s['sample_id']: s['VCFIDWithMismatch'] for s in samples
396429
if s['sample_id'] in airtable_samples and s['VCFIDWithMismatch'] in vcf_samples
397430
})
398-
return vcf_sample_id_map.keys()
399-
sample_source = 'airtable'
400-
401-
missing_airtable_samples = {sample_id for sample_id in airtable_samples if sample_id not in search_individuals_by_id}
402-
if missing_airtable_samples:
403-
errors.append(
404-
f'The following samples are included in airtable for {project.name} but are missing from seqr: {", ".join(missing_airtable_samples)}')
405-
406-
loaded_individual_ids = []
407-
validate_expected_samples = get_loading_samples_validator(
408-
vcf_samples, loaded_individual_ids, sample_source=sample_source,
409-
fetch_missing_loaded_samples=fetch_missing_loaded_samples, fetch_missing_vcf_samples=fetch_missing_vcf_samples,
410-
missing_family_samples_error= f'The following families have previously loaded samples absent from {sample_source}\n',
411-
)
431+
missing_vcf_samples -= set(vcf_sample_id_map.keys())
432+
except ValueError as e:
433+
errors.append(str(e))
434+
if missing_vcf_samples:
435+
errors.append(
436+
f'The following samples are included in {sample_source} but are missing from the VCF: {", ".join(sorted(missing_vcf_samples))}',
437+
)
412438

413-
get_validated_related_individuals(
414-
project, search_individuals_by_id, errors, search_dataset_type=dataset_type, search_sample_type=sample_type,
415-
validate_expected_samples=validate_expected_samples, add_missing_parents=False,
416-
)
439+
loaded_individual_ids = get_loaded_individual_ids(record_family_ids, previous_loaded_individuals.values())
417440

418441
return [i['id'] for i in search_individuals_by_id.values()] + loaded_individual_ids, vcf_sample_id_map
419442

seqr/views/apis/data_manager_api_tests.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1958,8 +1958,8 @@ def _trigger_error(self, url, body, variables, mock_open, mock_gzip_open, mock_m
19581958
self.assertDictEqual(response.json(), {
19591959
'warnings': None,
19601960
'errors': [
1961-
'The following samples are included in airtable but are missing from the VCF: NA21987',
19621961
'The following families have previously loaded samples absent from airtable\nFamily fam14: NA21234, NA21654',
1962+
'The following samples are included in airtable but are missing from the VCF: NA21987',
19631963
],
19641964
})
19651965
self.assertEqual(len(responses.calls), 2)

0 commit comments

Comments (0)