diff --git a/clickhouse_search/search.py b/clickhouse_search/search.py index bdc69ac7e6..bef7024541 100644 --- a/clickhouse_search/search.py +++ b/clickhouse_search/search.py @@ -667,6 +667,11 @@ def delete_clickhouse_project(project, dataset_type, sample_type=None): return f'Deleted all {dataset_type} search data for project {project.name}' +def delete_clickhouse_family(project, family_guid, dataset_type, sample_type=None): + dataset_type = _clickhouse_dataset_type(dataset_type, sample_type) + return f'Clickhouse does not support deleting individual families from project. Manually delete {dataset_type} data for {family_guid} in project {project.guid}' + + SV_DATASET_TYPES = { Sample.SAMPLE_TYPE_WGS: Sample.DATASET_TYPE_SV_CALLS, Sample.SAMPLE_TYPE_WES: 'GCNV', diff --git a/seqr/utils/search/add_data_utils.py b/seqr/utils/search/add_data_utils.py index 8fd12f4189..4159f20fc8 100644 --- a/seqr/utils/search/add_data_utils.py +++ b/seqr/utils/search/add_data_utils.py @@ -3,7 +3,6 @@ from django.db.models import F import json import requests -from typing import Callable from reference_data.models import GeneInfo, GENOME_VERSION_LOOKUP from seqr.models import Sample, Individual, Project @@ -197,67 +196,20 @@ def _get_pedigree_path(genome_version: str, sample_type: str, dataset_type: str) return f'{LOADING_DATASETS_DIR}/{GENOME_VERSION_LOOKUP[genome_version]}/{loading_dataset_type}/pedigrees/{sample_type}' -def get_loading_samples_validator(vcf_samples: list[str], loaded_individual_ids: list[int], sample_source: str, - missing_family_samples_error: str, loaded_sample_types: list[str] = None, - fetch_missing_loaded_samples: Callable = None, fetch_missing_vcf_samples: Callable = None) -> Callable: - - def validate_expected_samples(record_family_ids, previous_loaded_individuals, sample_type): - errors = [] - - if loaded_sample_types is not None: - if sample_type: - loaded_sample_types.append(sample_type) - else: - errors.append('New data cannot be added to this project until the previously requested data is loaded') - - families = set(record_family_ids.values()) - missing_samples_by_family = defaultdict(set) - expected_sample_set = record_family_ids if fetch_missing_loaded_samples else vcf_samples - for loaded_individual in previous_loaded_individuals: - individual_id = loaded_individual[JsonConstants.INDIVIDUAL_ID_COLUMN] - family_id = loaded_individual[JsonConstants.FAMILY_ID_COLUMN] - if family_id in families and individual_id not in expected_sample_set: - missing_samples_by_family[family_id].add(individual_id) - - loading_samples = set(record_family_ids.keys()) - if missing_samples_by_family and fetch_missing_loaded_samples: - try: - additional_loaded_samples = fetch_missing_loaded_samples() - for missing_samples in missing_samples_by_family.values(): - loading_samples.update(missing_samples.intersection(additional_loaded_samples)) - missing_samples -= additional_loaded_samples - missing_samples_by_family = { - family_id: samples for family_id, samples in missing_samples_by_family.items() if samples - } - except ValueError as e: - errors.append(str(e)) - - if missing_samples_by_family: - missing_family_sample_messages = [ - f'Family {family_id}: {", ".join(sorted(individual_ids))}' - for family_id, individual_ids in missing_samples_by_family.items() - ] - errors.append( - missing_family_samples_error + '\n'.join(sorted(missing_family_sample_messages)) - ) - - missing_vcf_samples = [] if vcf_samples is None else set(loading_samples - set(vcf_samples)) - if missing_vcf_samples and fetch_missing_vcf_samples: - try: - additional_vcf_samples = fetch_missing_vcf_samples(missing_vcf_samples) - missing_vcf_samples -= set(additional_vcf_samples) - except ValueError as e: - errors.append(str(e)) - if missing_vcf_samples: - errors.insert( - 0, f'The following samples are included in {sample_source} but are missing from the VCF: {", ".join(sorted(missing_vcf_samples))}', - ) - - nonlocal loaded_individual_ids - loaded_individual_ids += [ - i['individual_id'] for i in previous_loaded_individuals if i[JsonConstants.FAMILY_ID_COLUMN] in families - ] - - return errors - - return validate_expected_samples +def get_missing_family_samples(expected_sample_set, record_family_ids, previous_loaded_individuals): + families = set(record_family_ids.values()) + missing_samples_by_family = defaultdict(set) + for loaded_individual in previous_loaded_individuals: + individual_id = loaded_individual[JsonConstants.INDIVIDUAL_ID_COLUMN] + family_id = loaded_individual[JsonConstants.FAMILY_ID_COLUMN] + if family_id in families and individual_id not in expected_sample_set: + missing_samples_by_family[family_id].add(individual_id) + + return missing_samples_by_family + + +def get_loaded_individual_ids(record_family_ids, previous_loaded_individuals): + families = set(record_family_ids.values()) + return [ + i['individual_id'] for i in previous_loaded_individuals if i[JsonConstants.FAMILY_ID_COLUMN] in families + ] diff --git a/seqr/views/apis/anvil_workspace_api.py b/seqr/views/apis/anvil_workspace_api.py index 72bab3a39e..8905c91f2c 100644 --- a/seqr/views/apis/anvil_workspace_api.py +++ b/seqr/views/apis/anvil_workspace_api.py @@ -22,7 +22,7 @@ from seqr.views.utils.individual_utils import add_or_update_individuals_and_families from seqr.utils.communication_utils import send_html_email from seqr.utils.file_utils import list_files -from seqr.utils.search.add_data_utils import get_loading_samples_validator, trigger_data_loading +from seqr.utils.search.add_data_utils import get_missing_family_samples, get_loaded_individual_ids, trigger_data_loading from seqr.utils.vcf_utils import validate_vcf_and_get_samples, get_vcf_list from seqr.utils.logging_utils import SeqrLogger from seqr.utils.middleware import ErrorsWarningsException @@ -244,10 +244,8 @@ def add_workspace_data(request, project_guid): def _parse_uploaded_pedigree(request_json, project=None, search_dataset_type=None): loaded_sample_types = [] if search_dataset_type else None loaded_individual_ids = [] - validate_expected_samples = get_loading_samples_validator( - request_json['vcfSamples'], loaded_individual_ids, loaded_sample_types=loaded_sample_types, sample_source='the pedigree file', - missing_family_samples_error='In order to load data for families with previously loaded data, new family samples must be joint called in a single VCF with all previously loaded samples. The following samples were previously loaded in this project but are missing from the VCF:\n', - ) + def validate_expected_samples(*args): + return _validate_expected_samples(request_json['vcfSamples'], loaded_sample_types, loaded_individual_ids, *args) json_records = load_uploaded_file(request_json['uploadedFileId']) pedigree_records = parse_basic_pedigree_table( @@ -258,6 +256,36 @@ def _parse_uploaded_pedigree(request_json, project=None, search_dataset_type=Non return pedigree_records, loaded_individual_ids, loaded_sample_types[0] if loaded_sample_types else None +def _validate_expected_samples(vcf_samples, loaded_sample_types, loaded_individual_ids, record_family_ids, previous_loaded_individuals, sample_type): + errors = [] + + if loaded_sample_types is not None: + if sample_type: + loaded_sample_types.append(sample_type) + else: + errors.append('New data cannot be added to this project until the previously requested data is loaded') + + missing_vcf_samples = set(record_family_ids.keys()) - set(vcf_samples) + if missing_vcf_samples: + errors.append( + f'The following samples are included in the pedigree file but are missing from the VCF: {", ".join(sorted(missing_vcf_samples))}', + ) + + missing_samples_by_family = get_missing_family_samples(vcf_samples, record_family_ids, previous_loaded_individuals) + if missing_samples_by_family: + missing_family_sample_messages = [ + f'Family {family_id}: {", ".join(sorted(individual_ids))}' + for family_id, individual_ids in missing_samples_by_family.items() + ] + errors.append('\n'.join([ + 'In order to load data for families with previously loaded data, new family samples must be joint called in a single VCF with all previously loaded samples. The following samples were previously loaded in this project but are missing from the VCF:', + ] + sorted(missing_family_sample_messages))) + + loaded_individual_ids += get_loaded_individual_ids(record_family_ids, previous_loaded_individuals) + + return errors + + def _trigger_add_workspace_data(project, pedigree_records, user, data_path, sample_type, previous_loaded_ids=None, get_pedigree_json=False): # add families and individuals according to the uploaded individual records pedigree_json, individual_ids = add_or_update_individuals_and_families( diff --git a/seqr/views/apis/data_manager_api.py b/seqr/views/apis/data_manager_api.py index 4cd7b3ab81..f7ad59f445 100644 --- a/seqr/views/apis/data_manager_api.py +++ b/seqr/views/apis/data_manager_api.py @@ -15,7 +15,7 @@ from clickhouse_search.search import delete_clickhouse_project from seqr.utils.communication_utils import send_project_notification -from seqr.utils.search.add_data_utils import trigger_data_loading, get_loading_samples_validator, trigger_delete_families_search +from seqr.utils.search.add_data_utils import trigger_data_loading, get_missing_family_samples, get_loaded_individual_ids, trigger_delete_families_search from seqr.utils.search.elasticsearch.es_utils import get_elasticsearch_status, delete_es_index from seqr.utils.search.utils import clickhouse_only, es_only, InvalidSearchException from seqr.utils.file_utils import file_iter @@ -372,21 +372,54 @@ def _get_valid_search_individuals(project, airtable_samples, vcf_samples, datase ) } + if airtable_samples: + missing_airtable_samples = {sample_id for sample_id in airtable_samples if sample_id not in search_individuals_by_id} + if missing_airtable_samples: + errors.append( + f'The following samples are included in airtable for {project.name} but are missing from seqr: {", ".join(missing_airtable_samples)}') + + previous_loaded_individuals, record_family_ids, _ = get_validated_related_individuals( + project, search_individuals_by_id, errors, search_dataset_type=dataset_type, search_sample_type=sample_type, + add_missing_parents=False, + ) + + expected_sample_set = record_family_ids if airtable_samples else vcf_samples + missing_samples_by_family = get_missing_family_samples(expected_sample_set, record_family_ids, previous_loaded_individuals.values()) + loading_samples = set(record_family_ids.keys()) + get_sample_kwargs = { + 'user': user, 'dataset_type': dataset_type, 'sample_type': sample_type, 'project_guid': project.guid, + } + if missing_samples_by_family and airtable_samples: + try: + additional_loaded_samples = { + sample['sample_id'] for sample in _get_dataset_type_samples_for_matched_pdos( + AVAILABLE_PDO_STATUSES, **get_sample_kwargs, + ) + } + for missing_samples in missing_samples_by_family.values(): + loading_samples.update(missing_samples.intersection(additional_loaded_samples)) + missing_samples -= additional_loaded_samples + missing_samples_by_family = { + family_id: samples for family_id, samples in missing_samples_by_family.items() if samples + } + except ValueError as e: + errors.append(str(e)) + + sample_source = 'airtable' if airtable_samples else 'the vcf' + if missing_samples_by_family: + missing_family_sample_messages = [ + f'Family {family_id}: {", ".join(sorted(individual_ids))}' + for family_id, individual_ids in missing_samples_by_family.items() + ] + errors.append('\n'.join( + [f'The following families have previously loaded samples absent from {sample_source}'] + + sorted(missing_family_sample_messages) + )) + vcf_sample_id_map = {} - if not airtable_samples: - fetch_missing_loaded_samples = None - fetch_missing_vcf_samples = None - sample_source = 'the vcf' - else: - get_sample_kwargs = { - 'user': user, 'dataset_type': dataset_type, 'sample_type': sample_type, 'project_guid': project.guid, - } - fetch_missing_loaded_samples = lambda: { - sample['sample_id'] for sample in _get_dataset_type_samples_for_matched_pdos( - AVAILABLE_PDO_STATUSES, **get_sample_kwargs, - ) - } - def fetch_missing_vcf_samples(missing_vcf_samples): + missing_vcf_samples = [] if vcf_samples is None else set(loading_samples - set(vcf_samples)) + if missing_vcf_samples and airtable_samples: + try: samples = _get_dataset_type_samples_for_matched_pdos( LOADABLE_PDO_STATUSES + AVAILABLE_PDO_STATUSES, **get_sample_kwargs, sample_fields=['VCFIDWithMismatch'], additional_sample_filters={'SeqrIDWithMismatch': sorted(missing_vcf_samples)}, @@ -395,25 +428,15 @@ def fetch_missing_vcf_samples(missing_vcf_samples): s['sample_id']: s['VCFIDWithMismatch'] for s in samples if s['sample_id'] in airtable_samples and s['VCFIDWithMismatch'] in vcf_samples }) - return vcf_sample_id_map.keys() - sample_source = 'airtable' - - missing_airtable_samples = {sample_id for sample_id in airtable_samples if sample_id not in search_individuals_by_id} - if missing_airtable_samples: - errors.append( - f'The following samples are included in airtable for {project.name} but are missing from seqr: {", ".join(missing_airtable_samples)}') - - loaded_individual_ids = [] - validate_expected_samples = get_loading_samples_validator( - vcf_samples, loaded_individual_ids, sample_source=sample_source, - fetch_missing_loaded_samples=fetch_missing_loaded_samples, fetch_missing_vcf_samples=fetch_missing_vcf_samples, - missing_family_samples_error= f'The following families have previously loaded samples absent from {sample_source}\n', - ) + missing_vcf_samples -= set(vcf_sample_id_map.keys()) + except ValueError as e: + errors.append(str(e)) + if missing_vcf_samples: + errors.append( + f'The following samples are included in {sample_source} but are missing from the VCF: {", ".join(sorted(missing_vcf_samples))}', + ) - get_validated_related_individuals( - project, search_individuals_by_id, errors, search_dataset_type=dataset_type, search_sample_type=sample_type, - validate_expected_samples=validate_expected_samples, add_missing_parents=False, - ) + loaded_individual_ids = get_loaded_individual_ids(record_family_ids, previous_loaded_individuals.values()) return [i['id'] for i in search_individuals_by_id.values()] + loaded_individual_ids, vcf_sample_id_map diff --git a/seqr/views/apis/data_manager_api_tests.py b/seqr/views/apis/data_manager_api_tests.py index 508dac06ca..311855726b 100644 --- a/seqr/views/apis/data_manager_api_tests.py +++ b/seqr/views/apis/data_manager_api_tests.py @@ -1958,8 +1958,8 @@ def _trigger_error(self, url, body, variables, mock_open, mock_gzip_open, mock_m self.assertDictEqual(response.json(), { 'warnings': None, 'errors': [ - 'The following samples are included in airtable but are missing from the VCF: NA21987', 'The following families have previously loaded samples absent from airtable\nFamily fam14: NA21234, NA21654', + 'The following samples are included in airtable but are missing from the VCF: NA21987', ], }) self.assertEqual(len(responses.calls), 2)