Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions clickhouse_search/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -667,6 +667,11 @@ def delete_clickhouse_project(project, dataset_type, sample_type=None):
return f'Deleted all {dataset_type} search data for project {project.name}'


def delete_clickhouse_family(project, family_guid, dataset_type, sample_type=None):
dataset_type = _clickhouse_dataset_type(dataset_type, sample_type)
return f'Clickhouse does not support deleting individual families from project. Manually delete {dataset_type} data for {family_guid} in project {project.guid}'


SV_DATASET_TYPES = {
Sample.SAMPLE_TYPE_WGS: Sample.DATASET_TYPE_SV_CALLS,
Sample.SAMPLE_TYPE_WES: 'GCNV',
Expand Down
82 changes: 17 additions & 65 deletions seqr/utils/search/add_data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from django.db.models import F
import json
import requests
from typing import Callable

from reference_data.models import GeneInfo, GENOME_VERSION_LOOKUP
from seqr.models import Sample, Individual, Project
Expand Down Expand Up @@ -197,67 +196,20 @@ def _get_pedigree_path(genome_version: str, sample_type: str, dataset_type: str)
return f'{LOADING_DATASETS_DIR}/{GENOME_VERSION_LOOKUP[genome_version]}/{loading_dataset_type}/pedigrees/{sample_type}'


def get_loading_samples_validator(vcf_samples: list[str], loaded_individual_ids: list[int], sample_source: str,
missing_family_samples_error: str, loaded_sample_types: list[str] = None,
fetch_missing_loaded_samples: Callable = None, fetch_missing_vcf_samples: Callable = None) -> Callable:

def validate_expected_samples(record_family_ids, previous_loaded_individuals, sample_type):
errors = []

if loaded_sample_types is not None:
if sample_type:
loaded_sample_types.append(sample_type)
else:
errors.append('New data cannot be added to this project until the previously requested data is loaded')

families = set(record_family_ids.values())
missing_samples_by_family = defaultdict(set)
expected_sample_set = record_family_ids if fetch_missing_loaded_samples else vcf_samples
for loaded_individual in previous_loaded_individuals:
individual_id = loaded_individual[JsonConstants.INDIVIDUAL_ID_COLUMN]
family_id = loaded_individual[JsonConstants.FAMILY_ID_COLUMN]
if family_id in families and individual_id not in expected_sample_set:
missing_samples_by_family[family_id].add(individual_id)

loading_samples = set(record_family_ids.keys())
if missing_samples_by_family and fetch_missing_loaded_samples:
try:
additional_loaded_samples = fetch_missing_loaded_samples()
for missing_samples in missing_samples_by_family.values():
loading_samples.update(missing_samples.intersection(additional_loaded_samples))
missing_samples -= additional_loaded_samples
missing_samples_by_family = {
family_id: samples for family_id, samples in missing_samples_by_family.items() if samples
}
except ValueError as e:
errors.append(str(e))

if missing_samples_by_family:
missing_family_sample_messages = [
f'Family {family_id}: {", ".join(sorted(individual_ids))}'
for family_id, individual_ids in missing_samples_by_family.items()
]
errors.append(
missing_family_samples_error + '\n'.join(sorted(missing_family_sample_messages))
)

missing_vcf_samples = [] if vcf_samples is None else set(loading_samples - set(vcf_samples))
if missing_vcf_samples and fetch_missing_vcf_samples:
try:
additional_vcf_samples = fetch_missing_vcf_samples(missing_vcf_samples)
missing_vcf_samples -= set(additional_vcf_samples)
except ValueError as e:
errors.append(str(e))
if missing_vcf_samples:
errors.insert(
0, f'The following samples are included in {sample_source} but are missing from the VCF: {", ".join(sorted(missing_vcf_samples))}',
)

nonlocal loaded_individual_ids
loaded_individual_ids += [
i['individual_id'] for i in previous_loaded_individuals if i[JsonConstants.FAMILY_ID_COLUMN] in families
]

return errors

return validate_expected_samples
def get_missing_family_samples(expected_sample_set, record_family_ids, previous_loaded_individuals):
families = set(record_family_ids.values())
missing_samples_by_family = defaultdict(set)
for loaded_individual in previous_loaded_individuals:
individual_id = loaded_individual[JsonConstants.INDIVIDUAL_ID_COLUMN]
family_id = loaded_individual[JsonConstants.FAMILY_ID_COLUMN]
if family_id in families and individual_id not in expected_sample_set:
missing_samples_by_family[family_id].add(individual_id)

return missing_samples_by_family


def get_loaded_individual_ids(record_family_ids, previous_loaded_individuals):
families = set(record_family_ids.values())
return [
i['individual_id'] for i in previous_loaded_individuals if i[JsonConstants.FAMILY_ID_COLUMN] in families
]
38 changes: 33 additions & 5 deletions seqr/views/apis/anvil_workspace_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from seqr.views.utils.individual_utils import add_or_update_individuals_and_families
from seqr.utils.communication_utils import send_html_email
from seqr.utils.file_utils import list_files
from seqr.utils.search.add_data_utils import get_loading_samples_validator, trigger_data_loading
from seqr.utils.search.add_data_utils import get_missing_family_samples, get_loaded_individual_ids, trigger_data_loading
from seqr.utils.vcf_utils import validate_vcf_and_get_samples, get_vcf_list
from seqr.utils.logging_utils import SeqrLogger
from seqr.utils.middleware import ErrorsWarningsException
Expand Down Expand Up @@ -244,10 +244,8 @@ def add_workspace_data(request, project_guid):
def _parse_uploaded_pedigree(request_json, project=None, search_dataset_type=None):
loaded_sample_types = [] if search_dataset_type else None
loaded_individual_ids = []
validate_expected_samples = get_loading_samples_validator(
request_json['vcfSamples'], loaded_individual_ids, loaded_sample_types=loaded_sample_types, sample_source='the pedigree file',
missing_family_samples_error='In order to load data for families with previously loaded data, new family samples must be joint called in a single VCF with all previously loaded samples. The following samples were previously loaded in this project but are missing from the VCF:\n',
)
def validate_expected_samples(*args):
return _validate_expected_samples(request_json['vcfSamples'], loaded_sample_types, loaded_individual_ids, *args)

json_records = load_uploaded_file(request_json['uploadedFileId'])
pedigree_records = parse_basic_pedigree_table(
Expand All @@ -258,6 +256,36 @@ def _parse_uploaded_pedigree(request_json, project=None, search_dataset_type=Non
return pedigree_records, loaded_individual_ids, loaded_sample_types[0] if loaded_sample_types else None


def _validate_expected_samples(vcf_samples, loaded_sample_types, loaded_individual_ids, record_family_ids, previous_loaded_individuals, sample_type):
errors = []

if loaded_sample_types is not None:
if sample_type:
loaded_sample_types.append(sample_type)
else:
errors.append('New data cannot be added to this project until the previously requested data is loaded')

missing_vcf_samples = set(record_family_ids.keys()) - set(vcf_samples)
if missing_vcf_samples:
errors.append(
f'The following samples are included in the pedigree file but are missing from the VCF: {", ".join(sorted(missing_vcf_samples))}',
)

missing_samples_by_family = get_missing_family_samples(vcf_samples, record_family_ids, previous_loaded_individuals)
if missing_samples_by_family:
missing_family_sample_messages = [
f'Family {family_id}: {", ".join(sorted(individual_ids))}'
for family_id, individual_ids in missing_samples_by_family.items()
]
errors.append('\n'.join([
'In order to load data for families with previously loaded data, new family samples must be joint called in a single VCF with all previously loaded samples. The following samples were previously loaded in this project but are missing from the VCF:',
] + sorted(missing_family_sample_messages)))

loaded_individual_ids += get_loaded_individual_ids(record_family_ids, previous_loaded_individuals)

return errors


def _trigger_add_workspace_data(project, pedigree_records, user, data_path, sample_type, previous_loaded_ids=None, get_pedigree_json=False):
# add families and individuals according to the uploaded individual records
pedigree_json, individual_ids = add_or_update_individuals_and_families(
Expand Down
89 changes: 56 additions & 33 deletions seqr/views/apis/data_manager_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

from clickhouse_search.search import delete_clickhouse_project
from seqr.utils.communication_utils import send_project_notification
from seqr.utils.search.add_data_utils import trigger_data_loading, get_loading_samples_validator, trigger_delete_families_search
from seqr.utils.search.add_data_utils import trigger_data_loading, get_missing_family_samples, get_loaded_individual_ids, trigger_delete_families_search
from seqr.utils.search.elasticsearch.es_utils import get_elasticsearch_status, delete_es_index
from seqr.utils.search.utils import clickhouse_only, es_only, InvalidSearchException
from seqr.utils.file_utils import file_iter
Expand Down Expand Up @@ -372,21 +372,54 @@ def _get_valid_search_individuals(project, airtable_samples, vcf_samples, datase
)
}

if airtable_samples:
missing_airtable_samples = {sample_id for sample_id in airtable_samples if sample_id not in search_individuals_by_id}
if missing_airtable_samples:
errors.append(
f'The following samples are included in airtable for {project.name} but are missing from seqr: {", ".join(missing_airtable_samples)}')

previous_loaded_individuals, record_family_ids, _ = get_validated_related_individuals(
project, search_individuals_by_id, errors, search_dataset_type=dataset_type, search_sample_type=sample_type,
add_missing_parents=False,
)

expected_sample_set = record_family_ids if airtable_samples else vcf_samples
missing_samples_by_family = get_missing_family_samples(expected_sample_set, record_family_ids, previous_loaded_individuals.values())
loading_samples = set(record_family_ids.keys())
get_sample_kwargs = {
'user': user, 'dataset_type': dataset_type, 'sample_type': sample_type, 'project_guid': project.guid,
}
if missing_samples_by_family and airtable_samples:
try:
additional_loaded_samples = {
sample['sample_id'] for sample in _get_dataset_type_samples_for_matched_pdos(
AVAILABLE_PDO_STATUSES, **get_sample_kwargs,
)
}
for missing_samples in missing_samples_by_family.values():
loading_samples.update(missing_samples.intersection(additional_loaded_samples))
missing_samples -= additional_loaded_samples
missing_samples_by_family = {
family_id: samples for family_id, samples in missing_samples_by_family.items() if samples
}
except ValueError as e:
errors.append(str(e))

sample_source = 'airtable' if airtable_samples else 'the vcf'
if missing_samples_by_family:
missing_family_sample_messages = [
f'Family {family_id}: {", ".join(sorted(individual_ids))}'
for family_id, individual_ids in missing_samples_by_family.items()
]
errors.append('\n'.join(
[f'The following families have previously loaded samples absent from {sample_source}'] +
sorted(missing_family_sample_messages)
))

vcf_sample_id_map = {}
if not airtable_samples:
fetch_missing_loaded_samples = None
fetch_missing_vcf_samples = None
sample_source = 'the vcf'
else:
get_sample_kwargs = {
'user': user, 'dataset_type': dataset_type, 'sample_type': sample_type, 'project_guid': project.guid,
}
fetch_missing_loaded_samples = lambda: {
sample['sample_id'] for sample in _get_dataset_type_samples_for_matched_pdos(
AVAILABLE_PDO_STATUSES, **get_sample_kwargs,
)
}
def fetch_missing_vcf_samples(missing_vcf_samples):
missing_vcf_samples = [] if vcf_samples is None else set(loading_samples - set(vcf_samples))
if missing_vcf_samples and airtable_samples:
try:
samples = _get_dataset_type_samples_for_matched_pdos(
LOADABLE_PDO_STATUSES + AVAILABLE_PDO_STATUSES, **get_sample_kwargs, sample_fields=['VCFIDWithMismatch'],
additional_sample_filters={'SeqrIDWithMismatch': sorted(missing_vcf_samples)},
Expand All @@ -395,25 +428,15 @@ def fetch_missing_vcf_samples(missing_vcf_samples):
s['sample_id']: s['VCFIDWithMismatch'] for s in samples
if s['sample_id'] in airtable_samples and s['VCFIDWithMismatch'] in vcf_samples
})
return vcf_sample_id_map.keys()
sample_source = 'airtable'

missing_airtable_samples = {sample_id for sample_id in airtable_samples if sample_id not in search_individuals_by_id}
if missing_airtable_samples:
errors.append(
f'The following samples are included in airtable for {project.name} but are missing from seqr: {", ".join(missing_airtable_samples)}')

loaded_individual_ids = []
validate_expected_samples = get_loading_samples_validator(
vcf_samples, loaded_individual_ids, sample_source=sample_source,
fetch_missing_loaded_samples=fetch_missing_loaded_samples, fetch_missing_vcf_samples=fetch_missing_vcf_samples,
missing_family_samples_error= f'The following families have previously loaded samples absent from {sample_source}\n',
)
missing_vcf_samples -= set(vcf_sample_id_map.keys())
except ValueError as e:
errors.append(str(e))
if missing_vcf_samples:
errors.append(
f'The following samples are included in {sample_source} but are missing from the VCF: {", ".join(sorted(missing_vcf_samples))}',
)

get_validated_related_individuals(
project, search_individuals_by_id, errors, search_dataset_type=dataset_type, search_sample_type=sample_type,
validate_expected_samples=validate_expected_samples, add_missing_parents=False,
)
loaded_individual_ids = get_loaded_individual_ids(record_family_ids, previous_loaded_individuals.values())

return [i['id'] for i in search_individuals_by_id.values()] + loaded_individual_ids, vcf_sample_id_map

Expand Down
2 changes: 1 addition & 1 deletion seqr/views/apis/data_manager_api_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -1958,8 +1958,8 @@ def _trigger_error(self, url, body, variables, mock_open, mock_gzip_open, mock_m
self.assertDictEqual(response.json(), {
'warnings': None,
'errors': [
'The following samples are included in airtable but are missing from the VCF: NA21987',
'The following families have previously loaded samples absent from airtable\nFamily fam14: NA21234, NA21654',
'The following samples are included in airtable but are missing from the VCF: NA21987',
],
})
self.assertEqual(len(responses.calls), 2)
Expand Down