15
15
16
16
from clickhouse_search .search import delete_clickhouse_project
17
17
from seqr .utils .communication_utils import send_project_notification
18
- from seqr .utils .search .add_data_utils import trigger_data_loading , get_loading_samples_validator , trigger_delete_families_search
18
+ from seqr .utils .search .add_data_utils import trigger_data_loading , get_missing_family_samples , get_loaded_individual_ids , trigger_delete_families_search
19
19
from seqr .utils .search .elasticsearch .es_utils import get_elasticsearch_status , delete_es_index
20
20
from seqr .utils .search .utils import clickhouse_only , es_only , InvalidSearchException
21
21
from seqr .utils .file_utils import file_iter
@@ -372,21 +372,54 @@ def _get_valid_search_individuals(project, airtable_samples, vcf_samples, datase
372
372
)
373
373
}
374
374
375
+ if airtable_samples :
376
+ missing_airtable_samples = {sample_id for sample_id in airtable_samples if sample_id not in search_individuals_by_id }
377
+ if missing_airtable_samples :
378
+ errors .append (
379
+ f'The following samples are included in airtable for { project .name } but are missing from seqr: { ", " .join (missing_airtable_samples )} ' )
380
+
381
+ previous_loaded_individuals , record_family_ids , _ = get_validated_related_individuals (
382
+ project , search_individuals_by_id , errors , search_dataset_type = dataset_type , search_sample_type = sample_type ,
383
+ add_missing_parents = False ,
384
+ )
385
+
386
+ expected_sample_set = record_family_ids if airtable_samples else vcf_samples
387
+ missing_samples_by_family = get_missing_family_samples (expected_sample_set , record_family_ids , previous_loaded_individuals .values ())
388
+ loading_samples = set (record_family_ids .keys ())
389
+ get_sample_kwargs = {
390
+ 'user' : user , 'dataset_type' : dataset_type , 'sample_type' : sample_type , 'project_guid' : project .guid ,
391
+ }
392
+ if missing_samples_by_family and airtable_samples :
393
+ try :
394
+ additional_loaded_samples = {
395
+ sample ['sample_id' ] for sample in _get_dataset_type_samples_for_matched_pdos (
396
+ AVAILABLE_PDO_STATUSES , ** get_sample_kwargs ,
397
+ )
398
+ }
399
+ for missing_samples in missing_samples_by_family .values ():
400
+ loading_samples .update (missing_samples .intersection (additional_loaded_samples ))
401
+ missing_samples -= additional_loaded_samples
402
+ missing_samples_by_family = {
403
+ family_id : samples for family_id , samples in missing_samples_by_family .items () if samples
404
+ }
405
+ except ValueError as e :
406
+ errors .append (str (e ))
407
+
408
+ sample_source = 'airtable' if airtable_samples else 'the vcf'
409
+ if missing_samples_by_family :
410
+ missing_family_sample_messages = [
411
+ f'Family { family_id } : { ", " .join (sorted (individual_ids ))} '
412
+ for family_id , individual_ids in missing_samples_by_family .items ()
413
+ ]
414
+ errors .append ('\n ' .join (
415
+ [f'The following families have previously loaded samples absent from { sample_source } ' ] +
416
+ sorted (missing_family_sample_messages )
417
+ ))
418
+
375
419
vcf_sample_id_map = {}
376
- if not airtable_samples :
377
- fetch_missing_loaded_samples = None
378
- fetch_missing_vcf_samples = None
379
- sample_source = 'the vcf'
380
- else :
381
- get_sample_kwargs = {
382
- 'user' : user , 'dataset_type' : dataset_type , 'sample_type' : sample_type , 'project_guid' : project .guid ,
383
- }
384
- fetch_missing_loaded_samples = lambda : {
385
- sample ['sample_id' ] for sample in _get_dataset_type_samples_for_matched_pdos (
386
- AVAILABLE_PDO_STATUSES , ** get_sample_kwargs ,
387
- )
388
- }
389
- def fetch_missing_vcf_samples (missing_vcf_samples ):
420
+ missing_vcf_samples = [] if vcf_samples is None else set (loading_samples - set (vcf_samples ))
421
+ if missing_vcf_samples and airtable_samples :
422
+ try :
390
423
samples = _get_dataset_type_samples_for_matched_pdos (
391
424
LOADABLE_PDO_STATUSES + AVAILABLE_PDO_STATUSES , ** get_sample_kwargs , sample_fields = ['VCFIDWithMismatch' ],
392
425
additional_sample_filters = {'SeqrIDWithMismatch' : sorted (missing_vcf_samples )},
@@ -395,25 +428,15 @@ def fetch_missing_vcf_samples(missing_vcf_samples):
395
428
s ['sample_id' ]: s ['VCFIDWithMismatch' ] for s in samples
396
429
if s ['sample_id' ] in airtable_samples and s ['VCFIDWithMismatch' ] in vcf_samples
397
430
})
398
- return vcf_sample_id_map .keys ()
399
- sample_source = 'airtable'
400
-
401
- missing_airtable_samples = {sample_id for sample_id in airtable_samples if sample_id not in search_individuals_by_id }
402
- if missing_airtable_samples :
403
- errors .append (
404
- f'The following samples are included in airtable for { project .name } but are missing from seqr: { ", " .join (missing_airtable_samples )} ' )
405
-
406
- loaded_individual_ids = []
407
- validate_expected_samples = get_loading_samples_validator (
408
- vcf_samples , loaded_individual_ids , sample_source = sample_source ,
409
- fetch_missing_loaded_samples = fetch_missing_loaded_samples , fetch_missing_vcf_samples = fetch_missing_vcf_samples ,
410
- missing_family_samples_error = f'The following families have previously loaded samples absent from { sample_source } \n ' ,
411
- )
431
+ missing_vcf_samples -= set (vcf_sample_id_map .keys ())
432
+ except ValueError as e :
433
+ errors .append (str (e ))
434
+ if missing_vcf_samples :
435
+ errors .append (
436
+ f'The following samples are included in { sample_source } but are missing from the VCF: { ", " .join (sorted (missing_vcf_samples ))} ' ,
437
+ )
412
438
413
- get_validated_related_individuals (
414
- project , search_individuals_by_id , errors , search_dataset_type = dataset_type , search_sample_type = sample_type ,
415
- validate_expected_samples = validate_expected_samples , add_missing_parents = False ,
416
- )
439
+ loaded_individual_ids = get_loaded_individual_ids (record_family_ids , previous_loaded_individuals .values ())
417
440
418
441
return [i ['id' ] for i in search_individuals_by_id .values ()] + loaded_individual_ids , vcf_sample_id_map
419
442
0 commit comments