Skip to content

Commit e43e99c

Browse files
committed
better checks for duplicate uids in input
closes #335
1 parent c837136 commit e43e99c

File tree

5 files changed

+38
-2
lines changed

5 files changed

+38
-2
lines changed

bin/partis

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -777,6 +777,7 @@ def run_all_loci(args, ig_or_tr='ig'):
777777
utils.replace_in_arglist(clist, '--n-trees', str(get_n_events(lpair[1], utils.non_none([args.n_trees, args.n_sim_events]))), insert_after='--generate-trees')
778778
else:
779779
utils.insert_in_arglist(clist, ['--locus', ltmp], args.action)
780+
clist.append('--crash-on-duplicate-uids') # single-chain subprocesses in paired mode should crash on duplicates
780781
if args.action == 'simulate':
781782
clist += ['--choose-trees-in-order', '--outfname', getofn(ltmp, lpair=lpair)]
782783
if args.input_simulation_treefname is None:
@@ -1677,6 +1678,7 @@ parent_args.append({'name' : '--pair-unpaired-seqs-with-paired-family', 'kwargs'
16771678
parent_args.append({'name' : '--add-unpaired-seqs-to-fake-paired-annotations', 'kwargs' : {'action' : 'store_true', 'help' : 'when making the fake h+l "paired" annotation (i.e. smashing h+l seqs together) for use in selection metrics and/or partition plotting, by default we ignore unpaired seqs. This option includes them, with Ns for the missing opposite-chain sequence.'}})
16781679
parent_args.append({'name' : '--keep-all-unpaired-seqs', 'kwargs' : {'action' : 'store_true', 'help' : 'By default when paired clustering, seqs with no pair info are kept only if they\'re in a (single-chain) family with at least one paired seq (i.e. families consisting entirely of unpaired seqs are discarded). If this is set, instead we keep all unpaired seqs.'}})
16791680
parent_args.append({'name' : '--ignore-sw-pair-info', 'kwargs' : {'action' : 'store_true', 'help' : 'If we already have paired clustering results and we\'re reading them for a subsequent step (for instance to get annotations) that is also reading sw cache info, we usually want to ignore the pair info in the sw cache file, since it is pre-cleaning/uncorrected. This argument instructs it to do that. Note that even if this argument isn\'t set, it should correctly figure out to overwrite the sw pair info with the correct pair info, but it\'s better/safer to also not read the sw pair info to begin with.'}})
1681+
parent_args.append({'name' : '--crash-on-duplicate-uids', 'kwargs' : {'action' : 'store_true', 'help' : 'Crash if duplicate UIDs are found in input files (instead of renaming them with suffixes like -2, -3, etc). This is automatically set by the paired loci parent process when calling single-chain subprocesses, since renaming breaks pairing info. Not intended for direct use.'}})
16801682

16811683
parent_args.append({'name' : '--max-ccf-fail-frac', 'kwargs' : {'type' : float, 'default' : 0.05, 'help' : 'when calculating clustering performance metrics (correct cluster fractions, purity/completeness), crash if more than this fraction of sequences are missing from the inferred partition'}})
16821684

bin/split-loci.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,9 @@ def lpstr(spair):
230230
print('totals: %s%s' % (' '.join(('%s %d'%(l, len(sfos))) for l, sfos in outfos.items()), '' if len(failed_seqs) == 0 else ' (%s: %d)'%(utils.color('yellow', 'failed'), len(failed_seqs))))
231231
assert sum(len(ofo) for ofo in outfos.values()) + len(failed_seqs) == len(seqfos)
232232

233+
# Check for UIDs appearing in multiple loci (will break later pair info processing)
234+
paircluster.check_for_cross_locus_duplicates({locus : [sfo['name'] for sfo in sfos] for locus, sfos in outfos.items()})
235+
233236
if args.guess_pairing_info:
234237
if len(paired_uids) > 0:
235238
raise Exception('can\'t/shouldn\'t guess pairing info if we already have it from elsewhere')

docs/install.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,8 @@ Alternatively, you can install from source if you think you'll want to modify th
3333
git clone https://github.com/psathyrella/partis.git
3434
cd partis
3535

36-
# Initialize essential submodule
37-
git submodule update --init packages/ham
36+
# Initialize essential submodules
37+
git submodule update --init packages/ham packages/ig-sw
3838

3939
# Create virtual environment and install
4040
python -m venv .venv

partis/paircluster.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,28 @@ def clean_paired_dir(bdir, suffix='.fa', extra_files=None, expect_missing=False,
8181
fnames = extra_files + fnames # put 'em at the start, since presumably they're actual files, not dirs
8282
utils.clean_files(fnames, expect_missing=expect_missing)
8383

84+
# ----------------------------------------------------------------------------------------
85+
def check_for_cross_locus_duplicates(uid_lists_by_locus, extra_str=''):
    """Check that no UID appears in more than one locus.

    Each locus is recorded at most once per UID, so a UID that is merely repeated within a
    single locus's collection is not reported as a cross-locus duplicate.

    Args:
        uid_lists_by_locus: dict mapping locus name to an iterable (list/set) of UIDs for that locus
        extra_str: optional string to prepend to the error message

    Raises:
        Exception: if any UID is found in two or more different loci; the message lists each
            such UID (in sorted order) together with the loci it was found in.
    """
    uid_to_loci = {}  # uid -> list of distinct loci it was seen in (in order of first appearance)
    for locus, uids in uid_lists_by_locus.items():
        for uid in uids:
            loci = uid_to_loci.setdefault(uid, [])
            if locus not in loci:  # don't let within-locus repeats masquerade as cross-locus duplicates
                loci.append(locus)

    cross_locus_dups = {u : loci for u, loci in uid_to_loci.items() if len(loci) > 1}
    if len(cross_locus_dups) == 0:
        return

    err_str = '%sfound %d uid%s in multiple loci (which will cause hard to track crashes):\n' % (extra_str, len(cross_locus_dups), utils.plural(len(cross_locus_dups)))
    for uid, loci in sorted(cross_locus_dups.items()):
        err_str += ' %s: in %s\n' % (uid, ', '.join(loci))
    raise Exception(err_str)
105+
84106
# ----------------------------------------------------------------------------------------
85107
# add_selection_metrics: list of selection metrics to add (plotdir_fcn is atm only if adding selection metrics)
86108
def read_locus_output_files(tmploci, ofn_fcn, lpair=None, read_selection_metrics=False, add_selection_metrics=None, lb_tau=None, plotdir_fcn=None, seed_unique_id=None, dont_add_implicit_info=False, dbgstr='', debug=False):
@@ -134,6 +156,12 @@ def read_smetrics(ofn, atnlist):
134156
for smetric in add_selection_metrics:
135157
treeutils.calculate_individual_tree_metrics(smetric, lpfos['antn_lists'][ltmp], base_plotdir=None if plotdir_fcn is None else plotdir_fcn(ltmp, lpair=lpair), lb_tau=lb_tau)
136158
parse_pairing_info(ltmp, lpfos['antn_lists'][ltmp])
159+
160+
# Check for UIDs that appear in multiple loci (NOTE this is also checked earlier in split-loci.py, but we keep it here as a safety check)
161+
uid_lists = {ltmp : [uid for atn in lpfos['antn_lists'][ltmp] for uid in atn['unique_ids']]
162+
for ltmp in tmploci if lpfos['antn_lists'][ltmp] is not None}
163+
check_for_cross_locus_duplicates(uid_lists)
164+
137165
if all(lpfos['glfos'][l] is None for l in tmploci): # if there was no info for *any* of the loci, set Nones one level up (it's just easier to have the Nones there)
138166
lpfos = {k : None for k in lpfos}
139167
return lpfos

partis/seqfileopener.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -274,6 +274,9 @@ def read_sequence_file(infname, is_data, n_max_queries=-1, args=None, simglfo=No
274274
already_printed_forbidden_character_warning = True
275275
uid = uid.translate(utils.forbidden_character_translations)
276276
if uid in input_info:
277+
# Crash on duplicates if requested (typically set by paired loci parent process for single-chain subprocesses)
278+
if args is not None and args.crash_on_duplicate_uids:
279+
raise Exception('Found duplicate UID \'%s\' in %s. Cannot handle duplicate UIDs since pairing info references the original names. Please remove duplicates from input files.' % (uid, infname))
277280
uid, n_duplicate_uids = utils.choose_non_dup_id(uid, n_duplicate_uids, input_info)
278281
inseq = line['input_seqs'][0]
279282

0 commit comments

Comments
 (0)