better checks for duplicate uids in input

psathyrella · psathyrella · commit e43e99c755b4 · 2026-01-22T09:24:41.000-08:00
closes #335
diff --git a/bin/partis b/bin/partis
@@ -777,6 +777,7 @@ def run_all_loci(args, ig_or_tr='ig'):
                 utils.replace_in_arglist(clist, '--n-trees', str(get_n_events(lpair[1], utils.non_none([args.n_trees, args.n_sim_events]))), insert_after='--generate-trees')
             else:
                 utils.insert_in_arglist(clist, ['--locus', ltmp], args.action)
+                clist.append('--crash-on-duplicate-uids')  # single-chain subprocesses in paired mode should crash on duplicates
                 if args.action == 'simulate':
                     clist += ['--choose-trees-in-order', '--outfname', getofn(ltmp, lpair=lpair)]
                     if args.input_simulation_treefname is None:
@@ -1677,6 +1678,7 @@ parent_args.append({'name' : '--pair-unpaired-seqs-with-paired-family', 'kwargs'
 parent_args.append({'name' : '--add-unpaired-seqs-to-fake-paired-annotations', 'kwargs' : {'action' : 'store_true', 'help' : 'when making the fake h+l "paired" annotation (i.e. smashing h+l seqs together) for use in selection metrics and/or partition plotting, by default we ignore unpaired seqs. This option includes them, with Ns for the missing opposite-chain sequence.'}})
 parent_args.append({'name' : '--keep-all-unpaired-seqs', 'kwargs' : {'action' : 'store_true', 'help' : 'By default when paired clustering, seqs with no pair info are kept only if they\'re in a (single-chain) family with at least one paired seq (i.e. families consisting entirely of unpaired seqs are discarded). If this is set, instead we keep all unpaired seqs.'}})
 parent_args.append({'name' : '--ignore-sw-pair-info', 'kwargs' : {'action' : 'store_true', 'help' : 'If we already have paired clustering results and we\'re reading them for a subsequent step (for instance to get annotations) that is also reading sw cache info, we usually want to ignore the pair info in the sw cache file, since it is pre-cleaning/uncorrected. This argument instructs it to do that. Note that even if this argument isn\'t set, it should correctly figure out to overwrite the sw pair info with the correct pair info, but it\'s better/safer to also not read the sw pair info to begin with.'}})
+parent_args.append({'name' : '--crash-on-duplicate-uids', 'kwargs' : {'action' : 'store_true', 'help' : 'Crash if duplicate UIDs are found in input files (instead of renaming them with suffixes like -2, -3, etc). This is automatically set by the paired loci parent process when calling single-chain subprocesses, since renaming breaks pairing info. Not intended for direct use.'}})
 
 parent_args.append({'name' : '--max-ccf-fail-frac', 'kwargs' : {'type' : float, 'default' : 0.05, 'help' : 'when calculating clustering performance metrics (correct cluster fractions, purity/completeness), crash if more than this fraction of sequences are missing from the inferred partition'}})
 
diff --git a/bin/split-loci.py b/bin/split-loci.py
@@ -230,6 +230,9 @@ def lpstr(spair):
 print('totals: %s%s' % (' '.join(('%s %d'%(l, len(sfos))) for l, sfos in outfos.items()), '' if len(failed_seqs) == 0 else ' (%s: %d)'%(utils.color('yellow', 'failed'), len(failed_seqs))))
 assert sum(len(ofo) for ofo in outfos.values()) + len(failed_seqs) == len(seqfos)
 
+# Check for UIDs appearing in multiple loci (will break later pair info processing)
+paircluster.check_for_cross_locus_duplicates({locus : [sfo['name'] for sfo in sfos] for locus, sfos in outfos.items()})
+
 if args.guess_pairing_info:
     if len(paired_uids) > 0:
         raise Exception('can\'t/shouldn\'t guess pairing info if we already have it from elsewhere')
diff --git a/docs/install.md b/docs/install.md
@@ -33,8 +33,8 @@ Alternatively, you can install from source if you think you'll want to modify th
 git clone https://github.com/psathyrella/partis.git
 cd partis
 
-# Initialize essential submodule
-git submodule update --init packages/ham
+# Initialize essential submodules
+git submodule update --init packages/ham packages/ig-sw
 
 # Create virtual environment and install
 python -m venv .venv
diff --git a/partis/paircluster.py b/partis/paircluster.py
@@ -81,6 +81,28 @@ def clean_paired_dir(bdir, suffix='.fa', extra_files=None, expect_missing=False,
         fnames = extra_files + fnames  # put 'em at the start, since presumably they're actual files, not dirs
     utils.clean_files(fnames, expect_missing=expect_missing)
 
+# ----------------------------------------------------------------------------------------
+def check_for_cross_locus_duplicates(uid_lists_by_locus, extra_str=''):
+    """Check that no UID appears in more than one locus.
+
+    A UID repeated within one locus is not a cross-locus duplicate and is ignored; raises Exception listing each offending UID with its loci.
+
+    Args:
+        uid_lists_by_locus: dict mapping locus name to list/set of UIDs for that locus
+        extra_str: optional string to prepend to error message
+    """
+    uid_to_loci = {}
+    for locus, uids in uid_lists_by_locus.items():
+        for uid in set(uids):  # collapse within-locus repeats so each locus is recorded at most once per uid
+            uid_to_loci.setdefault(uid, []).append(locus)
+
+    cross_locus_dups = {u : loci for u, loci in uid_to_loci.items() if len(loci) > 1}
+    if len(cross_locus_dups) > 0:
+        err_str = '%sfound %d uid%s in multiple loci (which will cause hard to track crashes):\n' % (extra_str, len(cross_locus_dups), utils.plural(len(cross_locus_dups)))
+        for uid, loci in sorted(cross_locus_dups.items()):
+            err_str += '  %s: in %s\n' % (uid, ', '.join(loci))
+        raise Exception(err_str)
+
 # ----------------------------------------------------------------------------------------
 # add_selection_metrics: list of selection metrics to add (plotdir_fcn is atm only if adding selection metrics)
 def read_locus_output_files(tmploci, ofn_fcn, lpair=None, read_selection_metrics=False, add_selection_metrics=None, lb_tau=None, plotdir_fcn=None, seed_unique_id=None, dont_add_implicit_info=False, dbgstr='', debug=False):
@@ -134,6 +156,12 @@ def read_smetrics(ofn, atnlist):
             for smetric in add_selection_metrics:
                 treeutils.calculate_individual_tree_metrics(smetric, lpfos['antn_lists'][ltmp], base_plotdir=None if plotdir_fcn is None else plotdir_fcn(ltmp, lpair=lpair), lb_tau=lb_tau)
         parse_pairing_info(ltmp, lpfos['antn_lists'][ltmp])
+
+    # Check for UIDs that appear in multiple loci (NOTE this is also checked earlier in split-loci.py, but we keep it here as a safety check)
+    uid_lists = {ltmp : [uid for atn in lpfos['antn_lists'][ltmp] for uid in atn['unique_ids']]
+                 for ltmp in tmploci if lpfos['antn_lists'][ltmp] is not None}
+    check_for_cross_locus_duplicates(uid_lists)
+
     if all(lpfos['glfos'][l] is None for l in tmploci):  # if there was no info for *any* of the loci, set Nones one level up (it's just easier to have the Nones there)
         lpfos = {k : None for k in lpfos}
     return lpfos
diff --git a/partis/seqfileopener.py b/partis/seqfileopener.py
@@ -274,6 +274,9 @@ def read_sequence_file(infname, is_data, n_max_queries=-1, args=None, simglfo=No
                 already_printed_forbidden_character_warning = True
             uid = uid.translate(utils.forbidden_character_translations)
         if uid in input_info:
+            # Crash on duplicates if requested (typically set by paired loci parent process for single-chain subprocesses)
+            if args is not None and args.crash_on_duplicate_uids:
+                raise Exception('Found duplicate UID \'%s\' in %s. Cannot handle duplicate UIDs since pairing info references the original names. Please remove duplicates from input files.' % (uid, infname))
             uid, n_duplicate_uids = utils.choose_non_dup_id(uid, n_duplicate_uids, input_info)
         inseq = line['input_seqs'][0]