Skip to content

Commit c91169a

Browse files
committed
Fix: genetic_relatedness to allow single sample set with self-comparisons
Previously, genetic_relatedness would fail with TSK_ERR_INSUFFICIENT_SAMPLE_SETS when given a single sample set and indexes referring to that set, e.g.: ts.genetic_relatedness([[0]], indexes=[(0,0)]) This was because the C API requires at least k=2 sample sets for k-way statistics, even when indexes only reference a single set for self-comparison. The fix adds an allow_self_comparisons parameter to __k_way_sample_set_stat, which is set to True only for genetic_relatedness. When enabled and indexes only reference existing sample sets, dummy sample sets are added to satisfy the C API requirement while ensuring they are never accessed during computation. Tests added for all three modes (site, branch, node) to verify self-comparisons work correctly with single sample sets. Fixes #3055
1 parent 3c0a99a commit c91169a

File tree

2 files changed

+55
-0
lines changed

2 files changed

+55
-0
lines changed

python/tests/test_tree_stats.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2311,6 +2311,16 @@ def test_shapes(self, proportion):
23112311
class TestBranchGeneticRelatedness(TestGeneticRelatedness, TopologyExamplesMixin):
23122312
mode = "branch"
23132313

2314+
def test_single_sample_set_self_comparison(self):
2315+
# Test for issue #3055 - self-comparisons with single sample set
2316+
ts = msprime.simulate(sample_size=10, Ne=10000, length=1000, random_seed=42)
2317+
# Single sample set with self-comparison
2318+
result = ts.genetic_relatedness([[0]], indexes=[(0, 0)], mode="branch")
2319+
assert result.shape == (1,)
2320+
# Should work for multiple samples in single set too
2321+
result = ts.genetic_relatedness([[0, 1, 2]], indexes=[(0, 0)], mode="branch")
2322+
assert result.shape == (1,)
2323+
23142324
@pytest.mark.parametrize("polarised", [True, False])
23152325
def test_simple_tree_noncentred(self, polarised):
23162326
# 2.00┊ 4 ┊
@@ -2365,10 +2375,35 @@ def test_simple_tree_noncentred(self, polarised):
23652375
class TestNodeGeneticRelatedness(TestGeneticRelatedness, TopologyExamplesMixin):
23662376
mode = "node"
23672377

2378+
def test_single_sample_set_self_comparison(self, ts_12_highrecomb_fixture):
2379+
# Test for issue #3055 - self-comparisons with single sample set
2380+
ts = ts_12_highrecomb_fixture
2381+
# Single sample set with self-comparison
2382+
result = ts.genetic_relatedness([[0]], indexes=[(0, 0)], mode="node")
2383+
assert result.shape == (ts.num_nodes, 1)
2384+
# Should work for multiple samples in single set too
2385+
result = ts.genetic_relatedness([[0, 1, 2]], indexes=[(0, 0)], mode="node")
2386+
assert result.shape == (ts.num_nodes, 1)
2387+
23682388

23692389
class TestSiteGeneticRelatedness(TestGeneticRelatedness, MutatedTopologyExamplesMixin):
23702390
mode = "site"
23712391

2392+
def test_single_sample_set_self_comparison(self, ts_12_highrecomb_fixture):
2393+
# Test for issue #3055 - self-comparisons with single sample set
2394+
ts = ts_12_highrecomb_fixture
2395+
# Single sample set with self-comparison
2396+
result = ts.genetic_relatedness([[0]], indexes=[(0, 0)], mode="site")
2397+
assert result.shape == (1,)
2398+
# Should work for multiple samples in single set too
2399+
result = ts.genetic_relatedness([[0, 1, 2]], indexes=[(0, 0)], mode="site")
2400+
assert result.shape == (1,)
2401+
# Test with multiple self-comparisons
2402+
result = ts.genetic_relatedness(
2403+
[[0], [1]], indexes=[(0, 0), (1, 1)], mode="site"
2404+
)
2405+
assert result.shape == (2,)
2406+
23722407
def test_match_K_c0(self):
23732408
# This test checks that ts.genetic_relatedness() matches K_c0
23742409
# from Speed & Balding (2014) https://www.nature.com/articles/nrg3821

python/tskit/trees.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8106,6 +8106,7 @@ def __k_way_sample_set_stat(
81068106
span_normalise=True,
81078107
polarised=False,
81088108
centre=True,
8109+
allow_self_comparisons=False,
81098110
):
81108111
sample_set_sizes = np.array(
81118112
[len(sample_set) for sample_set in sample_sets], dtype=np.uint32
@@ -8132,6 +8133,24 @@ def __k_way_sample_set_stat(
81328133
"Indexes must be convertable to a 2D numpy array with {} "
81338134
"columns".format(k)
81348135
)
8136+
# For genetic_relatedness, we allow self-comparisons with a single sample set
8137+
if allow_self_comparisons and len(sample_sets) < k:
8138+
# Check that all indexes are valid
8139+
if np.any(indexes >= len(sample_sets)) or np.any(indexes < 0):
8140+
raise ValueError("Index out of bounds")
8141+
# Find which sample sets we actually need
8142+
unique_indexes = np.unique(indexes)
8143+
if np.max(unique_indexes) < len(sample_sets):
8144+
# we need to pad with dummy sets to satisfy the C side
8145+
# requirement of having at least k sets
8146+
sample_sets = list(sample_sets)
8147+
sample_set_sizes = list(sample_set_sizes)
8148+
while len(sample_sets) < k:
8149+
# Add a dummy sample set that won't be used
8150+
sample_sets.append(np.array([sample_sets[0][0]], dtype=np.int32))
8151+
sample_set_sizes.append(1)
8152+
sample_set_sizes = np.array(sample_set_sizes, dtype=np.uint32)
8153+
flattened = util.safe_np_int_cast(np.hstack(sample_sets), np.int32)
81358154
stat = self.__run_windowed_stat(
81368155
windows,
81378156
ll_method,
@@ -8702,6 +8721,7 @@ def genetic_relatedness(
87028721
span_normalise=span_normalise,
87038722
polarised=polarised,
87048723
centre=centre,
8724+
allow_self_comparisons=True,
87058725
)
87068726
if proportion:
87078727
# TODO this should be done in C also

0 commit comments

Comments
 (0)