Rework the multi-population r2 statistic to avoid NaNs

lkirk · lkirk · commit f5b0cc080c0c · 2025-08-04T04:28:41.000-05:00
diff --git a/c/tests/test_stats.c b/c/tests/test_stats.c
@@ -2637,10 +2637,13 @@ test_paper_ex_two_site(void)
 
     tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites,
         paper_ex_mutations, paper_ex_individuals, NULL, 0);
-    double truth_three_index_tuples[27] = { 1, 1, NAN, 0.1111111111111111,
-        0.1111111111111111, NAN, 0.1111111111111111, 0.1111111111111111, NAN,
-        0.1111111111111111, 0.1111111111111111, NAN, 1, 1, 1, 1, 1, 1,
-        0.1111111111111111, 0.1111111111111111, NAN, 1, 1, 1, 1, 1, 1 };
+    double truth_three_index_tuples[27] = { 1, 1, 0.71111111111111114,
+        0.1111111111111111, 0.1111111111111111, -0.0074074074074074042,
+        0.1111111111111111, 0.1111111111111111, -0.0074074074074074042,
+        0.1111111111111111, 0.1111111111111111, -0.0074074074074074042, 1, 1,
+        0.70833333333333326, 1, 1, 0.70833333333333326, 0.1111111111111111,
+        0.1111111111111111, -0.0074074074074074042, 1, 1, 0.70833333333333326, 1, 1,
+        0.70833333333333326 };
 
     tsk_size_t sample_set_sizes[3], num_index_tuples;
     tsk_id_t sample_sets[ts.num_samples * 3], index_tuples[2 * 3] = { 0, 1, 0, 0, 0, 2 };
diff --git a/c/tskit/trees.c b/c/tskit/trees.c
@@ -4941,37 +4941,38 @@ r2_ij_summary_func(tsk_size_t TSK_UNUSED(state_dim), const double *state,
     tsk_size_t result_dim, double *result, void *params)
 {
     sample_count_stat_params_t args = *(sample_count_stat_params_t *) params;
-    double n;
     const double *state_row;
     tsk_size_t k;
     tsk_id_t i, j;
-    double p_AB, p_Ab, p_aB, p_A, p_B, D_i, D_j, denom_i, denom_j;
+    double ni, w_AB_i, w_Ab_i, w_aB_i, w_A_i, w_B_i, D_i;
+    double nj, w_AB_j, w_Ab_j, w_aB_j, w_A_j, w_B_j, D_j;
+    double p_A, p_B;
 
     for (k = 0; k < result_dim; k++) {
         i = args.set_indexes[2 * k];
         j = args.set_indexes[2 * k + 1];
 
-        n = (double) args.sample_set_sizes[i];
+        ni = (double) args.sample_set_sizes[i];
         state_row = GET_2D_ROW(state, 3, i);
-        p_AB = state_row[0] / n;
-        p_Ab = state_row[1] / n;
-        p_aB = state_row[2] / n;
-        p_A = p_AB + p_Ab;
-        p_B = p_AB + p_aB;
-        D_i = p_AB - (p_A * p_B);
-        denom_i = sqrt(p_A * p_B * (1 - p_A) * (1 - p_B));
+        w_AB_i = state_row[0];
+        w_Ab_i = state_row[1];
+        w_aB_i = state_row[2];
+        w_A_i = w_AB_i + w_Ab_i;
+        w_B_i = w_AB_i + w_aB_i;
+        D_i = (ni * w_AB_i - (w_A_i * w_B_i)) / (ni * ni);
 
-        n = (double) args.sample_set_sizes[j];
+        nj = (double) args.sample_set_sizes[j];
         state_row = GET_2D_ROW(state, 3, j);
-        p_AB = state_row[0] / n;
-        p_Ab = state_row[1] / n;
-        p_aB = state_row[2] / n;
-        p_A = p_AB + p_Ab;
-        p_B = p_AB + p_aB;
-        D_j = p_AB - (p_A * p_B);
-        denom_j = sqrt(p_A * p_B * (1 - p_A) * (1 - p_B));
-
-        result[k] = (D_i * D_j) / (denom_i * denom_j);
+        w_AB_j = state_row[0];
+        w_Ab_j = state_row[1];
+        w_aB_j = state_row[2];
+        w_A_j = w_AB_j + w_Ab_j;
+        w_B_j = w_AB_j + w_aB_j;
+        D_j = (nj * w_AB_j - (w_A_j * w_B_j)) / (nj * nj);
+
+        p_A = (w_A_i + w_A_j) / (ni + nj);
+        p_B = (w_B_i + w_B_j) / (ni + nj);
+        result[k] = (D_i * D_j) / (p_A * (1 - p_A) * p_B * (1 - p_B));
     }
     return 0;
 }
diff --git a/python/tests/test_ld_matrix.py b/python/tests/test_ld_matrix.py
@@ -276,7 +276,7 @@ def norm_hap_weighted_ij(
         nj = sample_set_sizes[j]
         wAB_i = hap_weights[0, i]
         wAB_j = hap_weights[0, j]
-        result[k] = (wAB_i / ni / 2) + (wAB_j / nj / 2)
+        result[k] = (wAB_i + wAB_j) / (ni + nj)
 
 
 def norm_total_weighted(
@@ -1034,26 +1034,26 @@ def r2_ij_summary_func(
     for k in range(result_dim):
         i = set_indexes[k][0]
         j = set_indexes[k][1]
-        n = sample_set_sizes[i]
-        p_AB = state[0, i] / n
-        p_Ab = state[1, i] / n
-        p_aB = state[2, i] / n
-        p_A = p_AB + p_Ab
-        p_B = p_AB + p_aB
-        D_i = p_AB - (p_A * p_B)
-        denom_i = np.sqrt(p_A * p_B * (1 - p_A) * (1 - p_B))
-
-        n = sample_set_sizes[j]
-        p_AB = state[0, j] / n
-        p_Ab = state[1, j] / n
-        p_aB = state[2, j] / n
-        p_A = p_AB + p_Ab
-        p_B = p_AB + p_aB
-        D_j = p_AB - (p_A * p_B)
-        denom_j = np.sqrt(p_A * p_B * (1 - p_A) * (1 - p_B))
+        ni = sample_set_sizes[i]
+        w_AB_i = state[0, i]
+        w_Ab_i = state[1, i]
+        w_aB_i = state[2, i]
+        w_A_i = w_AB_i + w_Ab_i
+        w_B_i = w_AB_i + w_aB_i
+        D_i = (ni * w_AB_i - (w_A_i * w_B_i)) / (ni * ni)
 
+        nj = sample_set_sizes[j]
+        w_AB_j = state[0, j]
+        w_Ab_j = state[1, j]
+        w_aB_j = state[2, j]
+        w_A_j = w_AB_j + w_Ab_j
+        w_B_j = w_AB_j + w_aB_j
+        D_j = (nj * w_AB_j - (w_A_j * w_B_j)) / (nj * nj)
+
+        p_A = (w_A_i + w_A_j) / (ni + nj)
+        p_B = (w_B_i + w_B_j) / (ni + nj)
         with suppress_overflow_div0_warning():
-            result[k] = (D_i * D_j) / (denom_i * denom_j)
+            result[k] = (D_i * D_j) / (p_A * (1 - p_A) * p_B * (1 - p_B))
 
 
 def D_summary_func(
@@ -2298,3 +2298,110 @@ def test_two_way_site_ld_matrix(ts, stat):
         ld_matrix(ts, stat=stat, sample_sets=ss, indexes=[(0, 0), (0, 1), (1, 1)]),
         ts.ld_matrix(stat=stat, sample_sets=ss, indexes=[(0, 0), (0, 1), (1, 1)]),
     )
+
+
+@pytest.mark.parametrize(
+    "genotypes,sample_sets,expected",
+    [
+        (
+            # these genotypes are rows from a genotype matrix (sites x samples)
+            correlated := np.array(
+                [
+                    [0, 1, 1, 0, 2, 2, 1, 0, 2, 0, 1, 2],
+                    [1, 2, 2, 1, 0, 0, 2, 1, 0, 1, 2, 0],
+                ],
+            ),
+            (np.array([0, 1, 2, 3, 4, 5]), np.array([6, 7, 8, 9, 10, 11])),
+            np.float64(1.0),
+        ),
+        (
+            correlated,
+            (np.array([0, 1, 2, 3, 4, 5]), np.array([6, 7, 8, 9, 10])),
+            np.float64(0.9708352229780801),
+        ),
+        (
+            correlated,
+            (np.array([0, 1, 2, 3, 4, 5]), np.array([6, 7, 8, 9])),
+            np.float64(0.9526958931720837),
+        ),
+        (
+            correlated,
+            (np.array([0, 1, 2, 3, 4, 5]), np.array([6, 7, 8])),
+            np.float64(1.0),
+        ),
+        (
+            correlated,
+            (np.array([0, 1, 2, 3, 4, 5]), np.array([6, 7])),
+            np.float64(0.7585185185185186),
+        ),
+        (
+            correlated,
+            (np.array([0, 1, 2, 3, 4, 5]), np.array([6])),
+            np.float64(0.0),
+        ),
+        (
+            anticorrelated := np.array(
+                [
+                    [0, 0, 0, 0, 2, 2, 2, 2, 1, 1, 1, 1, 3, 3, 3, 3],
+                    [1, 1, 1, 1, 3, 3, 3, 3, 0, 0, 0, 0, 2, 2, 2, 2],
+                ]
+            ),
+            (
+                np.array([0, 2, 4, 6, 8, 10, 12, 14]),
+                np.array([1, 3, 5, 7, 9, 11, 13, 15]),
+            ),
+            np.float64(1.0),
+        ),
+        (
+            anticorrelated,
+            (np.array([0, 2, 4, 6, 8, 10, 12, 14]), np.array([1, 3, 5, 7, 9, 11, 13])),
+            np.float64(0.9798566895766568),
+        ),
+        (
+            anticorrelated,
+            (np.array([0, 2, 4, 6, 8, 10, 12, 14]), np.array([1, 3, 5, 7, 9, 11])),
+            np.float64(0.8574999999999999),
+        ),
+        (
+            anticorrelated,
+            (np.array([0, 2, 4, 6, 8, 10, 12, 14]), np.array([1, 3, 5, 7, 9])),
+            np.float64(0.8299777777777777),
+        ),
+        (
+            anticorrelated,
+            (np.array([0, 2, 4, 6, 8, 10, 12, 14]), np.array([1, 3, 5, 7])),
+            np.float64(0.6328124999999999),
+        ),
+        (
+            anticorrelated,
+            (np.array([0, 2, 4, 6, 8, 10, 12, 14]), np.array([1, 3, 5])),
+            np.float64(0.57179616638322),
+        ),
+        (
+            anticorrelated,
+            (np.array([0, 2, 4, 6, 8, 10, 12, 14]), np.array([1, 3])),
+            np.float64(0.0),
+        ),
+        (
+            anticorrelated,
+            (np.array([0, 2, 4, 6, 8, 10, 12, 14]), np.array([1])),
+            np.float64(0.0),
+        ),
+    ],
+)
+def test_multipopulation_r2_varying_unequal_set_sizes(genotypes, sample_sets, expected):
+    a, b = genotypes  # per-sample allele codes at the first and second site
+    state_dim = len(sample_sets)  # one state column per sample set
+    state = np.zeros((3, state_dim), dtype=int)  # rows hold AB, Ab, aB counts
+    result = np.zeros((max(a) + 1, max(b) + 1, 1))  # one r2 value per allele pair
+    norm = np.zeros_like(result)  # weight applied to each allele pair's r2
+    params = dict(sample_set_sizes=list(map(len, sample_sets)), set_indexes=[(0, 1)])
+    for i, j in np.ndindex(result.shape[:2]):  # visit every allele pair (i, j)
+        for k, ss in enumerate(sample_sets):
+            A = a[ss] == i  # mask: members of set k with allele i in row a
+            B = b[ss] == j  # mask: members of set k with allele j in row b
+            state[:, k] = (A & B).sum(), (A & ~B).sum(), (~A & B).sum()
+        r2_ij_summary_func(state_dim, state, 1, result[i, j], params)
+        norm_hap_weighted_ij(1, state, max(a) + 1, max(b) + 1, norm[i, j], params)
+
+    np.testing.assert_allclose(expected, (result * norm).sum())  # weighted total