Revert changes to r2_ij; perhaps revisit these at another point

lkirk · lkirk · commit 3c596ea19478 · 2025-08-07T20:10:29.000-05:00
diff --git a/c/tests/test_stats.c b/c/tests/test_stats.c
@@ -2637,11 +2637,10 @@ test_paper_ex_two_site(void)
 
     tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites,
         paper_ex_mutations, paper_ex_individuals, NULL, 0);
-    double truth_three_index_tuples[27] = { 1, 1, 0, 0.1111111111111111,
-        0.1111111111111111, 0, 0.1111111111111111, 0.1111111111111111, 0,
-        0.1111111111111111, 0.1111111111111111, 0, 1, 1, 0.94921874999999978, 1, 1,
-        0.94921874999999978, 0.1111111111111111, 0.1111111111111111, 0, 1, 1,
-        0.94921874999999978, 1, 1, 0.94921874999999978 };
+    double truth_three_index_tuples[27] = { 1, 1, NAN, 0.1111111111111111,
+        0.1111111111111111, NAN, 0.1111111111111111, 0.1111111111111111, NAN,
+        0.1111111111111111, 0.1111111111111111, NAN, 1, 1, 1, 1, 1, 1,
+        0.1111111111111111, 0.1111111111111111, NAN, 1, 1, 1, 1, 1, 1 };
 
     tsk_size_t sample_set_sizes[3], num_index_tuples;
     tsk_id_t sample_sets[ts.num_samples * 3], index_tuples[2 * 3] = { 0, 1, 0, 0, 0, 2 };
diff --git a/c/tskit/trees.c b/c/tskit/trees.c
@@ -4944,35 +4944,33 @@ r2_ij_summary_func(tsk_size_t TSK_UNUSED(state_dim), const double *state,
     const double *state_row;
     tsk_size_t k;
     tsk_id_t i, j;
-    double ni, w_AB_i, w_Ab_i, w_aB_i, w_A_i, w_B_i, D_i;
-    double nj, w_AB_j, w_Ab_j, w_aB_j, w_A_j, w_B_j, D_j;
-    double p_A, p_B;
+    double n, pAB, pAb, paB, pA, pB, D_i, D_j, denom_i, denom_j;
 
     for (k = 0; k < result_dim; k++) {
         i = args.set_indexes[2 * k];
         j = args.set_indexes[2 * k + 1];
 
-        ni = (double) args.sample_set_sizes[i];
+        n = (double) args.sample_set_sizes[i];
         state_row = GET_2D_ROW(state, 3, i);
-        w_AB_i = state_row[0];
-        w_Ab_i = state_row[1];
-        w_aB_i = state_row[2];
-        w_A_i = w_AB_i + w_Ab_i;
-        w_B_i = w_AB_i + w_aB_i;
-        D_i = (ni * w_AB_i - (w_A_i * w_B_i)) / (ni * ni);
+        pAB = state_row[0] / n;
+        pAb = state_row[1] / n;
+        paB = state_row[2] / n;
+        pA = pAB + pAb;
+        pB = pAB + paB;
+        D_i = pAB - (pA * pB);
+        denom_i = sqrt(pA * (1 - pA) * pB * (1 - pB));
 
-        nj = (double) args.sample_set_sizes[j];
+        n = (double) args.sample_set_sizes[j];
         state_row = GET_2D_ROW(state, 3, j);
-        w_AB_j = state_row[0];
-        w_Ab_j = state_row[1];
-        w_aB_j = state_row[2];
-        w_A_j = w_AB_j + w_Ab_j;
-        w_B_j = w_AB_j + w_aB_j;
-        D_j = (nj * w_AB_j - (w_A_j * w_B_j)) / (nj * nj);
-
-        p_A = (w_A_i + w_A_j) / (ni + nj);
-        p_B = (w_B_i + w_B_j) / (ni + nj);
-        result[k] = (D_i * D_j) / (p_A * (1 - p_A) * p_B * (1 - p_B));
+        pAB = state_row[0] / n;
+        pAb = state_row[1] / n;
+        paB = state_row[2] / n;
+        pA = pAB + pAb;
+        pB = pAB + paB;
+        D_j = pAB - (pA * pB);
+        denom_j = sqrt(pA * (1 - pA) * pB * (1 - pB));
+
+        result[k] = (D_i * D_j) / (denom_i * denom_j);
     }
     return 0;
 }
diff --git a/python/tests/test_ld_matrix.py b/python/tests/test_ld_matrix.py
@@ -277,6 +277,7 @@ def norm_hap_weighted_ij(
         wAB_i = hap_weights[0, i]
         wAB_j = hap_weights[0, j]
         result[k] = (wAB_i + wAB_j) / (ni + nj)
+        # result[k] = (wAB_i / ni / 2) + (wAB_j / nj / 2)
 
 
 def norm_total_weighted(
@@ -1034,26 +1035,26 @@ def r2_ij_summary_func(
     for k in range(result_dim):
         i = set_indexes[k][0]
         j = set_indexes[k][1]
-        ni = sample_set_sizes[i]
-        w_AB_i = state[0, i]
-        w_Ab_i = state[1, i]
-        w_aB_i = state[2, i]
-        w_A_i = w_AB_i + w_Ab_i
-        w_B_i = w_AB_i + w_aB_i
-        D_i = (ni * w_AB_i - (w_A_i * w_B_i)) / (ni * ni)
+        n = sample_set_sizes[i]
+        pAB = state[0, i] / n
+        pAb = state[1, i] / n
+        paB = state[2, i] / n
+        pA = pAB + pAb
+        pB = pAB + paB
+        D_i = pAB - pA * pB
+        denom_i = np.sqrt(pA * (1 - pA) * pB * (1 - pB))
+
+        n = sample_set_sizes[j]
+        pAB = state[0, j] / n
+        pAb = state[1, j] / n
+        paB = state[2, j] / n
+        pA = pAB + pAb
+        pB = pAB + paB
+        D_j = pAB - pA * pB
+        denom_j = np.sqrt(pA * (1 - pA) * pB * (1 - pB))
 
-        nj = sample_set_sizes[j]
-        w_AB_j = state[0, j]
-        w_Ab_j = state[1, j]
-        w_aB_j = state[2, j]
-        w_A_j = w_AB_j + w_Ab_j
-        w_B_j = w_AB_j + w_aB_j
-        D_j = (nj * w_AB_j - (w_A_j * w_B_j)) / (nj * nj)
-
-        p_A = (w_A_i + w_A_j) / (ni + nj)
-        p_B = (w_B_i + w_B_j) / (ni + nj)
         with suppress_overflow_div0_warning():
-            result[k] = (D_i * D_j) / (p_A * (1 - p_A) * p_B * (1 - p_B))
+            result[k] = (D_i * D_j) / (denom_i * denom_j)
 
 
 def D_summary_func(
@@ -2317,12 +2318,12 @@ def test_two_way_site_ld_matrix(ts, stat):
         (
             correlated,
             (np.array([0, 1, 2, 3, 4, 5]), np.array([6, 7, 8, 9, 10])),
-            np.float64(0.9708352229780801),
+            np.float64(1.0),
         ),
         (
             correlated,
             (np.array([0, 1, 2, 3, 4, 5]), np.array([6, 7, 8, 9])),
-            np.float64(0.9526958931720837),
+            np.float64(1.0),
         ),
         (
             correlated,
@@ -2332,12 +2333,12 @@ def test_two_way_site_ld_matrix(ts, stat):
         (
             correlated,
             (np.array([0, 1, 2, 3, 4, 5]), np.array([6, 7])),
-            np.float64(0.7585185185185186),
+            np.float64(np.nan),
         ),
         (
             correlated,
             (np.array([0, 1, 2, 3, 4, 5]), np.array([6])),
-            np.float64(0.0),
+            np.float64(np.nan),
         ),
         (
             anticorrelated := np.array(
@@ -2355,37 +2356,37 @@ def test_two_way_site_ld_matrix(ts, stat):
         (
             anticorrelated,
             (np.array([0, 2, 4, 6, 8, 10, 12, 14]), np.array([1, 3, 5, 7, 9, 11, 13])),
-            np.float64(0.9798566895766568),
+            np.float64(1.0),
         ),
         (
             anticorrelated,
             (np.array([0, 2, 4, 6, 8, 10, 12, 14]), np.array([1, 3, 5, 7, 9, 11])),
-            np.float64(0.8574999999999999),
+            np.float64(np.nan),
         ),
         (
             anticorrelated,
             (np.array([0, 2, 4, 6, 8, 10, 12, 14]), np.array([1, 3, 5, 7, 9])),
-            np.float64(0.8299777777777777),
+            np.float64(np.nan),
         ),
         (
             anticorrelated,
             (np.array([0, 2, 4, 6, 8, 10, 12, 14]), np.array([1, 3, 5, 7])),
-            np.float64(0.6328124999999999),
+            np.float64(np.nan),
         ),
         (
             anticorrelated,
             (np.array([0, 2, 4, 6, 8, 10, 12, 14]), np.array([1, 3, 5])),
-            np.float64(0.57179616638322),
+            np.float64(np.nan),
         ),
         (
             anticorrelated,
             (np.array([0, 2, 4, 6, 8, 10, 12, 14]), np.array([1, 3])),
-            np.float64(0.0),
+            np.float64(np.nan),
         ),
         (
             anticorrelated,
             (np.array([0, 2, 4, 6, 8, 10, 12, 14]), np.array([1])),
-            np.float64(0.0),
+            np.float64(np.nan),
         ),
     ],
 )
@@ -2404,4 +2405,4 @@ def test_multipopulation_r2_varying_unequal_set_sizes(genotypes, sample_sets, ex
         r2_ij_summary_func(state_dim, state, 1, result[i, j], params)
         norm_hap_weighted_ij(1, state, max(a) + 1, max(b) + 1, norm[i, j], params)
 
-    np.testing.assert_allclose(expected, (result * norm).sum())
+    np.testing.assert_allclose((result * norm).sum(), expected)