Int slider removed, edge case fixed, bin counts log10-scaled, actual node count shown on hover, and plot titles added

savitakartik · savitakartik · commit 5541b73b101b · 2023-09-23T19:44:01.000+01:00
Added child_left, child_right columns to nodes_df and tests for these.
diff --git a/model.py b/model.py
@@ -451,6 +451,8 @@ def nodes_df(self):
                 "ancestors_span": child_right - child_left,
                 "child_left": child_left,  # FIXME add test for this
                 "child_right": child_right,  # FIXME add test for this
+                "child_left": child_left,  # FIXME add test for this
+                "child_right": child_right,  # FIXME add test for this
                 "is_sample": is_sample,
             }
         )
@@ -589,7 +591,7 @@ def calc_mutations_per_tree(self):
         mutations_per_tree[unique_values] = counts
         return mutations_per_tree
 
-    def compute_ancestor_spans_heatmap_data(self, win_x_size=1_000_000, win_y_size=500):
+    def compute_ancestor_spans_heatmap_data(self, num_x_bins, num_y_bins):
         """
         Calculates the average ancestor span in a genomic-time window
         """
@@ -598,38 +600,38 @@ def compute_ancestor_spans_heatmap_data(self, win_x_size=1_000_000, win_y_size=5
         nodes_left = nodes_df.child_left
         nodes_right = nodes_df.child_right
         nodes_time = nodes_df.time
-        ancestors_span = nodes_df.ancestors_span
 
-        num_x_wins = int(np.ceil(nodes_right.max() - nodes_left.min()) / win_x_size)
-        num_y_wins = int(np.ceil(nodes_time.max() / win_y_size))
-        heatmap_sums = np.zeros((num_x_wins, num_y_wins))
-        heatmap_counts = np.zeros((num_x_wins, num_y_wins))
+        x_bins = np.linspace(nodes_left.min(), nodes_right.max(), num_x_bins + 1)
+        y_bins = np.linspace(0, nodes_time.max(), num_y_bins + 1)
+        heatmap_counts = np.zeros((num_x_bins, num_y_bins))
 
-        for u in range(len(nodes_left)):
-            x_start = int(
-                np.floor(nodes_left[u] / win_x_size)
-            )  # map the node span to the x-axis bins it overlaps
-            x_end = int(np.floor(nodes_right[u] / win_x_size))
-            y = max(0, int(np.floor(nodes_time[u] / win_y_size)) - 1)
-            heatmap_sums[x_start:x_end, y] += min(ancestors_span[u], win_x_size)
-            heatmap_counts[x_start:x_end, y] += 1
-
-        avg_spans = heatmap_sums / heatmap_counts
-        indices = np.indices((num_x_wins, num_y_wins))
-        x_coords = indices[0] * win_x_size
-        y_coords = indices[1] * win_y_size
+        x_starts = np.digitize(nodes_left, x_bins, right=True)
+        x_ends = np.digitize(nodes_right, x_bins, right=True)
+        y_starts = np.digitize(nodes_time, y_bins, right=True)
 
+        for u in range(len(nodes_left)):
+            x_start = max(0, x_starts[u] - 1)
+            x_end = max(0, x_ends[u] - 1)
+            y_bin = max(0, y_starts[u] - 1)
+            heatmap_counts[x_start : x_end + 1, y_bin] += 1
+
+        x_coords = np.repeat(x_bins[:-1], num_y_bins)
+        y_coords = np.tile(y_bins[:-1], num_x_bins)
+        overlapping_node_count = heatmap_counts.flatten()
+        overlapping_node_count[overlapping_node_count == 0] = 1
+        # FIXME - better way to avoid log 0 above?
         df = pd.DataFrame(
             {
-                "genomic_position": x_coords.flatten(),
+                "position": x_coords.flatten(),
                 "time": y_coords.flatten(),
-                "average_ancestor_span": avg_spans.flatten(),
+                "overlapping_node_count_log10": np.log10(overlapping_node_count),
+                "overlapping_node_count": overlapping_node_count,
             }
         )
         return df.astype(
             {
-                "genomic_position": "int",
+                "position": "int",
                 "time": "int",
-                "average_ancestor_span": "float64",
+                "overlapping_node_count": "int",
             }
         )
diff --git a/pages/nodes.py b/pages/nodes.py
@@ -3,6 +3,7 @@
 import hvplot.pandas  # noqa
 import numpy as np
 import panel as pn
+from bokeh.models import HoverTool
 
 import config
 from plot_helpers import filter_points
@@ -40,8 +41,15 @@ def make_node_hist_panel(tsm, log_y):
     points = df_nodes.hvplot.scatter(
         x="ancestors_span",
         y="time",
-        hover_cols=["ancestors_span", "time"],
-    ).opts(width=config.PLOT_WIDTH, height=config.PLOT_HEIGHT)
+        hover_cols=["ancestors_span", "time"],  # add node ID
+    ).opts(
+        width=config.PLOT_WIDTH,
+        height=config.PLOT_HEIGHT,
+        title="Node span by time",
+        xlabel="width of genome spanned by node ancestors",
+        ylabel="node time",
+        axiswise=True,
+    )
 
     range_stream = hv.streams.RangeXY(source=points)
     streams = [range_stream]
@@ -54,16 +62,47 @@ def make_node_hist_panel(tsm, log_y):
     )
 
     plot_options = pn.Column(
-        pn.pane.Markdown("# Plot Options"),
         log_y_checkbox,
     )
 
-    anc_span_data = tsm.compute_ancestor_spans_heatmap_data()
-    heatmap = hv.HeatMap(anc_span_data).opts(
-        width=config.PLOT_WIDTH,
-        height=config.PLOT_HEIGHT,
-        tools=["hover"],
-        colorbar=True,
+    def make_heatmap(num_x_bins, num_y_bins):
+        anc_span_data = tsm.compute_ancestor_spans_heatmap_data(num_x_bins, num_y_bins)
+        tooltips = [
+            ("position", "@position"),
+            ("time", "@time"),
+            ("overlapping_nodes", "@overlapping_node_count"),
+        ]
+        hover = HoverTool(tooltips=tooltips)
+        heatmap = hv.HeatMap(anc_span_data).opts(
+            width=config.PLOT_WIDTH,
+            height=config.PLOT_HEIGHT,
+            tools=[hover],
+            colorbar=True,
+            title="Average ancestor length in time and genome bins",
+            axiswise=True,
+        )
+        return heatmap
+
+    max_x_bins = int(np.sqrt(df_nodes.child_right.max()))
+    x_bin_input = pn.widgets.IntInput(
+        name="genome bins",
+        value=min(50, max_x_bins),
+        start=1,
+        end=max_x_bins,
+    )
+    max_y_bins = int(np.sqrt(df_nodes.time.max()))
+    y_bin_input = pn.widgets.IntInput(
+        name="time bins", value=min(50, int(max_y_bins)), start=1, end=max_y_bins
     )
+    hm_options = pn.Column(x_bin_input, y_bin_input)
 
-    return pn.Column(main, hist_panel, heatmap, plot_options)
+    hm_panel = pn.bind(
+        make_heatmap,
+        num_x_bins=x_bin_input,
+        num_y_bins=y_bin_input,
+    )
+
+    return pn.Column(
+        pn.Row(main, pn.Column(hist_panel, plot_options)),
+        pn.Column(hm_panel, hm_options),
+    )