4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -128,6 +128,8 @@

* Add arguments for including/excluding methods and metrics in the benchmarking workflow (PR #100).

* Removed the max EMD metrics from the calculation (PR #113).


## BUG FIXES

@@ -161,4 +163,6 @@

* Fix bug in EMD where NaN values could not be written out, and add sklearn dependency for cytovi (PR #110).

* Fix bug in vertical EMD where the sample combinations were malformed (PR #113).


87 changes: 1 addition & 86 deletions src/metrics/emd/config.vsh.yaml
@@ -55,50 +55,6 @@ info:
max: .inf
# Whether a higher value represents a 'better' solution (required)
maximize: false

# A unique identifier for your metric (required).
# Can contain only lowercase letters or underscores.
- name: emd_max_ct_horiz
# A relatively short label, used when rendering visualisations (required)
label: EMD Max CT Horizontal
# A one sentence summary of how this metric works (required). Used when
# rendering summary tables.
summary: "Max Earth Mover Distance calculated horizontally across donors for each cell type and marker."
# A multi-line description of how this component works (required). Used
# when rendering reference documentation.
description: |
Earth Mover Distance (EMD), also known as the Wasserstein metric, measures the difference
between two probability distributions.

Here, EMD is used to compare marker expression distributions between paired samples from the same donor
quantified across two different batches.
For each paired sample, cell type, and marker, the marker expression values are first converted into
probability distributions.
This is done by binning the expression values into a range from -100 to 100 with a bin width of 0.1.
The `wasserstein_distance` function from SciPy is then used to calculate the EMD between the two
probability distributions belonging to the same cell type and marker for a given paired sample.
This is then repeated for every cell type, marker, and paired sample.
Finally, the maximum of all these EMD values is computed and reported as the metric score.

EMD Max CT score reflects the largest difference in marker expression distributions across all cell types,
markers, and paired samples.
A high score indicates that at least one marker, cell type, or sample pair has a large difference in
distribution after batch integration.
A low score means that even the most poorly corrected marker expression is well integrated across batches.
references:
doi:
- 10.1023/A:1026543900054
links:
# URL to the documentation for this metric (required).
documentation: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.wasserstein_distance.html
# URL to the code repository for this metric (required).
repository: https://github.com/scipy/scipy
# The minimum possible value for this metric (required)
min: 0
# The maximum possible value for this metric (required)
max: .inf
# Whether a higher value represents a 'better' solution (required)
maximize: false

# A unique identifier for your metric (required).
# Can contain only lowercase letters or underscores.
@@ -140,47 +96,6 @@ info:
max: .inf
# Whether a higher value represents a 'better' solution (required)
maximize: false

# A unique identifier for your metric (required).
# Can contain only lowercase letters or underscores.
- name: emd_max_ct_vert
# A relatively short label, used when rendering visualisations (required)
label: EMD Max CT Vertical
# A one sentence summary of how this metric works (required). Used when
# rendering summary tables.
summary: "Max Earth Mover Distance across batch corrected samples, cell types, and markers."
# A multi-line description of how this component works (required). Used
# when rendering reference documentation.
description: |
Earth Mover Distance (EMD), also known as the Wasserstein metric, measures the difference
between two probability distributions.

Here, EMD is used to compare marker expression distributions between all integrated
samples from the same group.
For each pair of samples, cell type, and marker, the marker expression values are first converted into
probability distributions.
This is done by binning the expression values into a range from -100 to 100 with a bin width of 0.1.
The `wasserstein_distance` function from SciPy is then used to calculate the EMD between the two
probability distributions belonging to the same cell type and marker for a given pair of samples.
This is then repeated for every cell type, marker, and pair of samples.
Finally, the maximum of all these EMD values is computed and reported as the metric score.

A high score indicates that at least one pair of samples and marker shows a large difference in distribution after batch integration.
A low score means that even the most poorly integrated pair of samples and marker is well integrated.
references:
doi:
- 10.1023/A:1026543900054
links:
# URL to the documentation for this metric (required).
documentation: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.wasserstein_distance.html
# URL to the code repository for this metric (required).
repository: https://github.com/scipy/scipy
# The minimum possible value for this metric (required)
min: 0
# The maximum possible value for this metric (required)
max: .inf
# Whether a higher value represents a 'better' solution (required)
maximize: false

# Resources required to run the component
resources:
@@ -203,4 +118,4 @@ runners:
# Allows turning the component into a Nextflow module / pipeline.
- type: nextflow
directives:
label: [midtime,midmem,midcpu]
label: [lowtime,lowmem,lowcpu]
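
The metric descriptions above specify that marker expression values are turned into probability distributions by binning them into the range -100 to 100 with a bin width of 0.1, and that SciPy's `wasserstein_distance` then compares the two distributions. As a minimal sketch of that per-marker comparison, assuming two 1-D expression arrays and a hypothetical helper name (illustrative only, not necessarily how this repository's compute_emd is implemented):

import numpy as np
from scipy.stats import wasserstein_distance

def emd_for_marker(expr_a, expr_b, lo=-100.0, hi=100.0, bin_width=0.1):
    """Hypothetical helper: EMD between two binned marker-expression distributions."""
    bins = np.arange(lo, hi + bin_width, bin_width)
    centers = (bins[:-1] + bins[1:]) / 2
    # Bin both expression vectors onto a shared grid and normalise to probabilities.
    # Clipping out-of-range values to the bin range is an assumption made here.
    counts_a, _ = np.histogram(np.clip(expr_a, lo, hi), bins=bins)
    counts_b, _ = np.histogram(np.clip(expr_b, lo, hi), bins=bins)
    probs_a = counts_a / counts_a.sum()
    probs_b = counts_b / counts_b.sum()
    # EMD (Wasserstein distance) between the two binned probability distributions.
    return wasserstein_distance(centers, centers, u_weights=probs_a, v_weights=probs_b)

# Example with simulated expression values for one marker measured in two batches.
rng = np.random.default_rng(0)
print(emd_for_marker(rng.normal(0.0, 1.0, 5000), rng.normal(0.5, 1.0, 5000)))
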
113 changes: 55 additions & 58 deletions src/metrics/emd/helper.py
@@ -6,7 +6,6 @@
from scipy.stats import wasserstein_distance

KEY_MEAN_EMD_CT = "mean_emd_ct"
KEY_MAX_EMD_CT = "max_emd_ct"
KEY_EMD_VERT_MAT_split1 = "emd_vert_mat_split1"
KEY_EMD_VERT_MAT_split2 = "emd_vert_mat_split2"
KEY_EMD_HORZ_PER_DONOR = "emd_horz_per_donor"
@@ -27,26 +26,30 @@ def calculate_vertical_emd(
dict: a dictionary containing the following elements.
"mean_emd_ct": np.float32: mean EMD value computed from a flattened data frame containing
the EMD computed for every marker and cell type across all pairings of two samples from the same group.
"max_emd_ct": np.float32: max EMD value computed from a flattened data frame containing
the EMD computed for every marker and cell type across all pairings of two samples from the same group.
"emd_wide_dfs": dict: each key is a marker and each value is another dictionary keyed by either
a given cell type or "global", depending on what its 2D matrix represents.
Each entry is a 2D matrix where every row and column is a sample and each cell contains
the EMD value computed for that sample pair, either for the given cell type or globally.
"emd_split1_long": pd.DataFrame or np.nan: a long format dataframe where each row is a combination of
first_sample, second_sample, and cell_type, with one column per marker holding the EMD
computed for that combination.
If the input data does not have at least 2 samples per group, np.nan is returned instead.
"emd_split2_long": pd.DataFrame or np.nan: a long format dataframe where each row is a combination of
first_sample, second_sample, and cell_type, with one column per marker holding the EMD
computed for that combination.
If the input data does not have at least 2 samples per group, np.nan is returned instead.
"""

print("Calculating vertical EMD for split 1", flush=True)
emd_split1_long = get_vert_emd_for_integrated_adata(
i_adata=i_split1_adata, markers_to_assess=markers_to_assess
)

print("Calculating vertical EMD for split 2", flush=True)
emd_split2_long = get_vert_emd_for_integrated_adata(
i_adata=i_split2_adata, markers_to_assess=markers_to_assess
)

# safeguard: default to NaN in case no EMD values can be computed
mean_emd_ct = np.nan
max_emd_ct = np.nan

print("Computing mean vertical EMD", flush=True)

# compute the mean only if at least one EMD table was produced.
emd_long = []
@@ -57,21 +60,19 @@
if len(emd_long) > 0:
emd_long = pd.concat(emd_long)

# note: marker names were cleaned earlier, so the markers_to_assess
# parameter may not exactly match the column names in emd_long.
# Don't just subset to markers_to_assess!

# mean cell type emd across all sample combinations, markers, and splits
mean_emd_ct = np.nanmean(
emd_long.drop(columns=["cell_type", "first_sample", "second_sample"])
.to_numpy()
.flatten()
)
max_emd_ct = np.nanmax(
emd_long.drop(columns=["cell_type", "first_sample", "second_sample"])
.to_numpy()
.flatten()
)

return {
KEY_MEAN_EMD_CT: mean_emd_ct,
KEY_MAX_EMD_CT: max_emd_ct,
KEY_EMD_VERT_MAT_split1: emd_split1_long,
KEY_EMD_VERT_MAT_split2: emd_split2_long,
}
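
The docstring and the np.nanmean call above describe how the single mean score is obtained from the long-format EMD tables: the identifier columns are dropped and the mean is taken over everything that remains, i.e. over every marker, cell type, and sample pair at once, ignoring NaNs. A small self-contained illustration with made-up sample names and EMD values (not data from this benchmark):

import numpy as np
import pandas as pd

# Made-up long-format EMD table: one row per (first_sample, second_sample, cell_type),
# one column per marker, mirroring the structure described in the docstring above.
emd_long = pd.DataFrame({
    "first_sample": ["s1", "s1"],
    "second_sample": ["s2", "s3"],
    "cell_type": ["T cell", "T cell"],
    "CD3": [0.12, 0.40],
    "CD19": [np.nan, 0.25],  # NaN where a combination was skipped
})

mean_emd_ct = np.nanmean(
    emd_long.drop(columns=["cell_type", "first_sample", "second_sample"])
    .to_numpy()
    .flatten()
)
print(mean_emd_ct)  # 0.2566..., the mean of 0.12, 0.40 and 0.25
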
@@ -86,22 +87,30 @@ def get_vert_emd_for_integrated_adata(i_adata: ad.AnnData, markers_to_assess: li
markers_to_assess (list): list of markers to compute EMD for.

Returns:
emd_dfs_ct: EMD per cell type,
emd_dfs_global: EMD per donor,
emd_wide_dfs: the breakdown of EMD per donor and cell type.
emd_vals (pd.DataFrame or np.nan): a long format dataframe where each row is a combination of
first_sample, second_sample, and cell_type, with one column per marker holding the EMD
computed for that combination.
If the input data does not have at least 2 samples per group, np.nan is returned instead.
"""

# calculate the global first, agnostic of cell type

print("Determining samples per group", flush=True)
# comparing one sample against every other sample (one at a time).
# get all samples for each group first
sample_group_map = i_adata.obs.groupby("group", observed=True)["sample"].apply(
lambda x: list(set(x))
)
# then get combinations of 2 for each group
sample_combos = np.array(
[list(itertools.combinations(x, 2)) for x in sample_group_map]
).reshape(-1, 2)

print(f"Sample groups found: {sample_group_map}", flush=True)
print("Getting sample combinations to compute EMD for", flush=True)

sample_combos = []
for sample_type in sample_group_map:
# sample_type = sample_group_map[1]
if len(sample_type) >= 2:
sample_combo = list(itertools.combinations(sample_type, 2))
sample_combos.extend(sample_combo)
else:
print(f"There are fewer than 2 samples for group {sample_type}. Skipping.", flush=True)

if len(sample_combos) == 0:
# this means the processed data do not have at least 2 samples in any group.
@@ -114,6 +123,10 @@ def get_vert_emd_for_integrated_adata(i_adata: ad.AnnData, markers_to_assess: li

return np.nan

print("Sample combinations found:", flush=True)
print(sample_combos, flush=True)
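
As an aside, the per-group pairing above can be reproduced with a tiny standalone example (group and sample names are made up). Groups with fewer than two samples yield no pairs and are skipped, which the reshape-based construction this replaces appears to have mishandled when groups had different numbers of samples:

import itertools

# Made-up samples per group, purely for illustration.
sample_group_map = {
    "group_a": ["a1", "a2", "a3"],
    "group_b": ["b1"],  # fewer than 2 samples: no pairs can be formed
}

sample_combos = []
for samples in sample_group_map.values():
    if len(samples) >= 2:
        sample_combos.extend(itertools.combinations(samples, 2))
    else:
        print(f"There are fewer than 2 samples in {samples}. Skipping.")

print(sample_combos)  # [('a1', 'a2'), ('a1', 'a3'), ('a2', 'a3')]
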

print("Calculating EMD for each sample combination and cell type", flush=True)
cell_types = i_adata.obs["cell_type"].unique()

emd_vals = []
@@ -126,6 +139,11 @@ def get_vert_emd_for_integrated_adata(i_adata: ad.AnnData, markers_to_assess: li

# emd per cell type
for cell_type in cell_types:
print(
f"Calculating EMD for sample combo: {sample_combo}, cell type: {cell_type}",
flush=True,
)

# cell_type = cell_types[0]
first_sample_adata_ct = first_sample_adata[
first_sample_adata.obs["cell_type"] == cell_type
@@ -136,6 +154,14 @@ def get_vert_emd_for_integrated_adata(i_adata: ad.AnnData, markers_to_assess: li

# Do not calculate if either sample has fewer than 50 cells, as the EMD would not be meaningful.
if first_sample_adata_ct.n_obs < 50 or second_sample_adata_ct.n_obs < 50:
print(
f"There are fewer than 50 cells in either sample {sample_combo[0]} or"
f" sample {sample_combo[1]} for cell type {cell_type}.\n"
f"Sample {sample_combo[0]} {cell_type}: {first_sample_adata_ct.n_obs} cells.\n"
f"Sample {sample_combo[1]} {cell_type}: {second_sample_adata_ct.n_obs} cells.\n"
f"Skipping vertical EMD calculation for this sample combination and cell type.",
flush=True,
)
continue

emd_df = compute_emd(
@@ -152,38 +178,10 @@ def get_vert_emd_for_integrated_adata(i_adata: ad.AnnData, markers_to_assess: li
# concatenate EMD values
emd_vals = pd.concat(emd_vals)
# remove unparsable characters like "/"
print("Cleaning up column names", flush=True)
emd_vals.columns = emd_vals.columns.str.replace("/", "_")

# TODO remove me once we are happy with the results
# prepare the data to draw the heatmap in cytonorm 2 supp paper.
# 1 row/column = 1 sample, a cell is emd for a given marker
# repeat for every marker assessed
# note, only run this after calculating mean, otherwise you end up having to
# remove the sample id columns.

# emd_wide = {}

# emd_types = emd_vals["cell_type"].unique()

# for marker in markers_to_assess:
# # marker = markers_to_assess[0]

# # remove unparsable characters like "/"
# marker_name = marker.replace("/", "_")
# # have to initialise the dictionary..
# emd_wide[marker_name] = {}

# for emd_type in emd_types:
# # ct = cell_types[0]
# emd_df = emd_vals[emd_vals["cell_type"] == emd_type]

# if emd_df.shape[0] > 0:
# # safeguard. Only pivot if we computed the emd.
# # This is a safeguard in case there is a rare cell type which we don't have
# # any samples with at least 50 cells for.
# emd_wide[marker_name][emd_type] = emd_df.pivot(
# index="second_sample", columns="first_sample", values=marker
# )
print("Finished calculating vertical EMD", flush=True)

return emd_vals
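
For context, the 50-cell guard used in the loop above can be sketched in isolation with toy data; the AnnData below is fabricated, and the actual per-marker EMD call is left as a placeholder rather than this module's compute_emd:

import anndata as ad
import numpy as np
import pandas as pd

# Toy AnnData: two samples of a single cell type with two markers (made-up data).
rng = np.random.default_rng(0)
obs = pd.DataFrame(
    {
        "sample": ["s1"] * 60 + ["s2"] * 40,
        "cell_type": ["T cell"] * 100,
    },
    index=[f"cell_{i}" for i in range(100)],
)
adata = ad.AnnData(X=rng.normal(size=(100, 2)), obs=obs)
adata.var_names = ["CD3", "CD19"]

first_sample_adata = adata[adata.obs["sample"] == "s1"]
second_sample_adata = adata[adata.obs["sample"] == "s2"]

for cell_type in adata.obs["cell_type"].unique():
    first_ct = first_sample_adata[first_sample_adata.obs["cell_type"] == cell_type]
    second_ct = second_sample_adata[second_sample_adata.obs["cell_type"] == cell_type]
    # Same 50-cell threshold as the helper: skip sparsely populated cell types.
    if first_ct.n_obs < 50 or second_ct.n_obs < 50:
        print(f"Skipping {cell_type}: {first_ct.n_obs} vs {second_ct.n_obs} cells.")
        continue
    # ...per-marker EMD would be computed here (see the binning sketch earlier)...
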

@@ -216,6 +214,7 @@ def calculate_horizontal_emd(
emd_per_donor_per_ct = []

for donor in donor_list:
print(f"Calculating horizontal EMD for donor: {donor}", flush=True)
# donor = donor_list[0]
i_split1_donor = i_split1_adata[i_split1_adata.obs["donor"] == donor]
i_split2_donor = i_split2_adata[i_split2_adata.obs["donor"] == donor]
@@ -240,6 +239,7 @@ def calculate_horizontal_emd(
)

for cell_type in cell_types:
print(f"Calculating EMD for cell type: {cell_type}", flush=True)
# cell_type = cell_types[0]
i_split1_ct = i_split1_donor[i_split1_donor.obs["cell_type"] == cell_type]
i_split2_ct = i_split2_donor[i_split2_donor.obs["cell_type"] == cell_type]
@@ -268,12 +268,10 @@ def calculate_horizontal_emd(
emd_per_donor_per_ct = pd.concat(emd_per_donor_per_ct)

# compute the mean per ct and for global.
print("Computing mean horizontal EMD", flush=True)
mean_emd_ct = np.nanmean(
emd_per_donor_per_ct.drop(columns=["cell_type", "donor"]).values
)
max_emd_ct = np.nanmax(
emd_per_donor_per_ct.drop(columns=["cell_type", "donor"]).values
)

# concatenate the global and cell type emd
emd_per_donor_per_ct.columns = emd_per_donor_per_ct.columns.str.replace(
Expand All @@ -282,7 +280,6 @@ def calculate_horizontal_emd(

return {
KEY_MEAN_EMD_CT: mean_emd_ct,
KEY_MAX_EMD_CT: max_emd_ct,
KEY_EMD_HORZ_PER_DONOR: emd_per_donor_per_ct,
}
