diff --git a/CHANGELOG.md b/CHANGELOG.md index 2a0efbba..d2348383 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -128,6 +128,8 @@ * Add arguments for including/excluding methods and metrics in the benchmarking workflow (PR #100). +* Removed EMD max from calculation (PR #113). + ## BUG FIXES @@ -161,4 +163,6 @@ * Fix bug in EMD where nan cannot be written out and added sklearn dependency for cytovi (PR #110). +* Fix bug in EMD vertical where sample combination was malformed (PR #113) + diff --git a/src/metrics/emd/config.vsh.yaml b/src/metrics/emd/config.vsh.yaml index 72cef341..319f7a19 100644 --- a/src/metrics/emd/config.vsh.yaml +++ b/src/metrics/emd/config.vsh.yaml @@ -55,50 +55,6 @@ info: max: .inf # Whether a higher value represents a 'better' solution (required) maximize: false - - # A unique identifier for your metric (required). - # Can contain only lowercase letters or underscores. - - name: emd_max_ct_horiz - # A relatively short label, used when rendering visualisarions (required) - label: EMD Max CT Horizontal - # A one sentence summary of how this metric works (required). Used when - # rendering summary tables. - summary: "Max Earth Mover Distance calculated horizontally across donors for each cell type and marker." - # A multi-line description of how this component works (required). Used - # when rendering reference documentation. - description: | - Earth Mover Distance (EMD), also known as the Wasserstein metric, measures the difference - between two probability distributions. - - Here, EMD is used to compare marker expression distributions between paired samples from the same donor - quantified across two different batches. - For each paired sample, cell type, and marker, the marker expression values are first converted into - probability distributions. - This is done by binning the expression values into a range from -100 to 100 with a bin width of 0.1. - The `wasserstein_distance` function from SciPy is then used to calculate the EMD between the two - probability distributions belonging to the same cell type, marker, and a given paired samples. - This is then repeated for every cell type, marker, and paired sample. - Finally, the maximum of all these EMD values is computed and reported as the metric score. - - EMD Max CT score reflects the largest difference in marker expression distributions across all cell types, - markers, and paired samples. - A high score indicates that at least one marker, cell type, or sample pair has a large difference in - distribution after batch integration. - A low score means that even the most poorly corrected marker expression is well integrated across batches. - references: - doi: - - 10.1023/A:1026543900054 - links: - # URL to the documentation for this metric (required). - documentation: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.wasserstein_distance.html - # URL to the code repository for this metric (required). - repository: https://github.com/scipy/scipy - # The minimum possible value for this metric (required) - min: 0 - # The maximum possible value for this metric (required) - max: .inf - # Whether a higher value represents a 'better' solution (required) - maximize: false # A unique identifier for your metric (required). # Can contain only lowercase letters or underscores. @@ -140,47 +96,6 @@ info: max: .inf # Whether a higher value represents a 'better' solution (required) maximize: false - - # A unique identifier for your metric (required). - # Can contain only lowercase letters or underscores. - - name: emd_max_ct_vert - # A relatively short label, used when rendering visualisarions (required) - label: EMD Max CT Vertical - # A one sentence summary of how this metric works (required). Used when - # rendering summary tables. - summary: "Max Earth Mover Distance across batch corrected samples, cell types, and markers." - # A multi-line description of how this component works (required). Used - # when rendering reference documentation. - description: | - Earth Mover Distance (EMD), also known as the Wasserstein metric, measures the difference - between two probability distributions. - - Here, EMD is used to compare marker expression distributions between all integrated - samples from the same group. - For each pair of samples, cell type, and marker, the marker expression values are first converted into - probability distributions. - This is done by binning the expression values into a range from -100 to 100 with a bin width of 0.1. - The `wasserstein_distance` function from SciPy is then used to calculate the EMD between the two - probability distributions belonging to the same cell type, marker, and a given paired samples. - This is then repeated for every cell type, marker, and paired sample. - Finally, the maximum of all these EMD values is computed and reported as the metric score. - - A high score indicates there is a pair of samples and marker which show large difference in distribution after batch integration. - A low score means that, the worst integrated pair of samples and marker are well integrated. - references: - doi: - - 10.1023/A:1026543900054 - links: - # URL to the documentation for this metric (required). - documentation: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.wasserstein_distance.html - # URL to the code repository for this metric (required). - repository: https://github.com/scipy/scipy - # The minimum possible value for this metric (required) - min: 0 - # The maximum possible value for this metric (required) - max: .inf - # Whether a higher value represents a 'better' solution (required) - maximize: false # Resources required to run the component resources: @@ -203,4 +118,4 @@ runners: # Allows turning the component into a Nextflow module / pipeline. - type: nextflow directives: - label: [midtime,midmem,midcpu] + label: [lowtime,lowmem,lowcpu] diff --git a/src/metrics/emd/helper.py b/src/metrics/emd/helper.py index 14fb0b71..ca880e0f 100644 --- a/src/metrics/emd/helper.py +++ b/src/metrics/emd/helper.py @@ -6,7 +6,6 @@ from scipy.stats import wasserstein_distance KEY_MEAN_EMD_CT = "mean_emd_ct" -KEY_MAX_EMD_CT = "max_emd_ct" KEY_EMD_VERT_MAT_split1 = "emd_vert_mat_split1" KEY_EMD_VERT_MAT_split2 = "emd_vert_mat_split2" KEY_EMD_HORZ_PER_DONOR = "emd_horz_per_donor" @@ -27,26 +26,30 @@ def calculate_vertical_emd( dict: a dictionary containing the following elements. "mean_emd_ct": np.float32: mean emd value computed from a flattened data frame containing mean emd computed for every marker and cell type across all pairing two samples from the same group. - "max_emd_ct": np.float32: max emd value computed from a flattened data frame containing - max emd computed for every marker and cell type across all pairing two samples from the same group. - "emd_wide_dfs": dict: each key is a marker. A value is yet another dictionary which value is - a 2d matrix where each row/column is a pair of sample and a cell contains - emd value computed for the sample pair for either a given cell type - or global. The key for this dictionary then is either a given cell type - or global, depending on what the 2d matrix represents. + "emd_split1_long": pd.DataFrame or np.nan: a long format dataframe where each row is a combination of + first_sample, second_sample, cell_type, and markers. + Value is the EMD computed for that combination. + If the input data does not have at least 2 samples per group, then return np.nan. + "emd_split2_long": pd.DataFrame or np.nan: a long format dataframe where each row is a combination of + first_sample, second_sample, cell_type, and markers. + Value is the EMD computed for that combination. + If the input data does not have at least 2 samples per group, then return np.nan. """ + print("Calculating vertical EMD for split 1", flush=True) emd_split1_long = get_vert_emd_for_integrated_adata( i_adata=i_split1_adata, markers_to_assess=markers_to_assess ) + print("Calculating vertical EMD for split 2", flush=True) emd_split2_long = get_vert_emd_for_integrated_adata( i_adata=i_split2_adata, markers_to_assess=markers_to_assess ) # safeguard mean_emd_ct = np.nan - max_emd_ct = np.nan + + print("Computing mean vertical EMD", flush=True) # compute these only if we can. emd_long = [] @@ -57,21 +60,19 @@ def calculate_vertical_emd( if len(emd_long) > 0: emd_long = pd.concat(emd_long) + # note we did some marker name cleaning before. So the markers_to_assess + # parameter may not match exactly the column names in emd_long. + # so don't just subset to markers_to_assess! + # mean cell type emd across all sample combinations, markers, and splits mean_emd_ct = np.nanmean( emd_long.drop(columns=["cell_type", "first_sample", "second_sample"]) .to_numpy() .flatten() ) - max_emd_ct = np.nanmax( - emd_long.drop(columns=["cell_type", "first_sample", "second_sample"]) - .to_numpy() - .flatten() - ) return { KEY_MEAN_EMD_CT: mean_emd_ct, - KEY_MAX_EMD_CT: max_emd_ct, KEY_EMD_VERT_MAT_split1: emd_split1_long, KEY_EMD_VERT_MAT_split2: emd_split2_long, } @@ -86,22 +87,30 @@ def get_vert_emd_for_integrated_adata(i_adata: ad.AnnData, markers_to_assess: li markers_to_assess (list): list of markers to compute EMD for. Returns: - emd_dfs_ct: EMD per cell type, - emd_dfs_global: EMD per donor, - emd_wide_dfs: the break down of EMD per donor and cell type. + emd_vals (pd.DataFrame or np.nan): a long format dataframe where each row is a combination of + first_sample, second_sample, cell_type, and markers. + Value is the EMD computed for that combination. + If the input data does not have at least 2 samples per group, then return np.nan. """ - # calculate the global first, agnostic of cell type - + print("Determining samples per group", flush=True) # comparing one sample against every other sample (one at a time). # get all samples for each group first sample_group_map = i_adata.obs.groupby("group", observed=True)["sample"].apply( lambda x: list(set(x)) ) - # then get combinations of 2 for each group - sample_combos = np.array( - [list(itertools.combinations(x, 2)) for x in sample_group_map] - ).reshape(-1, 2) + + print(f"Sample groups found: {sample_group_map}", flush=True) + print("Getting sample combinations to compute EMD for", flush=True) + + sample_combos = [] + for sample_type in sample_group_map: + # sample_type = sample_group_map[1] + if len(sample_type) >= 2: + sample_combo = list(itertools.combinations(sample_type, 2)) + sample_combos.extend(sample_combo) + else: + print(f"There are less than 2 samples for group {sample_type}. Skipping.") if len(sample_combos) == 0: # this means the data processed do not have at least 2 samples per group. @@ -114,6 +123,10 @@ def get_vert_emd_for_integrated_adata(i_adata: ad.AnnData, markers_to_assess: li return np.nan + print("Sample combinations found:", flush=True) + print(sample_combos, flush=True) + + print("Calculating EMD for each sample combination and cell type", flush=True) cell_types = i_adata.obs["cell_type"].unique() emd_vals = [] @@ -126,6 +139,11 @@ def get_vert_emd_for_integrated_adata(i_adata: ad.AnnData, markers_to_assess: li # emd per cell type for cell_type in cell_types: + print( + f"Calculating EMD for sample combo: {sample_combo}, cell type: {cell_type}", + flush=True, + ) + # cell_type = cell_types[0] first_sample_adata_ct = first_sample_adata[ first_sample_adata.obs["cell_type"] == cell_type @@ -136,6 +154,14 @@ def get_vert_emd_for_integrated_adata(i_adata: ad.AnnData, markers_to_assess: li # Do not calculate if we have less than 50 cells as it does not make sense. if first_sample_adata_ct.n_obs < 50 or second_sample_adata_ct.n_obs < 50: + print( + f"There are less than 50 cells in either sample {sample_combo[0]} or" + f" sample {sample_combo[1]} for cell type {cell_type}.\n" + f"Sample {sample_combo[0]} {cell_type}: {first_sample_adata_ct.n_obs} cells.\n" + f"Sample {sample_combo[1]} {cell_type}: {second_sample_adata_ct.n_obs} cells.\n" + f"Skipping calculating vertical EMD for this sample combination and cell type.", + flush=True, + ) continue emd_df = compute_emd( @@ -152,38 +178,10 @@ def get_vert_emd_for_integrated_adata(i_adata: ad.AnnData, markers_to_assess: li # concatenate EMD values emd_vals = pd.concat(emd_vals) # remove unparsable characters like "/" + print("Cleaning up column names", flush=True) emd_vals.columns = emd_vals.columns.str.replace("/", "_") - # TODO remove me once we are happy with the results - # prepare the data to draw the heatmap in cytonorm 2 supp paper. - # 1 row/column = 1 sample, a cell is emd for a given marker - # repeat for every marker assessed - # note, only run this after calculating mean, otherwise you end up having to - # remove the sample id columns. - - # emd_wide = {} - - # emd_types = emd_vals["cell_type"].unique() - - # for marker in markers_to_assess: - # # marker = markers_to_assess[0] - - # # remove unparsable characters like "/" - # marker_name = marker.replace("/", "_") - # # have to initialise the dictionary.. - # emd_wide[marker_name] = {} - - # for emd_type in emd_types: - # # ct = cell_types[0] - # emd_df = emd_vals[emd_vals["cell_type"] == emd_type] - - # if emd_df.shape[0] > 0: - # # safeguard. Only pivot if we computed the emd. - # # This is a safeguard in case there is a rare cell type which we don't have - # # any samples with at least 50 cells for. - # emd_wide[marker_name][emd_type] = emd_df.pivot( - # index="second_sample", columns="first_sample", values=marker - # ) + print("Finished calculating vertical EMD", flush=True) return emd_vals @@ -216,6 +214,7 @@ def calculate_horizontal_emd( emd_per_donor_per_ct = [] for donor in donor_list: + print(f"Calculating horizontal EMD for donor: {donor}", flush=True) # donor = donor_list[0] i_split1_donor = i_split1_adata[i_split1_adata.obs["donor"] == donor] i_split2_donor = i_split2_adata[i_split2_adata.obs["donor"] == donor] @@ -240,6 +239,7 @@ def calculate_horizontal_emd( ) for cell_type in cell_types: + print(f"Calculating EMD for cell type: {cell_type}", flush=True) # cell_type = cell_types[0] i_split1_ct = i_split1_donor[i_split1_donor.obs["cell_type"] == cell_type] i_split2_ct = i_split2_donor[i_split2_donor.obs["cell_type"] == cell_type] @@ -268,12 +268,10 @@ def calculate_horizontal_emd( emd_per_donor_per_ct = pd.concat(emd_per_donor_per_ct) # compute the mean and max per ct and for global. + print("Computing mean and max horizontal EMD", flush=True) mean_emd_ct = np.nanmean( emd_per_donor_per_ct.drop(columns=["cell_type", "donor"]).values ) - max_emd_ct = np.nanmax( - emd_per_donor_per_ct.drop(columns=["cell_type", "donor"]).values - ) # concatenate the global and cell type emd emd_per_donor_per_ct.columns = emd_per_donor_per_ct.columns.str.replace( @@ -282,7 +280,6 @@ def calculate_horizontal_emd( return { KEY_MEAN_EMD_CT: mean_emd_ct, - KEY_MAX_EMD_CT: max_emd_ct, KEY_EMD_HORZ_PER_DONOR: emd_per_donor_per_ct, } diff --git a/src/metrics/emd/script.py b/src/metrics/emd/script.py index 6586070d..3e728a80 100644 --- a/src/metrics/emd/script.py +++ b/src/metrics/emd/script.py @@ -6,6 +6,10 @@ # Note: this section is auto-generated by viash at runtime. To edit it, make changes # in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. par = { + # "input_integrated_split1": "/Users/putri.g/Documents/cytobenchmark/debug_emd/_viash_par/input_integrated_split1_1/human_cll_mass_cytometry.cycombine_all_controls_to_goal.cycombine_all_controls_to_goal.output.h5ad", + # "input_integrated_split2": "/Users/putri.g/Documents/cytobenchmark/debug_emd/_viash_par/input_integrated_split2_1/human_cll_mass_cytometry.cycombine_all_controls_to_goal.cycombine_all_controls_to_goal.output.h5ad", + # "input_unintegrated": "/Users/putri.g/Documents/cytobenchmark/debug_emd/_viash_par/input_unintegrated_1/unintegrated.h5ad", + # "output": "/Users/putri.g/Documents/cytobenchmark/debug_emd/_viash_par/emd_out.h5ad", "input_integrated_split1": "resources_test/task_cyto_batch_integration/mouse_spleen_flow_cytometry_subset/integrated_split1.h5ad", "input_integrated_split2": "resources_test/task_cyto_batch_integration/mouse_spleen_flow_cytometry_subset/integrated_split2.h5ad", "input_unintegrated": "resources_test/task_cyto_batch_integration/mouse_spleen_flow_cytometry_subset/unintegrated.h5ad", @@ -30,7 +34,7 @@ input_integrated_split2 = ad.read_h5ad(par["input_integrated_split2"]) input_unintegrated = ad.read_h5ad(par["input_unintegrated"]) -print("Formatting input files", flush=True) +print("Preprocessing input files", flush=True) # add obs and var to integrated data input_integrated_split1, input_integrated_split2 = ( @@ -63,15 +67,16 @@ dataset_id = input_unintegrated.uns["dataset_id"] method_id = input_integrated_split1.uns["method_id"] - -# check that the data for each donor in integrated left and right are actually +print("Sanity checking input files", flush=True) +# check that the data for each donor in split 1 and 2 are actually # from two different batches! emd_helper.check_donor_batches( input_integrated_split1=input_integrated_split1, input_integrated_split2=input_integrated_split2, ) -# calculate horizontal EMD for each donor across integrated left and right +print("Calculating horizontal EMD", flush=True) +# calculate horizontal EMD for each donor across split 1 and 2 donor_list = input_integrated_split1.obs["donor"].unique() emd_horz = emd_helper.calculate_horizontal_emd( i_split1_adata=input_integrated_split1, @@ -88,22 +93,17 @@ ) print("Assembling output AnnData", flush=True) -# note: emd_values is handy for plotting later on. output = ad.AnnData( uns={ "dataset_id": dataset_id, "method_id": method_id, "metric_ids": [ "emd_mean_ct_horiz", - "emd_max_ct_horiz", "emd_mean_ct_vert", - "emd_max_ct_vert", ], "metric_values": [ emd_horz[emd_helper.KEY_MEAN_EMD_CT], - emd_horz[emd_helper.KEY_MAX_EMD_CT], emd_vert[emd_helper.KEY_MEAN_EMD_CT], - emd_vert[emd_helper.KEY_MAX_EMD_CT], ], "emd_values": { "emd_values_horiz": emd_horz[emd_helper.KEY_EMD_HORZ_PER_DONOR],