openproblems-bio · patturtejaskumar · Mar 18, 2025 · Mar 18, 2025 · Mar 22, 2025
diff --git a/Bidcell_outputanalysis b/Bidcell_outputanalysis
@@ -0,0 +1,84 @@
+#Analysis and visualisation of BIDcell output
+#resizing BIDcell segmentation .tiff w.r.t DAPI image
+dapi_image = tiff.imread("morphology_mip_pyramidal.tiff")
+
+# Load the BIDCell segmentation mask (ensure the BIDCell segmentation is in the correct path;e.g., 'epoch_10_step_60.tif' or similar)
+segmentation_mask = tiff.imread("epoch_10_step_60_connected.tif")
+# Get the dimensions of the DAPI image (width and height)
+h_dapi, w_dapi = dapi_image.shape
+
+# Resize the BIDCell segmentation mask to match the DAPI image dimensions
+segmentation_mask_resized = cv2.resize(segmentation_mask.astype('float32'), (w_dapi, h_dapi), interpolation=cv2.INTER_NEAREST)
+
+# Convert the resized segmentation mask back to uint32 for storing the cell IDs
+segmentation_mask_resized = segmentation_mask_resized.astype(np.uint32)
+segmentation_mask_resized = segmentation_mask_resized.transpose(1, 0)
+# Save the resized segmentation mask to a .tif file
+tiff.imwrite("bidcellresult_resized.tif", segmentation_mask_resized)
+
+
+#creating bidcelloutput.zarr
+
+# Load the image
+image = tifffile.imread("morphology_mip_pyramidal.tiff")
+# Add a fake channel dimension (1 channel)
+image_with_channel = np.expand_dims(image, axis=0)
+# Parse the image with 'c', 'x', and 'y' dimensions
+# Load the image
+label_image = tifffile.imread("bidcellresult_resized.tif")
+labels = sd.models.Labels2DModel.parse(label_image, dims=('y', 'x'))
+# Convert 'x', 'y', and 'z' columns to float
+# Check the columns in transcript
+transcript_processed=pd.read_csv("/Volumes/Tejas_Kumar/testrunbidcell5/model_outputs/2025_03_17_09_36_19/test_output/transcript.csv.gz")
+transcript_processed['x'] = transcript_processed['x'].astype(float)
+transcript_processed['y'] = transcript_processed['y'].astype(float)
+transcript_processed['z'] = transcript_processed['z'].astype(float)
+transcript_processed['feature_name'] = transcript_processed['feature_name'].astype(str)
+transcript_processed=pd.DataFrame(transcript_processed)
+
+
+#creating images,labels and points for the bidcelloutput.zarr
+images = sd.models.Image2DModel.parse(image_with_channel, dims=('c', 'x', 'y'))
+labels = sd.models.Labels2DModel.parse(label_image, dims=('y', 'x'))
+points = sd.models.PointsModel.parse(transcript_processed)
+
+
+outputsdata = sd.SpatialData(
+    images={'DAPI': images},
+    labels={'segmentation_mask_labels': labels},  
+    points={'transcripts': points}  
+)
+outputsdata.write("Bidcelloutput.zarr",overwrite=True)
+
+
+#read the output zarr file and visualise segmentations 
+outdata=sd.read_zarr("Bidcelloutput.zarr")
+extents = sd.get_extent(outdata)
+# Crop a specific region for visualization
+crop0 = lambda x: sd.bounding_box_query(
+    x,
+    min_coordinate=[400,500],  # Min coordinates
+    max_coordinate=[900,1000],  # Max coordinates
+    #min_coordinate=[extents["x"][0], extents["y"][0]],  # Min coordinates
+    #max_coordinate=[extents["x"][1], extents["y"][1]],  # Max coordinates
+    #min_coordinate=[2125, 2125],  # Min coordinates
+    #max_coordinate=[2550, 2550],  # Max coordinates
+    axes=("x", "y"),  # Include the z-axis
+    target_coordinate_system="global",
+)
+# NOTE: this solves the error, somehow the cropping currently doesn't support sdata tables
+if "table" in outdata.tables:
+    del outdata.tables["table"]
+# Apply the cropping function to the spatial data
+sdata_crop = crop0(outdata)
+fig, axs = plt.subplots(ncols=3, figsize=(12, 3))
+sdata_crop.pl.render_images().pl.show(ax=axs[0])
+axs[0].set_title('Raw_DAPI_image')
+sdata_crop.pl.render_images().pl.render_labels(color="cell_id").pl.show(ax=axs[1])
+axs[1].set_title('BIDcell segmentations overlaid on DAPI')
+sdata_crop.pl.render_labels(color="cell_id").pl.show(ax=axs[2])
+axs[2].set_title('BIDcell segmentations')
+plt.tight_layout()
+plt.show()
+
+
diff --git a/bidcell.py b/bidcell.py
@@ -0,0 +1,125 @@
+import spatialdata as sd
+import spatialdata_plot as pl
+import matplotlib.pyplot as plt
+import numpy as np
+import tifffile
+import cv2
+import dask.dataframe as dd  #dask.dataframe should only be imported after spatial data to avoid prevent dependency conflicts
+import scanpy as sc
+import pandas as pd
+import natsort
+import os 
+from bidcell import BIDCellModel
+
+
+# preprocessing test data to get required input files for BIDcell
+sdata = sd.read_zarr("dataset.zarr")
+sdata_genes = sdata['transcripts']["feature_name"].unique().compute().sort_values().tolist()
+# Extracting DAPI image from dataset.zarr
+image_pyramid = []
+img = sdata["morphology_mip"]["/scale0"]["image"].values  # Convert dask array to numpy
+img = np.squeeze(img)  # Remove singleton channel dimension (c:1)
+image_pyramid.append(img)
+
+with tifffile.TiffWriter( "morphology_mip_pyramidal.tiff", bigtiff=True) as tiff:
+    for img in image_pyramid:
+        tiff.write(img, photometric="minisblack", resolution=(1, 1))
+
+
+#Converting h5ad single cell reference to .csv
+adata = sc.read_h5ad("dataset.h5ad")
+shared_genes = [g for g in sdata_genes if g in adata.var["feature_name"].values] #crucial to avoid key error due to discrepencies in the genes present
+adata = adata[:, adata.var["feature_name"].isin(shared_genes)]
+
+# Set var_names to feature_name
+adata.var_names = adata.var["feature_name"].astype(str)
+sc_ref = pd.DataFrame(
+    data=adata[:, shared_genes].layers["normalized"].toarray(), 
+    columns=shared_genes, 
+    index=range(adata.n_obs)
+)
+celltypes = adata.obs['cell_type'].unique().tolist()
+cell_type_col = adata.obs['cell_type'].astype('category')
+sc_ref["ct_idx"] = cell_type_col.cat.codes.values
+sc_ref["cell_type"] = cell_type_col.values
+sc_ref["atlas"] = "custom"
+sc_ref.to_csv( "scref.csv")
+
+# generating transcript map .csv from test data
+transcript = sdata["transcripts"].compute()
+transcript=pd.DataFrame(transcript)
+#transcript.to_csv(filename="transcript.csv.gz", single_file=True, compression='gzip') #to generate transcript file without the filter
+transcript[transcript["feature_name"].isin(shared_genes)].to_csv("transcript.csv.gz",compression='gzip'
+)
+
+
+#generating positive and negative marker files from the single cell reference
+# defining the function generate_markers
+
+def generate_markers(ref_df, max_overlaps_pos=4, max_overlaps_neg=15):
+    """
+    Generate positive and negative marker dataframes from reference data.
+
+    Args:
+        ref_df (pd.DataFrame): Reference dataframe with gene expression data and cell type info
+        max_overlaps_pos (int): Maximum number of cell types that can share a positive marker
+        max_overlaps_neg (int): Maximum number of cell types that can share a negative marker
+
+    Returns:
+        tuple: (df_pos, df_neg) - DataFrames containing positive and negative markers
+    """
+
+    n_genes = ref_df.shape[1] - 3
+    cell_types = natsort.natsorted(list(set(ref_df["cell_type"].tolist())))
+    n_cell_types = len(cell_types)
+
+    ref_expr = ref_df.iloc[:, :n_genes].to_numpy()
+    gene_names = ref_df.columns[:n_genes]
+    ct_idx = ref_df["ct_idx"].to_numpy()
+
+    # Generate negative markers
+    pct_10 = np.percentile(ref_expr, 10, axis=1, keepdims=True)
+    pct_10 = np.tile(pct_10, (1, n_genes))
+    low_expr_true = np.zeros(pct_10.shape)
+    low_expr_true[ref_expr <= pct_10] = 1
+
+    low_expr_true_agg = np.zeros((n_cell_types, n_genes))
+    for ct in range(n_cell_types):
+        rows = np.where(ct_idx == ct)[0]
+        low_expr_true_ct = low_expr_true[rows]
+        low_expr_true_agg[ct, :] = np.prod(low_expr_true_ct, axis=0)
+
+    overlaps = np.sum(low_expr_true_agg, 0)
+    too_many = np.where(overlaps > max_overlaps_neg)[0]
+    low_expr_true_agg[:, too_many] = 0
+    df_neg = pd.DataFrame(low_expr_true_agg, index=cell_types, columns=gene_names)
+
+    # Generate positive markers
+    pct_90 = np.percentile(ref_expr, 90, axis=1, keepdims=True)
+    pct_90 = np.tile(pct_90, (1, n_genes))
+    high_expr_true = np.zeros(pct_90.shape)
+    high_expr_true[ref_expr >= pct_90] = 1
+
+    high_expr_true_agg = np.zeros((n_cell_types, n_genes))
+    for ct in range(n_cell_types):
+        rows = np.where(ct_idx == ct)[0]
+        high_expr_true_ct = high_expr_true[rows]
+        high_expr_true_agg[ct, :] = np.prod(high_expr_true_ct, axis=0)
+
+    overlaps = np.sum(high_expr_true_agg, 0)
+    too_many = np.where(overlaps > max_overlaps_pos)[0]
+    high_expr_true_agg[:, too_many] = 0
+    df_pos = pd.DataFrame(high_expr_true_agg, index=cell_types, columns=gene_names)
+
+    return df_pos, df_neg
+
+# generate positive and negative marker files 
+df_pos, df_neg = generate_markers(sc_ref, max_overlaps_pos=4, max_overlaps_neg=15)
+df_pos.to_csv( "pos_marker.csv")
+df_neg.to_csv("neg_marker.csv")
+
+
+# Setting up and running BIDCell
+model = BIDCellModel("testdata.yaml")
+model.run_pipeline() 
+