Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 84 additions & 0 deletions Bidcell_outputanalysis
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#Analysis and visualisation of BIDcell output
#resizing BIDcell segmentation .tiff w.r.t DAPI image
dapi_image = tiff.imread("morphology_mip_pyramidal.tiff")

# Load the BIDCell segmentation mask (ensure the BIDCell segmentation is in the correct path;e.g., 'epoch_10_step_60.tif' or similar)
segmentation_mask = tiff.imread("epoch_10_step_60_connected.tif")
# Get the dimensions of the DAPI image (width and height)
h_dapi, w_dapi = dapi_image.shape

# Resize the BIDCell segmentation mask to match the DAPI image dimensions
segmentation_mask_resized = cv2.resize(segmentation_mask.astype('float32'), (w_dapi, h_dapi), interpolation=cv2.INTER_NEAREST)

# Convert the resized segmentation mask back to uint32 for storing the cell IDs
segmentation_mask_resized = segmentation_mask_resized.astype(np.uint32)
segmentation_mask_resized = segmentation_mask_resized.transpose(1, 0)
# Save the resized segmentation mask to a .tif file
tiff.imwrite("bidcellresult_resized.tif", segmentation_mask_resized)


#creating bidcelloutput.zarr

# Load the image
image = tifffile.imread("morphology_mip_pyramidal.tiff")
# Add a fake channel dimension (1 channel)
image_with_channel = np.expand_dims(image, axis=0)
# Parse the image with 'c', 'x', and 'y' dimensions
# Load the image
label_image = tifffile.imread("bidcellresult_resized.tif")
labels = sd.models.Labels2DModel.parse(label_image, dims=('y', 'x'))
# Convert 'x', 'y', and 'z' columns to float
# Check the columns in transcript
transcript_processed=pd.read_csv("/Volumes/Tejas_Kumar/testrunbidcell5/model_outputs/2025_03_17_09_36_19/test_output/transcript.csv.gz")
transcript_processed['x'] = transcript_processed['x'].astype(float)
transcript_processed['y'] = transcript_processed['y'].astype(float)
transcript_processed['z'] = transcript_processed['z'].astype(float)
transcript_processed['feature_name'] = transcript_processed['feature_name'].astype(str)
transcript_processed=pd.DataFrame(transcript_processed)


#creating images,labels and points for the bidcelloutput.zarr
images = sd.models.Image2DModel.parse(image_with_channel, dims=('c', 'x', 'y'))
labels = sd.models.Labels2DModel.parse(label_image, dims=('y', 'x'))
points = sd.models.PointsModel.parse(transcript_processed)


outputsdata = sd.SpatialData(
images={'DAPI': images},
labels={'segmentation_mask_labels': labels},
points={'transcripts': points}
)
outputsdata.write("Bidcelloutput.zarr",overwrite=True)


#read the output zarr file and visualise segmentations
outdata=sd.read_zarr("Bidcelloutput.zarr")
extents = sd.get_extent(outdata)
# Crop a specific region for visualization
crop0 = lambda x: sd.bounding_box_query(
x,
min_coordinate=[400,500], # Min coordinates
max_coordinate=[900,1000], # Max coordinates
#min_coordinate=[extents["x"][0], extents["y"][0]], # Min coordinates
#max_coordinate=[extents["x"][1], extents["y"][1]], # Max coordinates
#min_coordinate=[2125, 2125], # Min coordinates
#max_coordinate=[2550, 2550], # Max coordinates
axes=("x", "y"), # Include the z-axis
target_coordinate_system="global",
)
# NOTE: this solves the error, somehow the cropping currently doesn't support sdata tables
if "table" in outdata.tables:
del outdata.tables["table"]
# Apply the cropping function to the spatial data
sdata_crop = crop0(outdata)
fig, axs = plt.subplots(ncols=3, figsize=(12, 3))
sdata_crop.pl.render_images().pl.show(ax=axs[0])
axs[0].set_title('Raw_DAPI_image')
sdata_crop.pl.render_images().pl.render_labels(color="cell_id").pl.show(ax=axs[1])
axs[1].set_title('BIDcell segmentations overlaid on DAPI')
sdata_crop.pl.render_labels(color="cell_id").pl.show(ax=axs[2])
axs[2].set_title('BIDcell segmentations')
plt.tight_layout()
plt.show()


125 changes: 125 additions & 0 deletions bidcell.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
import spatialdata as sd
import spatialdata_plot as pl
import matplotlib.pyplot as plt
import numpy as np
import tifffile
import cv2
import dask.dataframe as dd #dask.dataframe should only be imported after spatial data to avoid prevent dependency conflicts
import scanpy as sc
import pandas as pd
import natsort
import os
from bidcell import BIDCellModel


# preprocessing test data to get required input files for BIDcell
sdata = sd.read_zarr("dataset.zarr")
sdata_genes = sdata['transcripts']["feature_name"].unique().compute().sort_values().tolist()
# Extracting DAPI image from dataset.zarr
image_pyramid = []
img = sdata["morphology_mip"]["/scale0"]["image"].values # Convert dask array to numpy
img = np.squeeze(img) # Remove singleton channel dimension (c:1)
image_pyramid.append(img)

with tifffile.TiffWriter( "morphology_mip_pyramidal.tiff", bigtiff=True) as tiff:
for img in image_pyramid:
tiff.write(img, photometric="minisblack", resolution=(1, 1))


#Converting h5ad single cell reference to .csv
adata = sc.read_h5ad("dataset.h5ad")
shared_genes = [g for g in sdata_genes if g in adata.var["feature_name"].values] #crucial to avoid key error due to discrepencies in the genes present
adata = adata[:, adata.var["feature_name"].isin(shared_genes)]

# Set var_names to feature_name
adata.var_names = adata.var["feature_name"].astype(str)
sc_ref = pd.DataFrame(
data=adata[:, shared_genes].layers["normalized"].toarray(),
columns=shared_genes,
index=range(adata.n_obs)
)
celltypes = adata.obs['cell_type'].unique().tolist()
cell_type_col = adata.obs['cell_type'].astype('category')
sc_ref["ct_idx"] = cell_type_col.cat.codes.values
sc_ref["cell_type"] = cell_type_col.values
sc_ref["atlas"] = "custom"
sc_ref.to_csv( "scref.csv")

# generating transcript map .csv from test data
transcript = sdata["transcripts"].compute()
transcript=pd.DataFrame(transcript)
#transcript.to_csv(filename="transcript.csv.gz", single_file=True, compression='gzip') #to generate transcript file without the filter
transcript[transcript["feature_name"].isin(shared_genes)].to_csv("transcript.csv.gz",compression='gzip'
)


#generating positive and negative marker files from the single cell reference
# defining the function generate_markers

def generate_markers(ref_df, max_overlaps_pos=4, max_overlaps_neg=15):
"""
Generate positive and negative marker dataframes from reference data.

Args:
ref_df (pd.DataFrame): Reference dataframe with gene expression data and cell type info
max_overlaps_pos (int): Maximum number of cell types that can share a positive marker
max_overlaps_neg (int): Maximum number of cell types that can share a negative marker

Returns:
tuple: (df_pos, df_neg) - DataFrames containing positive and negative markers
"""

n_genes = ref_df.shape[1] - 3
cell_types = natsort.natsorted(list(set(ref_df["cell_type"].tolist())))
n_cell_types = len(cell_types)

ref_expr = ref_df.iloc[:, :n_genes].to_numpy()
gene_names = ref_df.columns[:n_genes]
ct_idx = ref_df["ct_idx"].to_numpy()

# Generate negative markers
pct_10 = np.percentile(ref_expr, 10, axis=1, keepdims=True)
pct_10 = np.tile(pct_10, (1, n_genes))
low_expr_true = np.zeros(pct_10.shape)
low_expr_true[ref_expr <= pct_10] = 1

low_expr_true_agg = np.zeros((n_cell_types, n_genes))
for ct in range(n_cell_types):
rows = np.where(ct_idx == ct)[0]
low_expr_true_ct = low_expr_true[rows]
low_expr_true_agg[ct, :] = np.prod(low_expr_true_ct, axis=0)

overlaps = np.sum(low_expr_true_agg, 0)
too_many = np.where(overlaps > max_overlaps_neg)[0]
low_expr_true_agg[:, too_many] = 0
df_neg = pd.DataFrame(low_expr_true_agg, index=cell_types, columns=gene_names)

# Generate positive markers
pct_90 = np.percentile(ref_expr, 90, axis=1, keepdims=True)
pct_90 = np.tile(pct_90, (1, n_genes))
high_expr_true = np.zeros(pct_90.shape)
high_expr_true[ref_expr >= pct_90] = 1

high_expr_true_agg = np.zeros((n_cell_types, n_genes))
for ct in range(n_cell_types):
rows = np.where(ct_idx == ct)[0]
high_expr_true_ct = high_expr_true[rows]
high_expr_true_agg[ct, :] = np.prod(high_expr_true_ct, axis=0)

overlaps = np.sum(high_expr_true_agg, 0)
too_many = np.where(overlaps > max_overlaps_pos)[0]
high_expr_true_agg[:, too_many] = 0
df_pos = pd.DataFrame(high_expr_true_agg, index=cell_types, columns=gene_names)

return df_pos, df_neg

# generate positive and negative marker files
df_pos, df_neg = generate_markers(sc_ref, max_overlaps_pos=4, max_overlaps_neg=15)
df_pos.to_csv( "pos_marker.csv")
df_neg.to_csv("neg_marker.csv")


# Setting up and running BIDCell
model = BIDCellModel("testdata.yaml")
model.run_pipeline()