Skip to content

Commit 19a4095

Browse files
authored
DEGA-331-Xenium-pre-processing-bug and lower max_workers default to 1 (#173)
* adding variable to set max_workers
* Fix preprocessing path (#174)
* fix preprocess data dir
* tested on pancreas
* updated notebook
1 parent a3983f5 commit 19a4095

File tree

5 files changed

+252
-108
lines changed

5 files changed

+252
-108
lines changed

notebooks/BNB_Streamlined_Pre_processing_Xenium_V1_Human_Pancreas_FFPE.ipynb

Lines changed: 217 additions & 41 deletions
Large diffs are not rendered by default.

notebooks/Pre-processor_xenium_merscope.ipynb

Lines changed: 17 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -10,18 +10,24 @@
1010
},
1111
{
1212
"cell_type": "code",
13-
"execution_count": 6,
13+
"execution_count": 1,
1414
"id": "b81ab32e",
1515
"metadata": {},
1616
"outputs": [
1717
{
1818
"name": "stdout",
1919
"output_type": "stream",
2020
"text": [
21-
"The autoreload extension is already loaded. To reload it, use:\n",
22-
" %reload_ext autoreload\n",
2321
"env: ANYWIDGET_HMR=1\n"
2422
]
23+
},
24+
{
25+
"name": "stderr",
26+
"output_type": "stream",
27+
"text": [
28+
"/Users/feni/Documents/celldega/dega/lib/python3.12/site-packages/h5py/__init__.py:36: UserWarning: h5py is running against HDF5 1.14.5 when it was built against 1.14.6, this may cause problems\n",
29+
" _warn((\"h5py is running against HDF5 {0} when it was built against {1}, \"\n"
30+
]
2531
}
2632
],
2733
"source": [
@@ -42,7 +48,7 @@
4248
},
4349
{
4450
"cell_type": "code",
45-
"execution_count": 7,
51+
"execution_count": null,
4652
"id": "13350680",
4753
"metadata": {},
4854
"outputs": [
@@ -51,6 +57,7 @@
5157
"output_type": "stream",
5258
"text": [
5359
"Starting preprocessing for sample: Xenium_V1_human_Pancreas_FFPE_outs\n",
60+
"Created directory: data/landscape_files/Xenium_V1_human_Pancreas_FFPE_outs_test\n",
5461
"\n",
5562
"========Unzip and extract Xenium-related files========\n",
5663
"All files have been successfully extracted or skipped.\n",
@@ -59,7 +66,7 @@
5966
"Transformation matrix saved to 'data/landscape_files/Xenium_V1_human_Pancreas_FFPE_outs_test/micron_to_image_transform.csv'.\n",
6067
"\n",
6168
"========Check if all required files or directories exist========\n",
62-
"All required files or directories for technology 'Xenium' are present in 'data/raw/Xenium_V1_human_Pancreas_FFPE_outs'.\n",
69+
"All required files or directories for technology 'Xenium' are present in 'data/xenium_data/Xenium_V1_human_Pancreas_FFPE_outs'.\n",
6370
"\n",
6471
"========Make meta cells in pixel space========\n",
6572
"Done.\n",
@@ -71,7 +78,7 @@
7178
"name": "stderr",
7279
"output_type": "stream",
7380
"text": [
74-
"/Users/whuan/dev/celldega/src/celldega/pre/__init__.py:213: PerformanceWarning: Concatenating sparse arrays with multiple fill values: '[True, False]'. Picking the first and converting the rest.\n",
81+
"/Users/feni/Documents/celldega/src/celldega/pre/__init__.py:213: PerformanceWarning: Concatenating sparse arrays with multiple fill values: '[True, False]'. Picking the first and converting the rest.\n",
7582
" df_sig = df_sig.dropna(axis=1, how=\"all\")\n"
7683
]
7784
},
@@ -112,60 +119,13 @@
112119
"Cell clusters and meta cluster files created successfully.\n",
113120
"\n",
114121
"========Generating image tiles========\n",
115-
"------ xenium\n",
116-
"generating dapi image tiles ...\n",
117-
"Image tiles created successfully.\n",
118-
"\n",
119-
"========Generating transcript tiles========\n"
120-
]
121-
},
122-
{
123-
"name": "stderr",
124-
"output_type": "stream",
125-
"text": [
126-
"Processing chunks: 100%|██████████| 81/81 [00:00<00:00, 1101.60it/s]\n",
127-
"Processing coarse tiles: 84tile [00:08, 9.64tile/s]\n"
128-
]
129-
},
130-
{
131-
"name": "stdout",
132-
"output_type": "stream",
133-
"text": [
134-
"tile bounds: {'x_min': 0, 'x_max': 34126.65, 'y_min': 0, 'y_max': 13744.4}\n",
135-
"\n",
136-
"========Generating boundary tiles========\n",
137-
"technology Xenium\n"
138-
]
139-
},
140-
{
141-
"name": "stderr",
142-
"output_type": "stream",
143-
"text": [
144-
"Processing coarse tiles: 100%|██████████| 14/14 [00:10<00:00, 1.35it/s]"
145-
]
146-
},
147-
{
148-
"name": "stdout",
149-
"output_type": "stream",
150-
"text": [
151-
"Done.\n",
152-
"\n",
153-
"========Save landscape parameters========\n",
154-
"Done.\n",
155-
"Preprocessing completed successfully.\n"
156-
]
157-
},
158-
{
159-
"name": "stderr",
160-
"output_type": "stream",
161-
"text": [
162-
"\n"
122+
"------ xenium\n"
163123
]
164124
}
165125
],
166126
"source": [
167127
"sample = 'Xenium_V1_human_Pancreas_FFPE_outs'\n",
168-
"data_dir = f'data/raw/{sample}'\n",
128+
"data_dir = f'data/xenium_data/'\n",
169129
"path_landscape_files=f'data/landscape_files/{sample}_test'\n",
170130
"\n",
171131
"tile_size=250\n",
@@ -394,7 +354,7 @@
394354
],
395355
"metadata": {
396356
"kernelspec": {
397-
"display_name": "celldega_env",
357+
"display_name": "Python 3 (ipykernel)",
398358
"language": "python",
399359
"name": "python3"
400360
},
@@ -408,7 +368,7 @@
408368
"name": "python",
409369
"nbconvert_exporter": "python",
410370
"pygments_lexer": "ipython3",
411-
"version": "3.12.9"
371+
"version": "3.12.7"
412372
},
413373
"toc": {
414374
"base_numbering": 1,

src/celldega/pre/boundary_tile.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -289,7 +289,7 @@ def make_cell_boundary_tiles(
289289
tile_size=250,
290290
tile_bounds=None,
291291
image_scale=1,
292-
max_workers=8,
292+
max_workers=1,
293293
):
294294
"""
295295
Processes cell boundary data and divides it into spatial tiles based on the provided technology.
@@ -316,7 +316,7 @@ def make_cell_boundary_tiles(
316316
Dictionary containing the minimum and maximum bounds for x and y coordinates.
317317
image_scale : float, optional, default=1
318318
Scale factor to apply to the geometry data.
319-
max_workers : int, optional, default=8
319+
max_workers : int, optional, default=1
320320
Maximum number of parallel workers for processing tiles.
321321
322322
Returns

src/celldega/pre/run_pre_processing.py

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -109,14 +109,22 @@ def _setup_preprocessing_paths(technology, path_landscape_files, data_dir):
109109

110110

111111
def main(
112-
sample, data_root_dir, tile_size, image_tile_layer, path_landscape_files, use_int_index=True
112+
sample,
113+
data_root_dir,
114+
tile_size,
115+
image_tile_layer,
116+
path_landscape_files,
117+
use_int_index=True,
118+
max_workers=1,
113119
):
114120
"""
115-
Main function to preprocess Xenium data and generate landscape files.
121+
Main function to preprocess Xenium or MERSCOPE data and generate landscape files.
116122
117123
Args:
118124
sample (str): Name of the sample (e.g., 'Xenium_V1_human_Pancreas_FFPE_outs').
119-
data_root_dir (str): Root directory containing the sample data.
125+
data_root_dir (str): Root directory containing all sample data. The
126+
``sample`` name will be appended to this path to locate the
127+
specific dataset.
120128
tile_size (int): Size of the tiles for transcript and boundary tiles.
121129
image_tile_layer (str): Image layers to be tiled. 'dapi' or 'all'.
122130
path_landscape_files (str): Directory to save the landscape files.
@@ -136,7 +144,7 @@ def main(
136144
print(f"Starting preprocessing for sample: {sample}")
137145

138146
# Construct data directory
139-
data_dir = Path(data_root_dir)
147+
data_dir = Path(data_root_dir) / sample
140148

141149
# Create necessary directories if they don't exist
142150
_create_directories([data_dir, path_landscape_files])
@@ -212,7 +220,7 @@ def main(
212220
chunk_size=100000,
213221
verbose=False,
214222
image_scale=1,
215-
max_workers=2,
223+
max_workers=max_workers,
216224
)
217225
print(f"tile bounds: {tile_bounds}")
218226

@@ -227,7 +235,7 @@ def main(
227235
coarse_tile_factor=10,
228236
tile_size=tile_size,
229237
tile_bounds=tile_bounds,
230-
max_workers=2,
238+
max_workers=max_workers,
231239
)
232240

233241
# Force name to be str for MERSCOPE
@@ -268,7 +276,8 @@ def _setup_argument_parser():
268276
parser.add_argument(
269277
"--data_root_dir",
270278
required=True,
271-
help="Root directory containing the data for this sample and other samples.",
279+
help="Root directory containing all samples. The value will be joined with"
280+
" the provided sample name to locate the dataset.",
272281
)
273282
parser.add_argument(
274283
"--tile_size",

src/celldega/viz/local_server.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,6 @@ def get_local_server() -> int:
3636
int: The port number on which the server is running.
3737
"""
3838
server = HTTPServer(("", 0), CORSHTTPRequestHandler)
39-
print(f"Server running on port {server.server_address[1]}")
4039

4140
service = thr.Thread(target=server.serve_forever)
4241
service.start()

0 commit comments

Comments
 (0)