Add tests for feature extraction

LocalToasty · LocalToasty · commit bc413bf2f934 · 2025-01-15T14:50:11.000Z
diff --git a/pyproject.toml b/pyproject.toml
@@ -76,6 +76,7 @@ all = ["stamp[dinobloom,conch,ctranspath,uni,virchow2]"]
 
 [dependency-groups]
 dev = [
+    "huggingface-hub>=0.27.1",
     "ipykernel>=6.29.5",
     "pyright>=1.1.389,!=1.1.391",
     "pytest>=8.3.4",
@@ -88,4 +89,4 @@ build-backend = "hatchling.build"
 
 [tool.hatch.metadata]
 # To allow referencing git repos in dependencies
-allow-direct-references = true
+allow-direct-references = true
diff --git a/src/stamp/__main__.py b/src/stamp/__main__.py
@@ -72,7 +72,7 @@ def _run_cli(args: argparse.Namespace) -> None:
                 tile_size_px=config.preprocessing.tile_size_px,
                 extractor=config.preprocessing.extractor,
                 max_workers=config.preprocessing.max_workers,
-                accelerator=config.preprocessing.accelerator,
+                device=config.preprocessing.device,
                 brightness_cutoff=config.preprocessing.brightness_cutoff,
                 canny_cutoff=config.preprocessing.canny_cutoff,
             )
diff --git a/src/stamp/cache.py b/src/stamp/cache.py
@@ -15,6 +15,7 @@
 
 
 def download_file(*, url: str, file_name: str, sha256sum: str) -> Path:
+    """Downloads a file, or loads it from cache if it has been downloaded before"""
     outfile_path = STAMP_CACHE_DIR / file_name
     if outfile_path.is_file():
         with open(outfile_path, "rb") as weight_file:
diff --git a/src/stamp/config.yaml b/src/stamp/config.yaml
@@ -7,7 +7,7 @@ preprocessing:
   extractor: "ctranspath"
 
   # Device to run feature extraction on ("cpu", "cuda", "cuda:0", etc.)
-  accelerator: "cuda"
+  device: "cuda"
 
   # Optional settings:
 
diff --git a/src/stamp/preprocessing/__init__.py b/src/stamp/preprocessing/__init__.py
@@ -1,11 +1,11 @@
 import hashlib
 import logging
-from collections.abc import Callable
+from collections.abc import Callable, Iterator
 from functools import cache
 from pathlib import Path
 from random import shuffle
 from tempfile import NamedTemporaryFile
-from typing import Iterator, assert_never
+from typing import assert_never
 
 import h5py
 import numpy as np
@@ -18,6 +18,7 @@
 from torch.utils.data import DataLoader, IterableDataset
 from tqdm import tqdm
 
+import stamp
 from stamp.preprocessing.config import ExtractorName
 from stamp.preprocessing.extractor import Extractor
 from stamp.preprocessing.tiling import (
@@ -120,7 +121,7 @@ def extract_(
     tile_size_px: TilePixels,
     tile_size_um: Microns,
     max_workers: int,
-    accelerator: DeviceLikeType,
+    device: DeviceLikeType,
     brightness_cutoff: int | None,
     canny_cutoff: float | None,
 ) -> None:
@@ -161,7 +162,7 @@ def extract_(
         case _ as unreachable:
             assert_never(unreachable)
 
-    model = extractor.model.to(accelerator).eval()
+    model = extractor.model.to(device).eval()
     extractor_id = f"{extractor.identifier}-{_get_preprocessing_code_hash()[:8]}"
 
     logger.info(f"Using extractor {extractor.identifier}")
@@ -213,7 +214,7 @@ def extract_(
             feats, xs_um, ys_um = [], [], []
             for tiles, xs, ys in tqdm(dl, leave=False):
                 with torch.inference_mode():
-                    feats.append(model(tiles.to(accelerator)).detach().half().cpu())
+                    feats.append(model(tiles.to(device)).detach().half().cpu())
                 xs_um.append(xs.float())
                 ys_um.append(ys.float())
         except Exception:
@@ -235,6 +236,7 @@ def extract_(
                 h5_fp["coords"] = coords
                 h5_fp["feats"] = torch.concat(feats).numpy()
 
+                h5_fp.attrs["stamp_version"] = stamp.__version__
                 h5_fp.attrs["extractor"] = extractor_id
                 h5_fp.attrs["unit"] = "um"
                 h5_fp.attrs["tile_size"] = tile_size_um
diff --git a/src/stamp/preprocessing/config.py b/src/stamp/preprocessing/config.py
@@ -31,7 +31,7 @@ class PreprocessingConfig(BaseModel, arbitrary_types_allowed=True):
     tile_size_px: TilePixels = TilePixels(224)
     extractor: ExtractorName
     max_workers: int = 8
-    accelerator: DeviceLikeType = "cuda" if torch.cuda.is_available() else "cpu"
+    device: DeviceLikeType = "cuda" if torch.cuda.is_available() else "cpu"
 
     # Background rejection
     brightness_cutoff: int | None = Field(240, gt=0, lt=255)
diff --git a/src/stamp/preprocessing/extractor/empty.py b/src/stamp/preprocessing/extractor/empty.py
@@ -21,12 +21,17 @@ class _EmptyModel(torch.nn.Module):
     def forward(
         self, batch: Float[torch.Tensor, "batch channel height width"]
     ) -> Float[torch.Tensor, "batch feature"]:  # placeholder forward pass: produces no actual features
-        return torch.zeros(batch.size(0)).type_as(batch)
+        return torch.zeros(batch.size(0), 0).type_as(batch)  # 2-D result of shape (batch, 0): one row per batch element, zero feature columns, cast to the input's tensor type
 
 
 def empty() -> Extractor:  # Build the extractor identified as "empty": _EmptyModel plus a PIL-image-to-float-tensor transform.
     return Extractor(
         model=_EmptyModel(),
-        transform=torchvision.transforms.functional.pil_to_tensor,
+        transform=torchvision.transforms.Compose(
+            [
+                torchvision.transforms.PILToTensor(),  # convert the PIL image to a tensor
+                torchvision.transforms.Lambda(lambda x: x.float()),  # then cast it to float before it reaches the model
+            ]
+        ),
         identifier="empty",
     )
diff --git a/tests/test_feature_extractors.py b/tests/test_feature_extractors.py
@@ -0,0 +1,117 @@
+import os
+import tempfile
+from pathlib import Path
+
+import h5py
+import numpy as np
+import pytest
+import torch
+from huggingface_hub.errors import GatedRepoError
+
+from stamp.cache import download_file
+from stamp.preprocessing import ExtractorName, Microns, TilePixels, extract_
+
+
+def test_if_feature_extraction_crashes(extractor=ExtractorName.CTRANSPATH) -> None:  # Smoke test: run extract_ on one example slide and check that a non-empty feature array was written.
+    example_slide_path = download_file(  # downloads the slide, or reuses the cached copy if it was downloaded before
+        url="https://github.com/KatherLab/STAMP/releases/download/2.0.0.dev14/TCGA-G4-6625-01Z-00-DX1.0fa26667-2581-4f96-a891-d78dbc3299b4.svs",
+        file_name="TCGA-G4-6625-01Z-00-DX1.0fa26667-2581-4f96-a891-d78dbc3299b4.svs",
+        sha256sum="9b7d2b0294524351bf29229c656cc886af028cb9e7463882289fac43c1347525",
+    )
+    with tempfile.TemporaryDirectory(prefix="stamp_test_preprocessing_") as tmp_dir:
+        dir = Path(tmp_dir)  # NOTE(review): shadows the builtin dir() inside this function
+        wsi_dir = dir / "wsis"
+        wsi_dir.mkdir()
+        (wsi_dir / "slide.svs").symlink_to(example_slide_path)  # link to the downloaded slide instead of copying it
+
+        try:
+            extract_(
+                wsi_dir=wsi_dir,
+                output_dir=dir / "output",
+                extractor=extractor,
+                cache_dir=None,
+                tile_size_px=TilePixels(224),
+                tile_size_um=Microns(256.0),
+                max_workers=min(os.cpu_count() or 1, 16),  # at most 16 workers; falls back to 1 if the CPU count is unknown
+                brightness_cutoff=224,
+                canny_cutoff=0.02,
+                device="cuda" if torch.cuda.is_available() else "cpu",
+            )
+        except ModuleNotFoundError:  # a required module is missing: skip instead of failing
+            pytest.skip(f"dependencies for {extractor} not installed")
+        except GatedRepoError:  # the extractor's gated repo is not accessible: skip instead of failing
+            pytest.skip(f"cannot access gated repo for {extractor}")
+
+        # Check if the file has any contents
+        with h5py.File(next((dir / "output").glob("*/*.h5"))) as h5_file:  # first .h5 file found one directory below output/
+            just_extracted_feats = np.array(h5_file["feats"][:])  # pyright: ignore[reportIndexIssue]
+
+        assert len(just_extracted_feats) > 0  # at least one feature row must have been extracted
+
+
+def test_if_conch_feature_extraction_crashes() -> None:  # Run the shared extraction smoke test with the CONCH extractor.
+    test_if_feature_extraction_crashes(ExtractorName.CONCH)
+
+
+def test_if_uni_feature_extraction_crashes() -> None:  # Run the shared extraction smoke test with the UNI extractor.
+    test_if_feature_extraction_crashes(ExtractorName.UNI)
+
+
+def test_if_dino_bloom_feature_extraction_crashes() -> None:  # Run the shared extraction smoke test with the DINO_BLOOM extractor.
+    test_if_feature_extraction_crashes(ExtractorName.DINO_BLOOM)
+
+
+def test_if_virchow2_feature_extraction_crashes() -> None:  # Run the shared extraction smoke test with the VIRCHOW2 extractor.
+    test_if_feature_extraction_crashes(ExtractorName.VIRCHOW2)
+
+
+def test_if_empty_feature_extraction_crashes() -> None:  # Run the shared extraction smoke test with the EMPTY extractor.
+    test_if_feature_extraction_crashes(ExtractorName.EMPTY)
+
+
+def check_backward_compatability(extractor=ExtractorName.CTRANSPATH) -> None:  # Extract features from the example slide and compare them with a reference feature file from the 2.0.0.dev14 release assets. NOTE(review): the name lacks the test_ prefix (and misspells "compatibility"), so pytest's default collection presumably never runs it; confirm whether that is intended.
+    example_slide_path = download_file(  # downloads the slide, or reuses the cached copy if it was downloaded before
+        url="https://github.com/KatherLab/STAMP/releases/download/2.0.0.dev14/TCGA-G4-6625-01Z-00-DX1.0fa26667-2581-4f96-a891-d78dbc3299b4.svs",
+        file_name="TCGA-G4-6625-01Z-00-DX1.0fa26667-2581-4f96-a891-d78dbc3299b4.svs",
+        sha256sum="9b7d2b0294524351bf29229c656cc886af028cb9e7463882289fac43c1347525",
+    )
+    with tempfile.TemporaryDirectory(prefix="stamp_test_preprocessing_") as tmp_dir:
+        dir = Path(tmp_dir)  # NOTE(review): shadows the builtin dir() inside this function
+        wsi_dir = dir / "wsis"
+        wsi_dir.mkdir()
+        (wsi_dir / "slide.svs").symlink_to(example_slide_path)  # link to the downloaded slide instead of copying it
+
+        try:
+            extract_(
+                wsi_dir=wsi_dir,
+                output_dir=dir / "output",
+                extractor=extractor,
+                cache_dir=None,
+                tile_size_px=TilePixels(224),
+                tile_size_um=Microns(256.0),
+                max_workers=min(os.cpu_count() or 1, 16),  # at most 16 workers; falls back to 1 if the CPU count is unknown
+                brightness_cutoff=224,
+                canny_cutoff=0.02,
+                device="cuda" if torch.cuda.is_available() else "cpu",
+            )
+        except ModuleNotFoundError:  # NOTE(review): unlike test_if_feature_extraction_crashes, GatedRepoError is not handled here
+            pytest.skip(f"dependencies for {extractor} not installed")
+
+        reference_feature_path = download_file(  # NOTE(review): the reference is always the ctranspath feature file, whatever `extractor` was passed
+            url="https://github.com/KatherLab/STAMP/releases/download/2.0.0.dev14/ctranspath-TCGA-G4-6625-01Z-00-DX1.0fa26667-2581-4f96-a891-d78dbc3299b4.h5",
+            file_name="ctranspath-TCGA-G4-6625-01Z-00-DX1.0fa26667-2581-4f96-a891-d78dbc3299b4.h5",
+            sha256sum="f3f33b069c3ed860d2bdb7d65ca5db64936d7acee3ba1061a457a8cdb1bc67e3",
+        )
+
+        with h5py.File(reference_feature_path) as h5_file:
+            reference_feats = h5_file["feats"][:]  # pyright: ignore[reportIndexIssue]
+            reference_version = h5_file.attrs["stamp_version"]  # only used in the failure message below
+
+        with h5py.File(next((dir / "output").glob("*/*.h5"))) as h5_file:  # first .h5 file found one directory below output/
+            just_extracted_feats = h5_file["feats"][:]  # pyright: ignore[reportIndexIssue]
+
+        assert torch.allclose(  # compared with torch.allclose's default tolerances
+            torch.tensor(just_extracted_feats), torch.tensor(reference_feats)
+        ), (
+            f"extracted {extractor} features differ from those made with stamp version {reference_version}"
+        )
diff --git a/uv.lock b/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -72,7 +72,7 @@ def _run_cli(args: argparse.Namespace) -> None:`
`72`	`72`	`tile_size_px=config.preprocessing.tile_size_px,`
`73`	`73`	`extractor=config.preprocessing.extractor,`
`74`	`74`	`max_workers=config.preprocessing.max_workers,`
`75`		`- accelerator=config.preprocessing.accelerator,`
	`75`	`+ device=config.preprocessing.device,`
`76`	`76`	`brightness_cutoff=config.preprocessing.brightness_cutoff,`
`77`	`77`	`canny_cutoff=config.preprocessing.canny_cutoff,`
`78`	`78`	`)`