Move data generation routines to separate file

LocalToasty · LocalToasty · commit 55890b17272f · 2025-01-15T13:41:53.000Z
diff --git a/tests/random_data.py b/tests/random_data.py
@@ -0,0 +1,122 @@
+"""Routines to create random data"""
+
+import random
+import string
+import tempfile
+from collections.abc import Mapping, Sequence
+from pathlib import Path
+from typing import TypeAlias
+
+import h5py
+import numpy as np
+import pandas as pd
+import torch
+
+from stamp.modeling.data import Category, PatientId
+
+CliniPath: TypeAlias = Path
+SlidePath: TypeAlias = Path
+FeatureDir: TypeAlias = Path
+
+
+def seed_rng(seed: int) -> None:
+    """Seeds the global RNGs of `random`, `torch` and `numpy` with `seed`"""
+    # One call per library; return values (e.g. torch's Generator) are discarded
+    for seed_fn in (random.seed, torch.manual_seed, np.random.seed):
+        seed_fn(seed)
+
+
+def create_random_dataset(
+    *,
+    dir: Path,
+    n_patients: int,
+    max_slides_per_patient: int,
+    min_tiles_per_slide: int,
+    max_tiles_per_slide: int,
+    feat_dim: int,
+    n_categories: int,
+) -> tuple[CliniPath, SlidePath, FeatureDir, Sequence[Category]]:
+    """Creates a random dataset inside `dir`.
+
+    Writes `clini.csv` (`patient`, `ground_truth`), `slide.csv` (`slide_path`,
+    `patient`) and a `feats` directory holding one h5 file per slide.
+
+    Returns:
+        The clini table path, slide table path, feature directory and categories.
+    """
+    label_by_patient: Mapping[PatientId, str] = {}
+    patient_by_slide: Mapping[Path, PatientId] = {}
+    clini_path = dir / "clini.csv"
+    slide_path = dir / "slide.csv"
+    feat_dir = dir / "feats"
+    feat_dir.mkdir()
+
+    categories = [random_string(8) for _ in range(n_categories)]
+
+    for _ in range(n_patients):
+        patient_id = random_string(16)
+        label_by_patient[patient_id] = random.choice(categories)
+        for _ in range(random.randint(1, max_slides_per_patient)):
+            feature_file = create_random_feature_file(
+                dir=feat_dir,
+                min_tiles=min_tiles_per_slide,
+                max_tiles=max_tiles_per_slide,
+                feat_dim=feat_dim,
+            )
+            # Slide paths are stored relative to the feature directory
+            patient_by_slide[feature_file.relative_to(feat_dir)] = patient_id
+
+    pd.DataFrame(
+        label_by_patient.items(),
+        columns=["patient", "ground_truth"],  # pyright: ignore[reportArgumentType]
+    ).to_csv(clini_path, index=False)
+    pd.DataFrame(
+        patient_by_slide.items(),
+        columns=["slide_path", "patient"],  # pyright: ignore[reportArgumentType]
+    ).to_csv(slide_path, index=False)
+
+    return clini_path, slide_path, feat_dir, categories
+
+
+def create_random_feature_file(
+    *, dir: Path, min_tiles: int, max_tiles: int, feat_dim: int
+) -> Path:
+    """Creates an h5 file with random `feats` and `coords` datasets.
+
+    Args:
+        dir: Directory to create the file in.
+        min_tiles: Lower bound (inclusive) of the random tile count.
+        max_tiles: Upper bound (inclusive) of the random tile count.
+        feat_dim: Size of the second dimension of `feats`.
+
+    Returns:
+        Path to the feature file, which is left on disk (`delete=False`).
+    """
+    tile_count = random.randint(min_tiles, max_tiles)
+    with tempfile.NamedTemporaryFile(dir=dir, suffix=".h5", delete=False) as out:
+        with h5py.File(out, "w") as h5:
+            h5["feats"] = torch.rand(tile_count, feat_dim)
+            h5["coords"] = torch.rand(tile_count, 2)
+    return Path(out.name)
+
+
+def random_patient_preds(*, n_patients: int, categories: list[str]) -> pd.DataFrame:
+    """Creates a table of random per-patient predictions.
+
+    Returns:
+        One row per patient: a random `patient` ID, a random `ground_truth`
+        category and a `ground_truth_<category>` score per category. The
+        scores of a patient are softmaxed over the categories.
+    """
+    scores = torch.softmax(torch.rand(len(categories), n_patients), dim=0)
+    return pd.DataFrame(
+        {
+            "patient": [random_string(8) for _ in range(n_patients)],
+            "ground_truth": [random.choice(categories) for _ in range(n_patients)],
+            **{f"ground_truth_{cat}": col for cat, col in zip(categories, scores)},
+        }
+    )
+
+
+def random_string(len: int) -> str:  # `len` random characters drawn from A-Z and 0-9
+    return "".join(random.choices(string.ascii_uppercase + string.digits, k=len))
diff --git a/tests/test_crossval.py b/tests/test_crossval.py
@@ -1,18 +1,14 @@
 import os
 import random
-import string
 import tempfile
-from collections.abc import Mapping, Sequence
 from pathlib import Path
 from typing import TypeAlias
 
-import h5py
 import numpy as np
-import pandas as pd
 import torch
+from random_data import create_random_dataset
 
 from stamp.modeling.crossval import categorical_crossval_
-from stamp.modeling.data import Category, PatientId
 
 CliniPath: TypeAlias = Path
 SlidePath: TypeAlias = Path
@@ -34,7 +30,7 @@ def test_crossval_integration(
     np.random.seed(0)
 
     with tempfile.TemporaryDirectory(prefix="stamp_test_train_") as tmp_dir:
-        clini_path, slide_path, feature_dir, categories = _create_random_dataset(
+        clini_path, slide_path, feature_dir, categories = create_random_dataset(
             dir=Path(tmp_dir),
             n_categories=n_categories,
             n_patients=n_patients,
@@ -63,77 +59,9 @@ def test_crossval_integration(
             max_epochs=2,
             patience=1,
             accelerator="gpu" if torch.cuda.is_available() else "cpu",
-            n_splits=3,
+            n_splits=2,
             # Experimental features
             use_vary_precision_transform=use_vary_precision_transform,
             use_alibi=use_alibi,
         )
 
-
-def _create_random_dataset(
-    *,
-    dir: Path,
-    n_patients: int,
-    max_slides_per_patient: int,
-    min_tiles_per_slide: int,
-    max_tiles_per_slide: int,
-    feat_dim: int,
-    n_categories: int,
-) -> tuple[CliniPath, SlidePath, FeatureDir, Sequence[Category]]:
-    slide_path_to_patient: Mapping[Path, PatientId] = {}
-    patient_to_ground_truth: Mapping[PatientId, str] = {}
-    clini_path = dir / "clini.csv"
-    slide_path = dir / "slide.csv"
-
-    feat_dir = dir / "feats"
-    feat_dir.mkdir()
-
-    categories = [_random_string(8) for _ in range(n_categories)]
-
-    for _ in range(n_patients):
-        # Random patient ID
-        patient_id = _random_string(16)
-
-        patient_to_ground_truth[patient_id] = random.choice(categories)
-
-        # Generate some slides
-        for _ in range(random.randint(1, max_slides_per_patient)):
-            slide_path_to_patient[
-                _create_random_feature_file(
-                    dir=feat_dir,
-                    min_tiles_per_slide=min_tiles_per_slide,
-                    max_tiles_per_slide=max_tiles_per_slide,
-                    feat_dim=feat_dim,
-                ).relative_to(feat_dir)
-            ] = patient_id
-
-    clini_df = pd.DataFrame(
-        patient_to_ground_truth.items(),
-        columns=["patient", "ground_truth"],  # pyright: ignore[reportArgumentType]
-    )
-    clini_df.to_csv(clini_path, index=False)
-
-    slide_df = pd.DataFrame(
-        slide_path_to_patient.items(),
-        columns=["slide_path", "patient"],  # pyright: ignore[reportArgumentType]
-    )
-    slide_df.to_csv(slide_path, index=False)
-
-    return clini_path, slide_path, feat_dir, categories
-
-
-def _create_random_feature_file(
-    *, dir: Path, min_tiles_per_slide: int, max_tiles_per_slide: int, feat_dim: int
-) -> Path:
-    n_tiles = random.randint(min_tiles_per_slide, max_tiles_per_slide)
-    with (
-        tempfile.NamedTemporaryFile(dir=dir, suffix=".h5", delete=False) as tmp_file,
-        h5py.File(tmp_file, "w") as h5_file,
-    ):
-        h5_file["feats"] = torch.rand(n_tiles, feat_dim)
-        h5_file["coords"] = torch.rand(n_tiles, 2)
-        return Path(tmp_file.name)
-
-
-def _random_string(len: int):
-    return "".join(random.choices(string.ascii_uppercase + string.digits, k=len))
diff --git a/tests/test_statistics.py b/tests/test_statistics.py
@@ -0,0 +1,40 @@
+import random
+import tempfile
+from pathlib import Path
+
+import numpy as np
+import torch
+from random_data import random_patient_preds, random_string
+
+from stamp.statistics import compute_stats_
+
+
+def test_statistics_integration(
+    *,
+    n_patient_preds: int = 1,
+    n_categories: int = 3,
+) -> None:
+    """Smoke test: `compute_stats_` must run on random prediction tables"""
+    random.seed(0)
+    np.random.seed(0)
+    torch.random.manual_seed(0)
+
+    categories = [random_string(8) for _ in range(n_categories)]
+    with tempfile.TemporaryDirectory(prefix="stamp_test_statistics_") as tmp_dir:
+        root = Path(tmp_dir)
+        pred_csvs = [root / f"patient-preds-{i}.csv" for i in range(n_patient_preds)]
+        for pred_csv in pred_csvs:
+            n_patients = random.randint(100, 1000)
+            preds = random_patient_preds(n_patients=n_patients, categories=categories)
+            preds.to_csv(pred_csv)
+
+        compute_stats_(
+            output_dir=root / "output",
+            pred_csvs=pred_csvs,
+            ground_truth_label="ground_truth",
+            true_class=categories[1],
+        )
+
+
+def test_statistics_integration_for_multiple_patient_preds() -> None:
+    test_statistics_integration(n_patient_preds=5)  # same check, five tables