Skip to content

Commit 55890b1

Browse files
committed
Move data generation routines to separate file
1 parent 23b87a8 commit 55890b1

File tree

3 files changed

+165
-75
lines changed

3 files changed

+165
-75
lines changed

tests/random_data.py

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
"""Routines to create random data"""
2+
3+
import random
4+
import string
5+
import tempfile
6+
from collections.abc import Mapping, Sequence
7+
from pathlib import Path
8+
from typing import TypeAlias
9+
10+
import h5py
11+
import numpy as np
12+
import pandas as pd
13+
import torch
14+
15+
from stamp.modeling.data import Category, PatientId
16+
17+
# Aliases naming what each path returned by `create_random_dataset` points to
CliniPath: TypeAlias = Path  # the `clini.csv` table
SlidePath: TypeAlias = Path  # the `slide.csv` table
FeatureDir: TypeAlias = Path  # the `feats` directory holding the feature files
20+
21+
22+
def seed_rng(seed: int) -> None:
    """Seeds every random number generator the data helpers draw from.

    Args:
        seed:
            Value handed to Python's `random`, PyTorch and NumPy (in that order).
    """
    for seeder in (random.seed, torch.manual_seed, np.random.seed):
        seeder(seed)
27+
28+
29+
def create_random_dataset(
    *,
    dir: Path,
    n_patients: int,
    max_slides_per_patient: int,
    min_tiles_per_slide: int,
    max_tiles_per_slide: int,
    feat_dim: int,
    n_categories: int,
) -> tuple[CliniPath, SlidePath, FeatureDir, Sequence[Category]]:
    """Creates a random dataset: a clini table, a slide table and feature files.

    Args:
        dir:
            Existing directory to create the dataset in.
            A `feats` subdirectory is created inside it and must not exist yet.
        n_patients:
            Number of patients to generate.
        max_slides_per_patient:
            Every patient gets between one and this many slides (inclusive).
        min_tiles_per_slide:
            Smallest number of tiles a slide's feature file may contain.
        max_tiles_per_slide:
            Largest number of tiles a slide's feature file may contain.
        feat_dim:
            Length of each tile's feature vector.
        n_categories:
            Number of random category names the ground truths are drawn from.

    Returns:
        Path to the clini table (`clini.csv`, columns `patient`, `ground_truth`),
        path to the slide table (`slide.csv`, columns `slide_path`, `patient`),
        the directory containing the feature files
        and the generated category names.
    """
    # Both tables are filled in below, so they have to be mutable dicts;
    # `Mapping` does not offer item assignment.
    slide_path_to_patient: dict[Path, PatientId] = {}
    patient_to_ground_truth: dict[PatientId, str] = {}
    clini_path = dir / "clini.csv"
    slide_path = dir / "slide.csv"

    feat_dir = dir / "feats"
    feat_dir.mkdir()

    categories = [random_string(8) for _ in range(n_categories)]

    for _ in range(n_patients):
        # Random patient ID
        patient_id = random_string(16)

        patient_to_ground_truth[patient_id] = random.choice(categories)

        # Generate some slides
        n_slides = random.randint(1, max_slides_per_patient)
        for _slide in range(n_slides):
            feature_file = create_random_feature_file(
                dir=feat_dir,
                min_tiles=min_tiles_per_slide,
                max_tiles=max_tiles_per_slide,
                feat_dim=feat_dim,
            )
            # The slide table refers to slides relative to the feature directory
            slide_path_to_patient[feature_file.relative_to(feat_dir)] = patient_id

    clini_df = pd.DataFrame(
        patient_to_ground_truth.items(),
        columns=["patient", "ground_truth"],  # pyright: ignore[reportArgumentType]
    )
    clini_df.to_csv(clini_path, index=False)

    slide_df = pd.DataFrame(
        slide_path_to_patient.items(),
        columns=["slide_path", "patient"],  # pyright: ignore[reportArgumentType]
    )
    slide_df.to_csv(slide_path, index=False)

    return clini_path, slide_path, feat_dir, categories
79+
80+
81+
def create_random_feature_file(
    *, dir: Path, min_tiles: int, max_tiles: int, feat_dim: int
) -> Path:
    """Writes a randomly named h5 file filled with random features.

    The file gets a `feats` dataset of shape `(n_tiles, feat_dim)`
    and a `coords` dataset of shape `(n_tiles, 2)`,
    where `n_tiles` is drawn from `[min_tiles, max_tiles]`.

    Args:
        dir:
            Directory to create the file in.
        min_tiles:
            Smallest number of tiles the file may contain.
        max_tiles:
            Largest number of tiles the file may contain.
        feat_dim:
            Length of each tile's feature vector.

    Returns:
        Path to the feature file.
    """
    tile_count = random.randint(min_tiles, max_tiles)

    # `delete=False`: the file has to outlive this function
    with tempfile.NamedTemporaryFile(dir=dir, suffix=".h5", delete=False) as handle:
        feature_path = Path(handle.name)
        with h5py.File(handle, "w") as archive:
            archive["feats"] = torch.rand(tile_count, feat_dim)
            archive["coords"] = torch.rand(tile_count, 2)

    return feature_path
101+
102+
103+
def random_patient_preds(*, n_patients: int, categories: list[str]) -> pd.DataFrame:
    """Creates a table of random patient-level predictions.

    Args:
        n_patients:
            Number of patients, i.e. rows, to generate.
        categories:
            Category names to draw ground truths from
            and to generate one score column for.

    Returns:
        Data frame with a `patient` column of random IDs,
        a `ground_truth` column
        and one `ground_truth_{category}` score column per category.
        The scores of each patient sum to one across the categories.
    """
    patient_ids = [random_string(8) for _ in range(n_patients)]
    ground_truths = [random.choice(categories) for _ in range(n_patients)]

    # One row of scores per category;
    # the softmax over dim 0 normalizes across categories for every patient.
    scores_per_category = torch.softmax(
        torch.rand(len(categories), n_patients), dim=0
    )

    return pd.DataFrame(
        {
            "patient": patient_ids,
            "ground_truth": ground_truths,
            **{
                f"ground_truth_{cat}": scores
                for cat, scores in zip(categories, scores_per_category)
            },
        }
    )
119+
120+
121+
def random_string(len: int) -> str:
    """Returns `len` characters drawn at random from uppercase ASCII letters and digits."""
    alphabet = string.ascii_uppercase + string.digits
    picked = random.choices(alphabet, k=len)
    return "".join(picked)

tests/test_crossval.py

Lines changed: 3 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,14 @@
11
import os
22
import random
3-
import string
43
import tempfile
5-
from collections.abc import Mapping, Sequence
64
from pathlib import Path
75
from typing import TypeAlias
86

9-
import h5py
107
import numpy as np
11-
import pandas as pd
128
import torch
9+
from random_data import create_random_dataset
1310

1411
from stamp.modeling.crossval import categorical_crossval_
15-
from stamp.modeling.data import Category, PatientId
1612

1713
CliniPath: TypeAlias = Path
1814
SlidePath: TypeAlias = Path
@@ -34,7 +30,7 @@ def test_crossval_integration(
3430
np.random.seed(0)
3531

3632
with tempfile.TemporaryDirectory(prefix="stamp_test_train_") as tmp_dir:
37-
clini_path, slide_path, feature_dir, categories = _create_random_dataset(
33+
clini_path, slide_path, feature_dir, categories = create_random_dataset(
3834
dir=Path(tmp_dir),
3935
n_categories=n_categories,
4036
n_patients=n_patients,
@@ -63,77 +59,9 @@ def test_crossval_integration(
6359
max_epochs=2,
6460
patience=1,
6561
accelerator="gpu" if torch.cuda.is_available() else "cpu",
66-
n_splits=3,
62+
n_splits=2,
6763
# Experimental features
6864
use_vary_precision_transform=use_vary_precision_transform,
6965
use_alibi=use_alibi,
7066
)
7167

72-
73-
def _create_random_dataset(
    *,
    dir: Path,
    n_patients: int,
    max_slides_per_patient: int,
    min_tiles_per_slide: int,
    max_tiles_per_slide: int,
    feat_dim: int,
    n_categories: int,
) -> tuple[CliniPath, SlidePath, FeatureDir, Sequence[Category]]:
    """Creates a random dataset: a clini table, a slide table and feature files.

    Args:
        dir:
            Existing directory to create the dataset in.
            A `feats` subdirectory is created inside it and must not exist yet.
        n_patients:
            Number of patients to generate.
        max_slides_per_patient:
            Every patient gets between one and this many slides (inclusive).
        min_tiles_per_slide:
            Smallest number of tiles a slide's feature file may contain.
        max_tiles_per_slide:
            Largest number of tiles a slide's feature file may contain.
        feat_dim:
            Length of each tile's feature vector.
        n_categories:
            Number of random category names the ground truths are drawn from.

    Returns:
        Path to the clini table (`clini.csv`, columns `patient`, `ground_truth`),
        path to the slide table (`slide.csv`, columns `slide_path`, `patient`),
        the directory containing the feature files
        and the generated category names.
    """
    # Both tables are filled in below, so they have to be mutable dicts;
    # `Mapping` does not offer item assignment.
    slide_path_to_patient: dict[Path, PatientId] = {}
    patient_to_ground_truth: dict[PatientId, str] = {}
    clini_path = dir / "clini.csv"
    slide_path = dir / "slide.csv"

    feat_dir = dir / "feats"
    feat_dir.mkdir()

    categories = [_random_string(8) for _ in range(n_categories)]

    for _ in range(n_patients):
        # Random patient ID
        patient_id = _random_string(16)

        patient_to_ground_truth[patient_id] = random.choice(categories)

        # Generate some slides
        n_slides = random.randint(1, max_slides_per_patient)
        for _slide in range(n_slides):
            feature_file = _create_random_feature_file(
                dir=feat_dir,
                min_tiles_per_slide=min_tiles_per_slide,
                max_tiles_per_slide=max_tiles_per_slide,
                feat_dim=feat_dim,
            )
            # The slide table refers to slides relative to the feature directory
            slide_path_to_patient[feature_file.relative_to(feat_dir)] = patient_id

    clini_df = pd.DataFrame(
        patient_to_ground_truth.items(),
        columns=["patient", "ground_truth"],  # pyright: ignore[reportArgumentType]
    )
    clini_df.to_csv(clini_path, index=False)

    slide_df = pd.DataFrame(
        slide_path_to_patient.items(),
        columns=["slide_path", "patient"],  # pyright: ignore[reportArgumentType]
    )
    slide_df.to_csv(slide_path, index=False)

    return clini_path, slide_path, feat_dir, categories
123-
124-
125-
def _create_random_feature_file(
    *, dir: Path, min_tiles_per_slide: int, max_tiles_per_slide: int, feat_dim: int
) -> Path:
    """Writes a randomly named h5 file filled with random features.

    The file gets a `feats` dataset of shape `(n_tiles, feat_dim)`
    and a `coords` dataset of shape `(n_tiles, 2)`,
    where `n_tiles` is drawn from `[min_tiles_per_slide, max_tiles_per_slide]`.

    Args:
        dir:
            Directory to create the file in.
        min_tiles_per_slide:
            Smallest number of tiles the file may contain.
        max_tiles_per_slide:
            Largest number of tiles the file may contain.
        feat_dim:
            Length of each tile's feature vector.

    Returns:
        Path to the feature file.
    """
    tile_count = random.randint(min_tiles_per_slide, max_tiles_per_slide)

    # `delete=False`: the file has to outlive this function
    with tempfile.NamedTemporaryFile(dir=dir, suffix=".h5", delete=False) as handle:
        feature_path = Path(handle.name)
        with h5py.File(handle, "w") as archive:
            archive["feats"] = torch.rand(tile_count, feat_dim)
            archive["coords"] = torch.rand(tile_count, 2)

    return feature_path
136-
137-
138-
def _random_string(len: int):
    """Returns `len` characters drawn at random from uppercase ASCII letters and digits."""
    alphabet = string.ascii_uppercase + string.digits
    picked = random.choices(alphabet, k=len)
    return "".join(picked)

tests/test_statistics.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
import random
2+
import tempfile
3+
from pathlib import Path
4+
5+
import numpy as np
6+
import torch
7+
from random_data import random_patient_preds, random_string
8+
9+
from stamp.statistics import compute_stats_
10+
11+
12+
def test_statistics_integration(
    *,
    n_patient_preds: int = 1,
    n_categories: int = 3,
) -> None:
    """Smoke test: computing statistics on random predictions must not raise.

    Args:
        n_patient_preds:
            Number of patient prediction tables to generate and evaluate.
        n_categories:
            Number of categories the random predictions are spread over.
    """
    # Seed all generators so that the random predictions are the same on every run
    random.seed(0)
    np.random.seed(0)
    torch.random.manual_seed(0)

    categories = [random_string(8) for _ in range(n_categories)]

    with tempfile.TemporaryDirectory(prefix="stamp_test_statistics_") as tmp_dir:
        work_dir = Path(tmp_dir)
        pred_csvs = [
            work_dir / f"patient-preds-{i}.csv" for i in range(n_patient_preds)
        ]

        # Each table gets its own random number of patients
        for pred_csv in pred_csvs:
            n_patients = random.randint(100, 1000)
            preds = random_patient_preds(n_patients=n_patients, categories=categories)
            preds.to_csv(pred_csv)

        compute_stats_(
            output_dir=work_dir / "output",
            pred_csvs=pred_csvs,
            ground_truth_label="ground_truth",
            true_class=categories[1],
        )
37+
38+
39+
def test_statistics_integration_for_multiple_patient_preds() -> None:
    """Runs the statistics smoke test with five prediction tables instead of one."""
    test_statistics_integration(n_patient_preds=5)

0 commit comments

Comments
 (0)