-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprepare_dataset.py
More file actions
157 lines (118 loc) · 4.81 KB
/
prepare_dataset.py
File metadata and controls
157 lines (118 loc) · 4.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
"""
Prepare the GlucoFM Benchmark dataset from HuggingFace.
This script mirrors `Time-LLM/prepare_dataset.py` but is placed at the repo root
so all models can share the same preparation workflow.
It exports the HuggingFace dataset to per-subject CSV files and optionally
creates a `mixed/` folder that concatenates all sub-datasets.
"""
from __future__ import annotations
import os
import shutil
from pathlib import Path
import pandas as pd
def _require_datasets(): # pragma: no cover
try:
from datasets import load_dataset # type: ignore
except ModuleNotFoundError as exc:
raise ModuleNotFoundError(
"Missing dependency 'datasets'. Install it first:\n"
" pip install datasets\n"
) from exc
return load_dataset
def export_hf_to_csv(hf_name: str, split: str, out_root: str) -> None:
    """Export one split of a HuggingFace dataset to per-subject CSV files.

    Each row of the HF split is expected to carry ``dataset``, ``subject_id``,
    ``timestamp``, and ``BGvalue`` fields; one CSV is written per subject at
    ``<out_root>/<split>/<dataset>/<subject_id>.csv``.

    Args:
        hf_name: HuggingFace dataset identifier (e.g. "org/name").
        split: Split name to export ("train" or "test").
        out_root: Root output directory for the exported CSV tree.

    Raises:
        ModuleNotFoundError: if the `datasets` package is not installed.
    """
    print(f"\n{'=' * 70}")
    print(f"Exporting {split} split from {hf_name}")
    print(f"{'=' * 70}")

    load_dataset = _require_datasets()
    ds = load_dataset(hf_name, split=split)

    subject_count = 0
    for row in ds:
        out_dir = os.path.join(out_root, split, str(row["dataset"]))
        os.makedirs(out_dir, exist_ok=True)
        csv_path = os.path.join(out_dir, f"{row['subject_id']}.csv")
        # Keep raw timestamps (epoch seconds) as provided by HF. Downstream
        # loaders infer and convert to datetime as needed.
        df = pd.DataFrame({"timestamp": row["timestamp"], "BGvalue": row["BGvalue"]})
        df.to_csv(csv_path, index=False)
        subject_count += 1

    print(f"Exported {subject_count} subjects to: {os.path.join(out_root, split)}")
    # NOTE: the legacy commented-out "all"-marker creation was removed; the
    # marker is created by `copy_hf_csvs_to_mixed` for the mixed train folder.
def copy_hf_csvs_to_mixed(root_dir: str = "hf_cache") -> None:
    """Copy all CSV files from subdirectories to a split-level `mixed/` folder.

    Name collisions are avoided by prefixing the filename with the relative
    dataset path (joined using `__`). A ``train/mixed/all`` marker file is
    created for legacy loaders.

    Args:
        root_dir: Root directory containing ``train/`` and ``test/`` CSV trees.
    """
    print(f"\n{'=' * 70}")
    print(f"Creating mixed datasets from {root_dir}")
    print(f"{'=' * 70}")
    root = Path(root_dir)
    for split in ["train", "test"]:
        split_dir = root / split
        # BUG FIX: test for the split directory *before* creating `mixed/`.
        # Previously `mkdir(parents=True, exist_ok=True)` ran first, silently
        # creating the missing split directory and making this skip branch
        # unreachable (leaving spurious empty `<split>/mixed` trees behind).
        if not split_dir.exists():
            print(f"[{split}] Skip: {split_dir} not found")
            continue
        dst_dir = split_dir / "mixed"
        dst_dir.mkdir(parents=True, exist_ok=True)
        # Exclude anything already inside a `mixed/` folder to stay idempotent.
        csv_paths = [p for p in split_dir.rglob("*.csv") if "mixed" not in p.parts]
        print(f"[{split}] Found {len(csv_paths)} CSV files")
        copied = 0
        for src_path in csv_paths:
            rel = src_path.relative_to(split_dir)
            # Flatten the relative path into the filename to avoid collisions.
            safe_name = "__".join(rel.parts)
            shutil.copy2(src_path, dst_dir / safe_name)
            copied += 1
        if split == "train":
            # Some legacy loaders expect an `all` marker in the training folder.
            (dst_dir / "all").touch()
        print(f"[{split}] Copied {copied} files to: {dst_dir}")
def main() -> None:
    """Command-line entry point.

    Exports the train and test splits of the GlucoFM benchmark to per-subject
    CSV files and, when ``--create_mixed`` is given, builds split-level
    `mixed/` folders that combine all sub-datasets.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Prepare GlucoFM Benchmark dataset from HuggingFace")
    parser.add_argument(
        "--hf_name", "--hf-name", type=str,
        default="byluuu/gluco-tsfm-benchmark",
        help="HuggingFace dataset name (default: byluuu/gluco-tsfm-benchmark).",
    )
    parser.add_argument(
        "--output_dir", "--output-dir", type=str,
        default="./hf_cache",
        help="Output directory for exported CSV files (default: ./hf_cache).",
    )
    parser.add_argument(
        "--create_mixed", "--create-mixed", action="store_true",
        help="Create split-level `mixed/` folders that combine all subdatasets.",
    )
    args = parser.parse_args()

    banner = "=" * 70
    print(banner)
    print("GLUCOFM DATASET PREPARATION")
    print(banner)

    # Export both splits with the same root so they share one directory tree.
    for split in ("train", "test"):
        export_hf_to_csv(hf_name=args.hf_name, split=split, out_root=args.output_dir)

    if args.create_mixed:
        copy_hf_csvs_to_mixed(root_dir=args.output_dir)

    print(f"\n{banner}")
    print("DATASET PREPARATION COMPLETE!")
    print(banner)
    print(f"Train data: {args.output_dir}/train/")
    print(f"Test data: {args.output_dir}/test/")
    if args.create_mixed:
        print(f"Mixed train: {args.output_dir}/train/mixed/")
        print(f"Mixed test: {args.output_dir}/test/mixed/")
    print(banner)
if __name__ == "__main__":
main()