8 changes: 6 additions & 2 deletions .github/workflows/run_python_tests.yml
@@ -12,10 +12,14 @@ on:
jobs:
build:

runs-on: ubuntu-latest
runs-on: ${{ matrix.os }}
strategy:
matrix:
python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
os: [ubuntu-latest]
include:
- os: ubuntu-22.04
python-version: "3.7"

steps:
- uses: actions/checkout@v2
87 changes: 87 additions & 0 deletions lazy_dataset/core.py
@@ -232,6 +232,93 @@ def from_dataset(
immutable_warranty=immutable_warranty, name=name)


def from_path(
root: Union[str, Path],
suffix: Union[str, List[str]],
immutable_warranty: str = 'pickle',
name: str = None,
parents: Optional[int] = None,
sep: str = "_",
) -> "DictDataset":
"""Create a new DictDataset from a directory path.
Member:

Can you add a short text with a motivation and a warning that this function should usually not be used? I currently lack a motivation for when this function is recommended. (Scanning directories with a large number of files shouldn't be done on demand.)

Contributor Author:

I often need to evaluate audio files that were generated by TTS systems. I usually store them in a single directory (sometimes with a nested structure), and this is a convenient way to quickly obtain an iterable over these files (e.g., here). I could add a warning at the beginning that advises caution when using this function. Otherwise, I would delegate the responsibility to the user.
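For context, a minimal usage sketch of that workflow (the directory layout below is hypothetical; from_path is imported from lazy_dataset.core, where this PR defines it):

    from lazy_dataset.core import from_path

    # Hypothetical flat directory of TTS outputs: /data/tts_eval/utt_0001.wav, ...
    ds = from_path("/data/tts_eval", suffix=".wav")
    for example in ds:
        # Each example maps the stripped suffix to the file path.
        print(example["example_id"], example["wav"])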

Member:

Ah, ok.

How about adding something like the following to the docstring (feel free to improve the text):

Note: This function is not intended to be used for frequently used large datasets,
since the indexing overhead can get significant.
For one-time, small datasets, it is a convenient way to load them.

Contributor Author:

Done.


Scan and include all files in `root` that end with a suffix in `suffix`.
New examples are created for each unique file stem. The example_id is
derived from the file path.

>>> import tempfile
>>> temp_dir = tempfile.TemporaryDirectory()
Member:

Does this leak a folder? Or does the destructor of temp_dir do some cleanup?

Contributor Author:

The cleanup is not done automatically unless used as a context manager. I added a call to cleanup at the end of the doctest.
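For reference, the two cleanup styles look like this (standard-library behaviour, not specific to this PR):

    import tempfile
    from pathlib import Path

    # Explicit cleanup, as now done at the end of the doctest:
    temp_dir = tempfile.TemporaryDirectory()
    (Path(temp_dir.name) / "example.txt").touch()
    temp_dir.cleanup()  # removes the directory and its contents

    # Context manager: cleanup happens automatically on exit.
    with tempfile.TemporaryDirectory() as name:
        (Path(name) / "example.txt").touch()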

>>> fp = Path(temp_dir.name) / "test1.txt"
>>> fp.touch()
>>> fp = Path(temp_dir.name) / "test1.wav"
>>> fp.touch()
>>> ds = from_path(temp_dir.name, suffix=".txt")
>>> ds
DictDataset(len=1)
MapDataset(_pickle.loads)
>>> ds[0] # doctest: +ELLIPSIS
{'example_id': 'test1', 'txt': PosixPath('.../test1.txt')}

>>> ds = from_path(temp_dir.name, suffix=[".txt", ".wav"])
>>> ds
DictDataset(len=1)
MapDataset(_pickle.loads)
>>> ds[0] # doctest: +ELLIPSIS
{'example_id': 'test1', 'txt': PosixPath('.../test1.txt'), 'wav': PosixPath('.../test1.wav')}

Args:
root (Union[str, Path]): Root directory to scan for files.
suffix (Union[str, List[str]]): List of file suffixes to scan for.
Files with these suffixes will be added to the dataset.
immutable_warranty (str, optional):
name (str, optional):
parents (Optional[int], optional): Level of parent folders to include in
the example_id. If `None`, only the file stem is used. `parents=1`
includes the immediate parent folder. Defaults to None.
sep (str, optional): Separator to use for joining folder names.
Defaults to "_".

Returns:
DictDataset: A dataset containing the scanned files.
"""
from collections import defaultdict
import os
# https://stackoverflow.com/a/59803793/16085876
def _run_fast_scandir(root: Path, ext: List[str]):
subfolders, files = [], []

for f in os.scandir(root):
Member:

How about keeping that function simpler?
A recursive implementation is simpler when you use a generator style.
Why do you use in and lower() for the extension check?

    def _run_fast_scandir(root: Path, ext: List[str]):
        for f in os.scandir(root):
            if f.is_dir():
                yield from _run_fast_scandir(f, ext)
            if f.is_file():
                if any(f.name.endswith(e) for e in ext):
                    yield Path(f.path)

Contributor Author:

Done. I remember that I had a very special case where I wanted to match only part of the extension, but I can't reproduce that scenario anymore. I adopted your suggestion.

Member:

Ok. If you need that again, we could allow callables for verification.
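If that comes up again, such a callable-based check could look roughly like this (only a sketch, not part of this PR; the parameter name match is made up):

    import os
    from pathlib import Path
    from typing import Callable, Iterator

    def _scandir_matching(root, match: Callable[[os.DirEntry], bool]) -> Iterator[Path]:
        # Same recursive generator as above, but with a user-supplied predicate.
        for f in os.scandir(root):
            if f.is_dir():
                yield from _scandir_matching(f.path, match)
            elif f.is_file() and match(f):
                yield Path(f.path)

    # e.g. wav_files = list(_scandir_matching(root, lambda f: f.name.endswith(".wav")))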

if f.is_dir():
subfolders.append(f.path)
if f.is_file():
if any(e in f.name.lower() for e in ext):
files.append(Path(f.path))

for folder in list(subfolders):
sf, f = _run_fast_scandir(folder, ext)
subfolders.extend(sf)
files.extend(f)
return subfolders, files

def _make_example_id(file_path: Path):
if parents is None:
return file_path.stem
example_id = file_path.stem
prefix = sep.join(file_path.parts[-(2+parents):-1])
Member:

Shouldn't the option to select the folders that are included in the example_id be interpreted from the left?
E.g., the first example that I had in mind is the following:
ex1/meta.json
ex1/audio/ch1.wav
ex1/audio/ch2.wav

Member:

Should the option be allowed to be a callable? To support all kinds of mappings?

Contributor Author:

Should the option be allowed to be a callable? To support all kinds of mappings?

Done. parents can now also be a callable.

Contributor Author:

Shouldn't the option to select the folders that are included in the example_id be interpreted from the left? E.g., the first example that I had in mind is the following: ex1/meta.json ex1/audio/ch1.wav ex1/audio/ch2.wav

I had something more homogeneous in mind, like:

speaker1/
  001.wav
  002.wav
speaker2/
  001.wav
  002.wav

Here, I want to add the speaker ID to resolve collisions, and a right-to-left approach seemed more natural to me. What behaviour would you expect in your example? Is it

ex1_meta
ex1_ch1
ex1_ch2

or

ex1_meta
ex1_audio_ch1
ex1_audio_ch2?

The second is doable with the current implementation, the first is not (unless you use a callable for parents).

Member:

speaker1/
  001.wav
  002.wav
speaker2/
  001.wav
  002.wav

001/
  speaker1.wav
  speaker2.wav
002/
  speaker1.wav
  speaker2.wav

I think both folder structures are common. In the past I often used the first, while recently I used mainly the second. In another conversation in the PR, I added code to support both.

return sep.join((prefix, example_id))

if isinstance(suffix, str):
suffix = [suffix]
_, files = _run_fast_scandir(Path(root), suffix)
files = map(Path, files)
examples = defaultdict(dict)
for file in files:
Member:

Can you add a sort?

The order of os.scandir depends on the physical location of the files on the HDD/SSD.
The order of the examples (and maybe even the content in case of collisions) should
be reproducible across filesystems.

Contributor Author:

Done.
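For illustration, a deterministic iteration can be as simple as the following (a sketch; the actual change lives in the updated PR code):

    # Sort by the string form of the paths so the example order does not
    # depend on the filesystem-specific order returned by os.scandir.
    for file in sorted(files, key=str):
        example_id = _make_example_id(file)
        ...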

example_id = _make_example_id(file)
examples[example_id]["example_id"] = example_id
examples[example_id][file.suffix.lstrip(".")] = file
Member:

How about, letting the _make_example_id to a split of the file path (excluding the root path)? In that way, there will be o collision. Otherwise, the multiple files may overwrite each other.

Contributor Author:

Done.

Member:

How about letting _make_example_id do a split of the file path (excluding the root path)? In that way, there will be no collision. Otherwise, multiple files may overwrite each other.

There were a few typos and my text was a bit too short, which made it difficult to get the idea of my comment.
Suffixes may not be unique. The sort now makes it deterministic, but
I think it is still undesirable to silently ignore a file.
Suggestion: add an assert.

Depending on the value of parents, we may want to have another key (e.g., source separation typically yields two wav files, so the suffix is not enough to distinguish them):

example_id, key = _make_example_id(file)
examples[example_id]["example_id"] = example_id
assert key not in examples[example_id], (key, example_id, examples[example_id])
examples[example_id][key] = file

How about slightly changing the definition of parents to make it able to handle more cases?
I considered three cases:

  • Flat folder (parents == 0): the suffix is the key, while the stem is the example_id
  • A folder per key (parents < 0): the example_id is the stem, but the folder is the key (in your example it is the suffix, but the folder feels more natural to me. Or use both?)
  • A folder per example (parents > 0)

To make the code more natural, I had to change the sign of parents for your case.

from pathlib import Path


def _make_example_id(file_path: Path, parents, sep: str = "/"):
    """

    parents:
        0: '{example_id}.{key}', all folders are considered as part of example id.
        >0: '{example_id}/{key}', abs(parents) is the number of folders to consider as part of example id.
        <0: '{key}/{example_id}', abs(parents) is the number of path parts to consider as part of example id.

    >>> def test(parents, paths):
    ...     for p in paths:
    ...         example_id, key = _make_example_id(p, parents)
    ...         print(f'ex[{example_id!r}][{key!r}] = root / {str(p)!r}')

    # Assuming all files are in the 'root' directory
    # and the naming is '{example_id}.{key}', where key is the file extension.
    >>> test(0, [Path('ex1.wav'), Path('ex1.txt'), Path('ex2.wav'), Path('ex2.txt')])
    ex['ex1']['wav'] = root / 'ex1.wav'
    ex['ex1']['txt'] = root / 'ex1.txt'
    ex['ex2']['wav'] = root / 'ex2.wav'
    ex['ex2']['txt'] = root / 'ex2.txt'

    # Assuming a folder per example '{example_id}/{key}',
    # where key is the remaining relative path (file name, possibly with subfolders).
    >>> test(1, [Path('ex1/audio/1.wav'), Path('ex1/audio/2.txt'), Path('ex1/meta.json')])
    ex['ex1']['audio/1.wav'] = root / 'ex1/audio/1.wav'
    ex['ex1']['audio/2.txt'] = root / 'ex1/audio/2.txt'
    ex['ex1']['meta.json'] = root / 'ex1/meta.json'

    # Assuming the file stem is the example id '{key}/{example_id}.{ext}',
    # where key is the parent folder path.
    >>> test(-1, [Path('audio/ex1.wav'), Path('meta/txt/ex1.txt')])
    ex['ex1']['audio'] = root / 'audio/ex1.wav'
    ex['ex1']['meta/txt'] = root / 'meta/txt/ex1.txt'

    """
    if parents is None:
        return file_path.stem
    if callable(parents):
        return parents(file_path, sep)
    
    if parents == 0:
        return sep.join(file_path.with_suffix("").parts[parents:]), file_path.suffix.lstrip('.')
    elif parents > 0:
        return sep.join(file_path.with_suffix("").parts[:parents]), sep.join(file_path.parts[parents:])
    elif parents < 0:
        return sep.join(file_path.with_suffix("").parts[parents:]), sep.join(file_path.parts[:parents])

Contributor Author:

I see. I agree that it is undesirable to ignore files silently. I also like the new behaviour of parents. This should cover the most common cases. I adopted your suggestions (I moved _make_example_id to the outer scope so that the test cases are correctly executed).
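Applying the adopted definition to the earlier speaker-folder example (a quick sketch; it assumes the paths passed to _make_example_id are relative to root):

    # speaker1/001.wav with parents=1  -> ('speaker1', '001.wav')
    # speaker1/001.wav with parents=-1 -> ('001', 'speaker1')
    print(_make_example_id(Path("speaker1/001.wav"), parents=1))
    print(_make_example_id(Path("speaker1/001.wav"), parents=-1))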

return from_dict(examples, immutable_warranty, name)


def concatenate(*datasets):
"""
Create a new `Dataset` by concatenation of all passed datasets.