8 changes: 6 additions & 2 deletions .github/workflows/run_python_tests.yml
@@ -12,10 +12,14 @@ on:
 jobs:
   build:

-    runs-on: ubuntu-latest
+    runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
+        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
+        os: [ubuntu-latest]
+        include:
+          - os: ubuntu-22.04
+            python-version: "3.7"

     steps:
     - uses: actions/checkout@v2
200 changes: 199 additions & 1 deletion lazy_dataset/core.py
@@ -12,7 +12,7 @@
 import datetime

 import numpy as np
-from typing import Optional, Union, Any, List, Dict
+from typing import Optional, Union, Any, List, Dict, Callable, Tuple

 LOG = logging.getLogger('lazy_dataset')

@@ -232,6 +232,204 @@ def from_dataset(
        immutable_warranty=immutable_warranty, name=name)


def _make_example_id(
        file_path: Path, parents: Union[int, Callable], sep: str = "/",
) -> Tuple[str, str]:
    """Create example IDs from file paths.

    See `from_dir` for usage.

    parents:
        * 0: '{example_id}.{key}', all folders are considered as part of
          example ID.
        * > 0: '{example_id}/{key}', abs(parents) is the number of folders to
          consider as part of example ID.
        * < 0: '{key}/{example_id}', abs(parents) is the number of path parts
          to consider as part of example ID.

    >>> def test(parents, paths):
    ...     for p in paths:
    ...         example_id, key = _make_example_id(p, parents)
    ...         print(f'ex[{example_id!r}][{key!r}] = root / {str(p)!r}')

    # Assuming all files are in the 'root' directory
    # and the naming is '{example_id}.{key}', where key is the file extension.
    >>> test(0, [Path('ex1.wav'), Path('ex1.txt'), Path('ex2.wav'), Path('ex2.txt')])
    ex['ex1']['wav'] = root / 'ex1.wav'
    ex['ex1']['txt'] = root / 'ex1.txt'
    ex['ex2']['wav'] = root / 'ex2.wav'
    ex['ex2']['txt'] = root / 'ex2.txt'

    # Assuming a folder per example, i.e., '{example_id}/{key}',
    # where key is the file name with extension.
    >>> test(1, [Path('ex1/audio/1.wav'), Path('ex1/audio/2.txt'), Path('ex1/meta.json')])
    ex['ex1']['audio/1.wav'] = root / 'ex1/audio/1.wav'
    ex['ex1']['audio/2.txt'] = root / 'ex1/audio/2.txt'
    ex['ex1']['meta.json'] = root / 'ex1/meta.json'

    # Assuming the file stem is the example ID, i.e., '{key}/{example_id}.{ext}',
    # where key is the path up to the specified parent folder (-1: parent
    # folder, -2: grandparent folder, ...).
    >>> test(-1, [Path('audio/ex1.wav'), Path('meta/txt/ex1.txt')])
    ex['ex1']['audio'] = root / 'audio/ex1.wav'
    ex['ex1']['meta/txt'] = root / 'meta/txt/ex1.txt'

    """
    if callable(parents):
        return parents(file_path, sep)

    if parents == 0:
        return (
            sep.join(file_path.with_suffix("").parts[parents:]),
            file_path.suffix.lstrip('.')
        )
    if parents > 0:
        return (
            sep.join(file_path.with_suffix("").parts[:parents]),
            sep.join(file_path.parts[parents:])
        )
    if parents < 0:
        return (
            sep.join(file_path.with_suffix("").parts[parents:]),
            sep.join(file_path.parts[:parents])
        )

    raise ValueError(
        f"parents must be either int or Callable, but got {type(parents)}"
    )


def from_dir(
        root: Union[str, Path],
        suffix: Union[str, List[str]],
        immutable_warranty: str = 'pickle',
        name: str = None,
        parents: Union[Callable, int] = 0,
        sep: str = "/",
) -> "DictDataset":
    """Create a new DictDataset from a directory path.
Member:
Can you add a small text for motivation, and a warning that this function should usually not be used? Currently I lack a motivation for when this function is recommended. (Scanning directories with a large number of files shouldn't be done on demand.)

Contributor Author:
I often need to evaluate audio files that were generated by TTS systems. I usually store them in a single directory (sometimes with a nested structure), and this is a convenient way to quickly obtain an iterable over these files (e.g., here). I could add a warning that is raised at the beginning to caution users about this function. Otherwise, I would delegate the responsibility to the user.
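(For illustration, a minimal sketch of that use case; the directory layout and the scoring step are hypothetical, but the `from_dir` call matches this PR:)

    from lazy_dataset.core import from_dir

    # Hypothetical TTS evaluation: 'tts_out' holds files like 'utt1.wav', 'utt2.wav', ...
    ds = from_dir("tts_out", suffix=".wav")
    for example in ds:
        wav_path = example["wav"]  # PosixPath to the generated audio
        ...  # e.g., load the file and compute a metric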

Member:
Ah, ok.

How about adding something like the following to the docstring (feel free to improve the text):

    Note: This function is not intended to be used for frequently used large datasets,
    since the indexing overhead can get significant.
    For one-time small datasets, it is a convenient way to load them.

Contributor Author:
Done.


    Scan and include all files in `root` that end with a suffix in `suffix`.
    A new example is created for each unique example ID, which is derived
    from the file path.

    Note:
        This function is not intended for frequently-used large datasets,
        since the file indexing overhead can get significant (instead, see
        `from_file` or `lazy_dataset.database.JsonDatabase`). For one-time
        small datasets, it is a convenient way to load them.

    >>> import tempfile
    >>> temp_dir = tempfile.TemporaryDirectory()
Member:
Does this leak a folder? Or does the destructor of temp_dir do some cleanup?

Contributor Author:
The cleanup is not done automatically unless used as a context manager. I added a call to cleanup at the end of the doctest.

    >>> fp = Path(temp_dir.name) / "test1.txt"
    >>> fp.touch()
    >>> fp = Path(temp_dir.name) / "test1.wav"
    >>> fp.touch()
    >>> ds = from_dir(temp_dir.name, suffix=".txt")
    >>> ds
      DictDataset(len=1)
    MapDataset(_pickle.loads)
    >>> ds[0] # doctest: +ELLIPSIS
    {'example_id': 'test1', 'txt': PosixPath('.../test1.txt')}

    >>> ds = from_dir(temp_dir.name, suffix=[".txt", ".wav"])
    >>> ds
      DictDataset(len=1)
    MapDataset(_pickle.loads)
    >>> ds[0] # doctest: +ELLIPSIS
    {'example_id': 'test1', 'txt': PosixPath('.../test1.txt'), 'wav': PosixPath('.../test1.wav')}

    # If keys are colliding, an AssertionError is raised
    >>> (Path(temp_dir.name) / "collision").mkdir(); (Path(temp_dir.name) / "collision" / "test1.txt").touch()
    >>> from_dir(temp_dir.name, suffix=".txt", parents=lambda fp, sep: (fp.stem, fp.suffix.lstrip('.'))) # doctest: +ELLIPSIS
    Traceback (most recent call last):
    ...
    AssertionError: Duplicate key 'txt' for example_id 'test1' found!
    File: .../test1.txt, existing: .../collision/test1.txt
    Consider changing the 'parents' argument to create unique keys for the same example ID.

    # Change the parents argument to avoid key collisions
    >>> ds = from_dir(temp_dir.name, suffix=".txt", parents=0)
    >>> ds
      DictDataset(len=2)
    MapDataset(_pickle.loads)
    >>> ds[0] # doctest: +ELLIPSIS
    {'example_id': 'collision/test1', 'txt': PosixPath('.../collision/test1.txt')}
    >>> ds[1] # doctest: +ELLIPSIS
    {'example_id': 'test1', 'txt': PosixPath('.../test1.txt')}

    >>> temp_dir.cleanup() # remove temporary files

    Args:
        root (Union[str, Path]): Root directory to scan for files.
        suffix (Union[str, List[str]]): List of file suffixes to scan for.
            Files with these suffixes will be added to the dataset.
        immutable_warranty (str, optional): How to ensure immutability of the
            examples; forwarded to `from_dict`.
        name (str, optional): Name of the dataset; forwarded to `from_dict`.
        parents (Union[Callable, int], optional): Defines how to create
            examples. Options are:

            * `int`: Level of parent folders to include in the example ID.

              * 0: Assumes a flat folder structure. The file stem is the
                example ID and the suffix is the key to access the file
                path in the dataset.
              * > 0: Assumes a folder per example. The example ID is the
                folder and the file name (with extension) is the key to
                access the file path in the dataset.
              * < 0: Assumes that files belonging to the same example are
                stored in disjoint folders. The file stem is the example ID
                and the folder is the key to access the file path in the
                dataset.
            * `Callable`: A function that takes the file path relative to
              `root` and the separator argument `sep` as inputs and returns
              an example ID and a key to access the file path in the
              dataset. This allows creating custom example IDs (see the
              sketch after this function).

            Defaults to 0. If non-unique keys under the same example ID are
            created, an AssertionError is raised.
        sep (str, optional): Separator to use for joining folder names.
            Defaults to "/".

    Returns:
        DictDataset: A dataset containing the scanned files.
    """
    from collections import defaultdict
    import os

    # Adapted from https://stackoverflow.com/a/59803793/16085876
    def _run_fast_scandir(root: os.PathLike, ext: List[str]):
        for f in os.scandir(root):
Member:
How about keeping that function simpler? A recursive implementation is simpler when you use generator style. And why do you use `in` and `lower()` for an ext check?

    def _run_fast_scandir(root: Path, ext: List[str]):
        for f in os.scandir(root):
            if f.is_dir():
                yield from _run_fast_scandir(f.path, ext)
            if f.is_file():
                if any(f.name.endswith(e) for e in ext):
                    yield Path(f.path)

Contributor Author:
Done. I remember that I had a very special case where I wanted to match only part of the extension, but I can't reproduce that scenario anymore. I adopted your suggestion.

Member:
Ok. If you need that again, we could allow callables for verification.
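(For illustration, a hedged sketch of that idea; the `match` predicate is hypothetical and not part of this PR:)

    import os
    from pathlib import Path
    from typing import Callable

    def _run_fast_scandir(root: os.PathLike, match: Callable[[str], bool]):
        # 'match' replaces the suffix list with an arbitrary predicate,
        # e.g. match=lambda name: name.lower().endswith('.wav')
        for f in os.scandir(root):
            if f.is_dir():
                yield from _run_fast_scandir(f.path, match)
            if f.is_file() and match(f.name):
                yield Path(f.path)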

            if f.is_dir():
                yield from _run_fast_scandir(f.path, ext)
            if f.is_file():
                if any(f.name.endswith(e) for e in ext):
                    yield Path(f.path)

    if isinstance(suffix, str):
        suffix = [suffix]
    files = _run_fast_scandir(Path(root), suffix)
    files = map(Path, files)
    examples = defaultdict(dict)
    for file in sorted(files):
        example_id, key = _make_example_id(
            file.relative_to(root), parents=parents, sep=sep
        )
        examples[example_id]["example_id"] = example_id
        if key in examples[example_id]:
            raise AssertionError(
                f"Duplicate key '{key}' for example_id '{example_id}' found!\n"
                f"File: {file}, existing: {examples[example_id][key]}\n"
                "Consider changing the 'parents' argument to create unique "
                "keys for the same example ID."
            )
        if len(key) == 0:
            LOG.warning(
                "key has zero length! Check your directory structure!\n"
                "file: %s, relative file path: %s, parents=%r",
                file, file.relative_to(root), parents,
            )
        examples[example_id][key] = file
    return from_dict(examples, immutable_warranty, name)
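(To round this off, a hedged sketch of the `Callable` option for `parents`; the paired-file layout and the helper name are hypothetical:)

    from pathlib import Path
    from lazy_dataset.core import from_dir

    # Hypothetical flat layout with paired files: '{id}_gen.wav' and '{id}_ref.wav'.
    # The int options for 'parents' cannot split the stem, so a callable is used.
    def split_stem(rel_path: Path, sep: str):
        example_id, key = rel_path.stem.rsplit("_", 1)
        return example_id, key  # e.g. ('utt1', 'gen') for 'utt1_gen.wav'

    ds = from_dir("tts_out", suffix=".wav", parents=split_stem)
    # Each example then holds both paths, e.g. under the keys 'gen' and 'ref'.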


def concatenate(*datasets):
"""
Create a new `Dataset` by concatenation of all passed datasets.