-
Notifications
You must be signed in to change notification settings - Fork 8
Add from_path utility #67
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
a0cb523
4ccd8ce
df7c4e9
99d7c8f
a7d80a7
fafa88a
1cfade8
18dc877
baac3cf
ebc1737
64b01ae
edc6807
4b84e9c
be1b066
2d5455c
a7c4154
0ca7c2d
0c8f0a7
6c7e67a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -12,7 +12,7 @@ | |
| import datetime | ||
|
|
||
| import numpy as np | ||
| from typing import Optional, Union, Any, List, Dict | ||
| from typing import Optional, Union, Any, List, Dict, Callable, Tuple | ||
|
|
||
| LOG = logging.getLogger('lazy_dataset') | ||
|
|
||
|
|
@@ -232,6 +232,204 @@ def from_dataset( | |
| immutable_warranty=immutable_warranty, name=name) | ||
|
|
||
|
|
||
| def _make_example_id( | ||
| file_path: Path, parents: Union[int, Callable], sep: str = "/", | ||
| ) -> Tuple[str, str]: | ||
| """Create example IDs from file paths. | ||
|
|
||
| See `from_dir` for usage. | ||
|
|
||
| parents: | ||
| * 0: '{example_id}.{key}', all folders are considered as part of example | ||
| ID. | ||
| * > 0: '{example_id}/{key}', abs(parents) is the number of folders to | ||
| consider as part of example ID. | ||
| * < 0: '{key}/{example_id}', abs(parents) is the number of path parts to | ||
| consider as part of example ID. | ||
|
|
||
| >>> def test(parents, paths): | ||
| ... for p in paths: | ||
| ... example_id, key = _make_example_id(p, parents) | ||
| ... print(f'ex[{example_id!r}][{key!r}] = root / {str(p)!r}') | ||
|
|
||
| # Assuming all files are in the 'root' directory | ||
| # and the naming is '{example_id}.{key}', where key is the file extension. | ||
| >>> test(0, [Path('ex1.wav'), Path('ex1.txt'), Path('ex2.wav'), Path('ex2.txt')]) | ||
| ex['ex1']['wav'] = root / 'ex1.wav' | ||
| ex['ex1']['txt'] = root / 'ex1.txt' | ||
| ex['ex2']['wav'] = root / 'ex2.wav' | ||
| ex['ex2']['txt'] = root / 'ex2.txt' | ||
|
|
||
| # Assuming an folder per example, i.e., '{example_id}/{key}', | ||
| # where key is the file name with extension. | ||
| >>> test(1, [Path('ex1/audio/1.wav'), Path('ex1/audio/2.txt'), Path('ex1/meta.json')]) | ||
| ex['ex1']['audio/1.wav'] = root / 'ex1/audio/1.wav' | ||
| ex['ex1']['audio/2.txt'] = root / 'ex1/audio/2.txt' | ||
| ex['ex1']['meta.json'] = root / 'ex1/meta.json' | ||
|
|
||
| # Assuming file stem is example ID, i.e., '{key}/{example_id}.{ext}', | ||
| # where key is the path up to the specified parent folder (-1: parent | ||
| # folder, -2: grandparent folder, ...). | ||
| >>> test(-1, [Path('audio/ex1.wav'), Path('meta/txt/ex1.txt')]) | ||
| ex['ex1']['audio'] = root / 'audio/ex1.wav' | ||
| ex['ex1']['meta/txt'] = root / 'meta/txt/ex1.txt' | ||
|
|
||
| """ | ||
| if callable(parents): | ||
| return parents(file_path, sep) | ||
|
|
||
| if parents == 0: | ||
| return ( | ||
| sep.join(file_path.with_suffix("").parts[parents:]), | ||
| file_path.suffix.lstrip('.') | ||
| ) | ||
| if parents > 0: | ||
| return ( | ||
| sep.join(file_path.with_suffix("").parts[:parents]), | ||
| sep.join(file_path.parts[parents:]) | ||
| ) | ||
| if parents < 0: | ||
| return ( | ||
| sep.join(file_path.with_suffix("").parts[parents:]), | ||
| sep.join(file_path.parts[:parents]) | ||
| ) | ||
|
|
||
| raise ValueError( | ||
| f"parents must be either int or Callable, but got {type(parents)}" | ||
| ) | ||
|
|
||
|
|
||
| def from_dir( | ||
| root: Union[str, Path], | ||
| suffix: Union[str, List[str]], | ||
| immutable_warranty: str = 'pickle', | ||
| name: str = None, | ||
| parents: Union[Callable, int] = 0, | ||
| sep: str = "/", | ||
| ) -> "DictDataset": | ||
| """Create a new DictDataset from a directory path. | ||
|
|
||
| Scan and include all files in `root` that end with a suffix in `suffix`. | ||
| New examples are created for each unique file stem. The example ID is | ||
| derived from the file path. | ||
|
|
||
| Note: | ||
| This function is not intended for frequently-used large datasets, since | ||
| the file indexing overhead can get significant (instead, see | ||
| `from_file` or `lazy_dataset.database.JsonDatabase`). For one-time | ||
| small datasets, it is a convenient way to load them. | ||
|
|
||
| >>> import tempfile | ||
| >>> temp_dir = tempfile.TemporaryDirectory() | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does this leak a folder? Or does the destructor of temp_dir do some cleanup?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The cleanup is not done automatically unless used as a context manager. I added a call to |
||
| >>> fp = Path(temp_dir.name) / "test1.txt" | ||
| >>> fp.touch() | ||
| >>> fp = Path(temp_dir.name) / "test1.wav" | ||
| >>> fp.touch() | ||
| >>> ds = from_dir(temp_dir.name, suffix=".txt") | ||
| >>> ds | ||
| DictDataset(len=1) | ||
| MapDataset(_pickle.loads) | ||
| >>> ds[0] # doctest: +ELLIPSIS | ||
| {'example_id': 'test1', 'txt': PosixPath('.../test1.txt')} | ||
|
|
||
| >>> ds = from_dir(temp_dir.name, suffix=[".txt", ".wav"]) | ||
| >>> ds | ||
| DictDataset(len=1) | ||
| MapDataset(_pickle.loads) | ||
| >>> ds[0] # doctest: +ELLIPSIS | ||
| {'example_id': 'test1', 'txt': PosixPath('.../test1.txt'), 'wav': PosixPath('.../test1.wav')} | ||
|
|
||
| # If keys are colliding, an AssertionError is raised | ||
| >>> (Path(temp_dir.name) / "collision").mkdir(); (Path(temp_dir.name) / "collision" / "test1.txt").touch() | ||
| >>> from_dir(temp_dir.name, suffix=".txt", parents=lambda fp, sep: (fp.stem, fp.suffix.lstrip('.'))) # doctest: +ELLIPSIS | ||
| Traceback (most recent call last): | ||
| ... | ||
| AssertionError: Duplicate key 'txt' for example_id 'test1' found! | ||
| File: .../test1.txt, existing: .../collision/test1.txt | ||
| Consider changing the 'parents' argument to create unique keys for the same example ID. | ||
|
|
||
| # Change the parents argument to avoid key collisions | ||
| >>> ds = from_dir(temp_dir.name, suffix=".txt", parents=0) | ||
| >>> ds | ||
| DictDataset(len=2) | ||
| MapDataset(_pickle.loads) | ||
| >>> ds[0] # doctest: +ELLIPSIS | ||
| {'example_id': 'collision/test1', 'txt': PosixPath('.../collision/test1.txt')} | ||
| >>> ds[1] # doctest: +ELLIPSIS | ||
| {'example_id': 'test1', 'txt': PosixPath('.../test1.txt')} | ||
|
|
||
| >>> temp_dir.cleanup() # remove temporary files | ||
|
|
||
| Args: | ||
| root (Union[str, Path]): Root directory to scan for files. | ||
| suffix (Union[str, List[str]]): List of file suffixes to scan for. | ||
| Files with these suffixes will be added to the dataset. | ||
| immutable_warranty (str, optional): | ||
| name (str, optional): | ||
| parents (Union[Callable, int], optional): Defines how to create | ||
| examples. Options are: | ||
| * `int`: Level of parent folders to include in the example ID. | ||
| * 0: Assumes a flat folder structure. The file stem is the | ||
| example ID and the suffix is the key to access the file | ||
| path in the dataset. | ||
| * > 0: Assumes a folder per example. Example ID is the folder | ||
| and the file name (with extension) is the key to access the | ||
| file path in the dataset. | ||
| * < 0: Assumes that files belonging to the same example are | ||
| stored in disjoint folders. File stem is the example ID | ||
| and the folder is the key to access the file path in the | ||
| dataset. | ||
| * `Callable`: A function that takes the relative file path to `root` | ||
| and the seperator argument `sep` as inputs and returns an | ||
| example ID and key to access the file path in the dataset. This | ||
| allows to create custom example IDs. | ||
|
|
||
| Defaults to 0. If non-unique keys under the same example ID are | ||
| created, an AssertionError is raised. | ||
| sep (str, optional): Separator to use for joining folder names. | ||
| Defaults to "/". | ||
|
|
||
| Returns: | ||
| DictDataset: A dataset containing the scanned files. | ||
| """ | ||
| from collections import defaultdict | ||
| import os | ||
| # Adapted from https://stackoverflow.com/a/59803793/16085876 | ||
| def _run_fast_scandir(root: os.PathLike, ext: List[str]): | ||
| for f in os.scandir(root): | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How about keeping that function simpler? def _run_fast_scandir(root: Path, ext: List[str]):
for f in os.scandir(root):
if f.is_dir():
yield from _run_fast_scandir(f)
if f.is_file():
if any(f.name.endswith(e) for e in ext):
yield Path(f.path)
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done. I remember that I had a very special case where I wanted to match only part of the extension, but I can't reproduce that scenario anymore. I adopted your suggestion.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok. If you need that again, we could allow callables for verification. |
||
| if f.is_dir(): | ||
| yield from _run_fast_scandir(f.path, ext) | ||
| if f.is_file(): | ||
| if any(f.name.endswith(e) for e in ext): | ||
| yield Path(f.path) | ||
|
|
||
| if isinstance(suffix, str): | ||
| suffix = [suffix] | ||
| files = _run_fast_scandir(Path(root), suffix) | ||
| files = map(Path, files) | ||
| examples = defaultdict(dict) | ||
| for file in sorted(files): | ||
| example_id, key = _make_example_id( | ||
| file.relative_to(root), parents=parents, sep=sep | ||
| ) | ||
| examples[example_id]["example_id"] = example_id | ||
| if key in examples[example_id]: | ||
| raise AssertionError( | ||
| f"Duplicate key '{key}' for example_id '{example_id}' found!\n" | ||
| f"File: {file}, existing: {examples[example_id][key]}\n" | ||
| "Consider changing the 'parents' argument to create unique " | ||
| "keys for the same example ID." | ||
| ) | ||
| if len(key) == 0: | ||
| LOG.warning( | ||
| "key has zero length! Check your directory structure!\n" | ||
| "file: %s, relative file path: %s, parents=%r", | ||
| file, file.relative_to(root), parents, | ||
| ) | ||
| examples[example_id][key] = file | ||
| return from_dict(examples, immutable_warranty, name) | ||
|
|
||
|
|
||
| def concatenate(*datasets): | ||
| """ | ||
| Create a new `Dataset` by concatenation of all passed datasets. | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you add a small text for a motivation and a warning that this function should usually not be used? I lack currently an motivation, when this function is recommented. (Scanning directories with large number of files shouldn't e done on demand.)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I often need to evaluate audio files that were generated by TTS systems. I usually store them in a single directory (sometimes with a nested structure), and this is a convenient way to quickly obtain an iterable over these files (e.g., here). I could add a warning that is raised in the beginning to be cautious with this function. Otherwise, I would delegate the responsibility to the user.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah, ok.
How about adding something like the following to the doc string (Feel free to improve the text):
Note: This function is not intended to be used for frequently used large datasets,
since the indexing overhead can get significant
For one time small datasets, it is a convenient way to load them.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.