From 20e50f8e09e4b72416056727cf1a25f5351939de Mon Sep 17 00:00:00 2001
From: Rick Izzo
Date: Sun, 10 Nov 2019 11:12:29 -0500
Subject: [PATCH 1/3] initial work on adding sampler

---
 src/hangar/arrayset.py            |  13 ++
 src/hangar/dataloaders/common.py  |   8 +-
 src/hangar/dataloaders/sampler.py | 217 ++++++++++++++++++++++++++++++
 3 files changed, 234 insertions(+), 4 deletions(-)
 create mode 100644 src/hangar/dataloaders/sampler.py

diff --git a/src/hangar/arrayset.py b/src/hangar/arrayset.py
index ebf489d8..cc9e856f 100644
--- a/src/hangar/arrayset.py
+++ b/src/hangar/arrayset.py
@@ -1,3 +1,4 @@
+from collections import defaultdict
 import os
 import warnings
 from multiprocessing import cpu_count, get_context
@@ -304,6 +305,18 @@ def backend_opts(self):
         """
         return self._dflt_backend_opts
 
+    @property
+    def sample_classes(self):
+        grouped_spec_names = defaultdict(list)
+        for name, bespec in self._sspecs.items():
+            grouped_spec_names[bespec].append(name)
+
+        grouped_data_names = {}
+        for spec, names in grouped_spec_names.items():
+            data = self._fs[spec.backend].read_data(spec)
+            grouped_data_names[tuple(data.tolist())] = names
+        return grouped_data_names
+
     def keys(self, local: bool = False) -> Iterator[Union[str, int]]:
         """generator which yields the names of every sample in the arrayset
 
diff --git a/src/hangar/dataloaders/common.py b/src/hangar/dataloaders/common.py
index c711785b..aba41645 100644
--- a/src/hangar/dataloaders/common.py
+++ b/src/hangar/dataloaders/common.py
@@ -33,7 +33,7 @@ def __init__(self,
         if len(arraysets) == 0:
             raise ValueError('len(arraysets) cannot == 0')
 
-        aset_lens = set()
+        # aset_lens = set()
         all_keys = []
         all_remote_keys = []
         for aset in arraysets:
@@ -41,12 +41,12 @@ def __init__(self,
                 raise TypeError(f'Cannot load arraysets opened in `write-enabled` checkout.')
             self.arrayset_array.append(aset)
             self.arrayset_names.append(aset.name)
-            aset_lens.add(len(aset))
+            # aset_lens.add(len(aset))
             all_keys.append(set(aset.keys()))
             all_remote_keys.append(set(aset.remote_reference_keys))
 
-        if len(aset_lens) > 1:
-            warnings.warn('Arraysets do not contain equal number of samples', UserWarning)
+        # if len(aset_lens) > 1:
+        #     warnings.warn('Arraysets do not contain equal number of samples', UserWarning)
 
         common_keys = set.intersection(*all_keys)
         remote_keys = set.union(*all_remote_keys)

diff --git a/src/hangar/dataloaders/sampler.py b/src/hangar/dataloaders/sampler.py
new file mode 100644
index 00000000..4fce7c38
--- /dev/null
+++ b/src/hangar/dataloaders/sampler.py
@@ -0,0 +1,217 @@
+import numpy as np
+
+
+def pnorm(p):
+    if not isinstance(p, (list, tuple)):
+        raise ValueError(f'probability map {p} must be of type (list, tuple), not {type(p)}')
+    ptot = np.sum(p)
+    if not np.allclose(ptot, 1):
+        p = [i / ptot for i in p]
+    return p
+
+
+def multinomial(num_samples, p):
+    valid_p = pnorm(p)
+    res = np.random.multinomial(num_samples, valid_p)
+    return res
+
+
+class Sampler(object):
+    r"""Base class for all Samplers.
+
+    Every Sampler subclass has to provide an :meth:`__iter__` method, providing a
+    way to iterate over indices of dataset elements, and a :meth:`__len__` method
+    that returns the length of the returned iterators.
+
+    .. note:: The :meth:`__len__` method isn't strictly required by
+        :class:`~torch.utils.data.DataLoader`, but is expected in any
+        calculation involving the length of a :class:`~torch.utils.data.DataLoader`.
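+
+    A minimal subclass sketch (illustrative only; the class name and key
+    handling are hypothetical, not part of this module):
+
+    >>> class ListSampler(Sampler):
+    ...     def __init__(self, data_source):
+    ...         self.names = list(data_source.keys())
+    ...     def __iter__(self):
+    ...         return iter(self.names)
+    ...     def __len__(self):
+    ...         return len(self.names)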
+ """ + + def __init__(self, data_source): + pass + + def __iter__(self): + raise NotImplementedError + + # NOTE [ Lack of Default `__len__` in Python Abstract Base Classes ] + # + # Many times we have an abstract class representing a collection/iterable of + # data, e.g., `torch.utils.data.Sampler`, with its subclasses optionally + # implementing a `__len__` method. In such cases, we must make sure to not + # provide a default implementation, because both straightforward default + # implementations have their issues: + # + # + `return NotImplemented`: + # Calling `len(subclass_instance)` raises: + # TypeError: 'NotImplementedType' object cannot be interpreted as an integer + # + # + `raise NotImplementedError()`: + # This prevents triggering some fallback behavior. E.g., the built-in + # `list(X)` tries to call `len(X)` first, and executes a different code + # path if the method is not found or `NotImplemented` is returned, while + # raising an `NotImplementedError` will propagate and and make the call + # fail where it could have use `__iter__` to complete the call. + # + # Thus, the only two sensible things to do are + # + # + **not** provide a default `__len__`. + # + # + raise a `TypeError` instead, which is what Python uses when users call + # a method that is not defined on an object. + # (@ssnl verifies that this works on at least Python 3.7.) + + +class SequentialSampler(Sampler): + r"""Samples elements sequentially, always in the same order. + Arguments: + data_source (Dataset): dataset to sample from + """ + + def __init__(self, data_source): + self.data_source = data_source + + def __iter__(self): + return iter(self.data_source.keys()) + + def __len__(self): + return len(self.data_source) + + +class RandomSampler(Sampler): + r"""Samples elements randomly. If without replacement, then sample from a shuffled dataset. + If with replacement, then user can specify :attr:`num_samples` to draw. + Arguments: + data_source (Dataset): dataset to sample from + replacement (bool): samples are drawn with replacement if ``True``, default=``False`` + num_samples (int): number of samples to draw, default=`len(dataset)`. This argument + is supposed to be specified only when `replacement` is ``True``. 
+ """ + + def __init__(self, data_source, replacement=False, num_samples=None): + self.data_source = data_source + self.replacement = replacement + self._num_samples = num_samples + + if not isinstance(self.replacement, bool): + raise ValueError("replacement should be a boolean value, but got " + "replacement={}".format(self.replacement)) + + if self._num_samples is not None and not replacement: + raise ValueError("With replacement=False, num_samples should not be specified, " + "since a random permute will be performed.") + + if not isinstance(self.num_samples, int) or self.num_samples <= 0: + raise ValueError("num_samples should be a positive integer " + "value, but got num_samples={}".format(self.num_samples)) + + @property + def num_samples(self): + # dataset size might change at runtime + if self._num_samples is None: + return len(self.data_source) + return self._num_samples + + def __iter__(self): + n = len(self.data_source) + keys = list(self.data_source.keys()) + if self.replacement: + choose = np.random.randint(low=0, high=n, size=(self.num_samples,), dtype=np.int64).tolist() + return (keys[x] for x in choose) + choose = np.random.permutation(self.num_samples) + return (keys[x] for x in choose) + + def __len__(self): + return self.num_samples + + +class SubsetRandomSampler(Sampler): + r"""Samples elements randomly from a given list of indices, without replacement. + Arguments: + indices (sequence): a sequence of indices + """ + + def __init__(self, indices): + self.indices = indices + + def __iter__(self): + choose = np.random.permutation(len(self.indices)) + return (self.indices[x] for x in choose) + + def __len__(self): + return len(self.indices) + + +class WeightedRandomSampler(Sampler): + r"""Samples elements from ``[0,..,len(weights)-1]`` with given probabilities (weights). + Args: + weights (sequence) : a sequence of weights, not necessary summing up to one + num_samples (int): number of samples to draw + replacement (bool): if ``True``, samples are drawn with replacement. + If not, they are drawn without replacement, which means that when a + sample index is drawn for a row, it cannot be drawn again for that row. + Example: + >>> list(WeightedRandomSampler([0.1, 0.9, 0.4, 0.7, 3.0, 0.6], 5, replacement=True)) + [0, 0, 0, 1, 0] + >>> list(WeightedRandomSampler([0.9, 0.4, 0.05, 0.2, 0.3, 0.1], 5, replacement=False)) + [0, 1, 4, 3, 2] + """ + + def __init__(self, weights, num_samples): + if not isinstance(num_samples, int) or isinstance(num_samples, bool) or \ + num_samples <= 0: + raise ValueError("num_samples should be a positive integer " + "value, but got num_samples={}".format(num_samples)) + self.weights = tuple(weights) + self.num_samples = num_samples + + def __iter__(self): + return iter(multinomial(self.num_samples, self.weights)) + + def __len__(self): + return self.num_samples + + +class BatchSampler(Sampler): + r"""Wraps another sampler to yield a mini-batch of indices. + Args: + sampler (Sampler): Base sampler. + batch_size (int): Size of mini-batch. 
+
+
+class BatchSampler(Sampler):
+    r"""Wraps another sampler to yield a mini-batch of indices.
+
+    Args:
+        sampler (Sampler): Base sampler.
+        batch_size (int): Size of mini-batch.
+        drop_last (bool): If ``True``, the sampler will drop the last batch if
+            its size would be less than ``batch_size``
+
+    Example:
+        >>> list(BatchSampler(SequentialSampler(range(10)), batch_size=3, drop_last=False))
+        [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
+        >>> list(BatchSampler(SequentialSampler(range(10)), batch_size=3, drop_last=True))
+        [[0, 1, 2], [3, 4, 5], [6, 7, 8]]
+    """
+
+    def __init__(self, sampler, batch_size, drop_last):
+        if not isinstance(sampler, Sampler):
+            raise ValueError("sampler should be an instance of "
+                             "torch.utils.data.Sampler, but got sampler={}"
+                             .format(sampler))
+        if not isinstance(batch_size, int) or isinstance(batch_size, bool) or \
+                batch_size <= 0:
+            raise ValueError("batch_size should be a positive integer value, "
+                             "but got batch_size={}".format(batch_size))
+        if not isinstance(drop_last, bool):
+            raise ValueError("drop_last should be a boolean value, but got "
+                             "drop_last={}".format(drop_last))
+        self.sampler = sampler
+        self.batch_size = batch_size
+        self.drop_last = drop_last
+
+    def __iter__(self):
+        batch = []
+        for idx in self.sampler:
+            batch.append(idx)
+            if len(batch) == self.batch_size:
+                yield batch
+                batch = []
+        if len(batch) > 0 and not self.drop_last:
+            yield batch
+
+    def __len__(self):
+        if self.drop_last:
+            return len(self.sampler) // self.batch_size
+        else:
+            return (len(self.sampler) + self.batch_size - 1) // self.batch_size
\ No newline at end of file

From 426d3dd5d01797a1127ee8252674d7438060f1a1 Mon Sep 17 00:00:00 2001
From: Rick Izzo
Date: Sun, 10 Nov 2019 22:44:23 -0500
Subject: [PATCH 2/3] working - but extremely primitive - method to sample and
 batch arrayset groups

---
 src/hangar/__init__.py             |   1 -
 src/hangar/arrayset.py             |  17 +-
 src/hangar/dataloaders/__init__.py |   3 +
 src/hangar/dataloaders/grouper.py  | 125 +++++++++++++
 src/hangar/dataloaders/sampler.py  | 285 +++++++++++++++++++++--------
 5 files changed, 336 insertions(+), 95 deletions(-)
 create mode 100644 src/hangar/dataloaders/grouper.py

diff --git a/src/hangar/__init__.py b/src/hangar/__init__.py
index 30d20d27..aea64317 100644
--- a/src/hangar/__init__.py
+++ b/src/hangar/__init__.py
@@ -8,7 +8,6 @@ def raise_ImportError(message, *args, **kwargs):
     raise ImportError(message)
 
-
 try:
     from .dataloaders.tfloader import make_tf_dataset
 except ImportError:

diff --git a/src/hangar/arrayset.py b/src/hangar/arrayset.py
index cc9e856f..212c3b92 100644
--- a/src/hangar/arrayset.py
+++ b/src/hangar/arrayset.py
@@ -27,8 +27,9 @@
 from .records.parsing import arrayset_record_schema_db_val_from_raw_val
 
-CompatibleArray = NamedTuple(
-    'CompatibleArray', [('compatible', bool), ('reason', str)])
+CompatibleArray = NamedTuple('CompatibleArray', [
+    ('compatible', bool),
+    ('reason', str)])
 
 
 class ArraysetDataReader(object):
@@ -305,18 +306,6 @@ def backend_opts(self):
         """
         return self._dflt_backend_opts
 
-    @property
-    def sample_classes(self):
-        grouped_spec_names = defaultdict(list)
-        for name, bespec in self._sspecs.items():
-            grouped_spec_names[bespec].append(name)
-
-        grouped_data_names = {}
-        for spec, names in grouped_spec_names.items():
-            data = self._fs[spec.backend].read_data(spec)
-            grouped_data_names[tuple(data.tolist())] = names
-        return grouped_data_names
-
     def keys(self, local: bool = False) -> Iterator[Union[str, int]]:
         """generator which yields the names of every sample in the arrayset
diff --git a/src/hangar/dataloaders/__init__.py b/src/hangar/dataloaders/__init__.py
index e69de29b..68687325 100644
--- a/src/hangar/dataloaders/__init__.py
+++ b/src/hangar/dataloaders/__init__.py
@@ -0,0 +1,3 @@
+from .grouper import GroupedArraysetDataReader
+
+__all__ = ['GroupedArraysetDataReader']
\ No newline at end of file

diff --git a/src/hangar/dataloaders/grouper.py b/src/hangar/dataloaders/grouper.py
new file mode 100644
index 00000000..0ba618ac
--- /dev/null
+++ b/src/hangar/dataloaders/grouper.py
@@ -0,0 +1,125 @@
+import numpy as np
+
+from ..arrayset import ArraysetDataReader
+
+from collections import defaultdict
+import hashlib
+from typing import Sequence, Union, Iterable, NamedTuple
+import struct
+
+
+# -------------------------- typehints ---------------------------------------
+
+
+ArraysetSampleNames = Sequence[Union[str, int]]
+
+SampleGroup = NamedTuple('SampleGroup', [
+    ('group', np.ndarray),
+    ('samples', Union[str, int])])
+
+
+# ------------------------------------------------------------------------------
+
+
+def _calculate_hash_digest(data: np.ndarray) -> str:
+    hasher = hashlib.blake2b(data, digest_size=20)
+    hasher.update(struct.pack(f'<{len(data.shape)}QB', *data.shape, data.dtype.num))
+    digest = hasher.hexdigest()
+    return digest
+
+
+class FakeNumpyKeyDict(object):
+    def __init__(self, group_spec_samples, group_spec_value, group_digest_spec):
+        self._group_spec_samples = group_spec_samples
+        self._group_spec_value = group_spec_value
+        self._group_digest_spec = group_digest_spec
+
+    def __getitem__(self, key: np.ndarray) -> ArraysetSampleNames:
+        digest = _calculate_hash_digest(key)
+        spec = self._group_digest_spec[digest]
+        samples = self._group_spec_samples[spec]
+        return samples
+
+    def get(self, key: np.ndarray) -> ArraysetSampleNames:
+        return self.__getitem__(key)
+
+    def __setitem__(self, key, val):
+        raise PermissionError('Not User Editable')
+
+    def __delitem__(self, key):
+        raise PermissionError('Not User Editable')
+
+    def __len__(self) -> int:
+        return len(self._group_digest_spec)
+
+    def __contains__(self, key: np.ndarray) -> bool:
+        digest = _calculate_hash_digest(key)
+        res = True if digest in self._group_digest_spec else False
+        return res
+
+    def __iter__(self) -> Iterable[np.ndarray]:
+        for spec in self._group_digest_spec.values():
+            yield self._group_spec_value[spec]
+
+    def keys(self) -> Iterable[np.ndarray]:
+        for spec in self._group_digest_spec.values():
+            yield self._group_spec_value[spec]
+
+    def values(self) -> Iterable[ArraysetSampleNames]:
+        for spec in self._group_digest_spec.values():
+            yield self._group_spec_samples[spec]
+
+    def items(self) -> Iterable[ArraysetSampleNames]:
+        for spec in self._group_digest_spec.values():
+            yield (self._group_spec_value[spec], self._group_spec_samples[spec])
+
+    def __repr__(self):
+        # build and return a string; printing from __repr__ and returning None
+        # would raise a TypeError whenever the object is echoed
+        res = 'Mapping: Group Data Value -> Sample Name\n'
+        for k, v in self.items():
+            res += f'{k} :: {v}\n'
+        return res
+
+    def _repr_pretty_(self, p, cycle):
+        res = f'Mapping: Group Data Value -> Sample Name \n'
+        for k, v in self.items():
+            res += f'\n {k} :: {v}'
+        p.text(res)
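+
+
+# Illustrative behavior sketch (comments only; values are hypothetical). The
+# dict is keyed by array *content*: lookups hash the query array with the same
+# digest routine used at setup, so any array equal in value, shape, and dtype
+# to a stored group value resolves to that group's sample names:
+#
+#   >>> gd = grouped_reader.group_samples      # a FakeNumpyKeyDict instance
+#   >>> gd[np.array([0, 1])]
+#   ['sample_1', 'sample_4', 'sample_7']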
+
+
+# ---------------------------- MAIN METHOD ------------------------------------
+
+
+class GroupedArraysetDataReader(object):
+    '''Pass in an arrayset and automatically find sample groups.
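+
+    Example (illustrative sketch; ``co`` is a hypothetical read-only checkout
+    whose ``'labels'`` arrayset holds a handful of distinct values):
+
+    >>> grouped = GroupedArraysetDataReader(co.arraysets['labels'])
+    >>> list(grouped.groups)                    # doctest: +SKIP
+    [array([0]), array([1])]
+    >>> grouped.group_samples[np.array([1])]    # doctest: +SKIP
+    ['sample_3', 'sample_8']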
+    '''
+
+    def __init__(self, arrayset: ArraysetDataReader, *args, **kwargs):
+
+        self.__arrayset = arrayset  # TODO: Do we actually need to keep this around?
+        self._group_spec_samples = defaultdict(list)
+        self._group_spec_value = {}
+        self._group_digest_spec = {}
+
+        self._setup()
+        self._group_samples = FakeNumpyKeyDict(
+            self._group_spec_samples,
+            self._group_spec_value,
+            self._group_digest_spec)
+
+    def _setup(self):
+        for name, bespec in self.__arrayset._sspecs.items():
+            self._group_spec_samples[bespec].append(name)
+        for spec, names in self._group_spec_samples.items():
+            data = self.__arrayset._fs[spec.backend].read_data(spec)
+            self._group_spec_value[spec] = data
+            digest = _calculate_hash_digest(data)
+            self._group_digest_spec[digest] = spec
+
+    @property
+    def groups(self) -> Iterable[np.ndarray]:
+        for spec in self._group_digest_spec.values():
+            yield self._group_spec_value[spec]
+
+    @property
+    def group_samples(self):
+        return self._group_samples
\ No newline at end of file

diff --git a/src/hangar/dataloaders/sampler.py b/src/hangar/dataloaders/sampler.py
index 4fce7c38..b152ab0e 100644
--- a/src/hangar/dataloaders/sampler.py
+++ b/src/hangar/dataloaders/sampler.py
@@ -1,29 +1,90 @@
+from typing import Sequence, Union, List, Iterable
+
 import numpy as np
 
+from ..arrayset import ArraysetDataReader
+
+
+# -------------------------- typehints ---------------------------------------
+
+
+ArraysetSampleNames = Sequence[Union[str, int]]
+
 
-def pnorm(p):
-    if not isinstance(p, (list, tuple)):
-        raise ValueError(f'probability map {p} must be of type (list, tuple), not {type(p)}')
+# --------------------- sampling functions ------------------------------------
+
+
+def _p_normalize(p: Sequence[float]) -> List[float]:
     ptot = np.sum(p)
     if not np.allclose(ptot, 1):
         p = [i / ptot for i in p]
     return p
 
 
-def multinomial(num_samples, p):
-    valid_p = pnorm(p)
-    res = np.random.multinomial(num_samples, valid_p)
-    return res
+def _multinomial(num_samples: int, p: Sequence[float], replacement: bool) -> List[int]:
+    """Draw samples from a multinomial distribution.
+
+    The multinomial distribution is a multivariate generalization of the
+    binomial distribution. Take an experiment with one of ``p`` possible
+    outcomes. An example of such an experiment is rolling a die, where the
+    outcome can be 1 through 6. Each sample drawn from the distribution
+    represents `n` such experiments. Its values, ``X_i = [X_0, X_1, ...,
+    X_p]``, represent the number of times the outcome was ``i``.
+
+    Parameters
+    ----------
+    num_samples : int
+        number of samples to draw from the probabilities
+
+    p : Sequence[float]
+        Input list containing probabilities of drawing a specific category. The
+        elements in ``p`` do not need to sum to one (in which case we normalize
+        and use the values as weights), but must be non-negative, finite and
+        have a non-zero sum.
+
+    replacement : bool
+        Whether or not to draw with replacement
+
+    Returns
+    -------
+    List[int]
+        Contains ``num_samples`` indices sampled from the multinomial probability
+        distribution located in the corresponding row of ``p`` probabilities.
+
+    Raises
+    ------
+    ValueError
+        If probability arg is not a Sequence (list or tuple) of len > 0
+    """
+    if not isinstance(p, Sequence) or (len(p) == 0):
+        raise ValueError(f'probability arg must be sequence of len > 0, {p} is invalid')
+    if not all((i >= 0 for i in p)) or not any((i > 0 for i in p)) or not np.all(np.isfinite(p)):
+        raise ValueError(f'probs {p} invalid. all must be >= 0, finite, and have non-zero sum')
+
+    valid_p = _p_normalize(p)
+    # `np.random.choice` covers both the with- and without-replacement cases
+    # and returns sampled *indices*; `np.random.multinomial` would instead
+    # return per-category counts, which is not what callers iterate over.
+    idxs = np.random.choice(len(p), size=num_samples, replace=replacement, p=valid_p)
+    return idxs.tolist()
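+
+
+# Illustrative sketch of the helper above (hypothetical weights; real output
+# is random). Indices are drawn in proportion to the normalized weights:
+#
+#   >>> _multinomial(4, [1, 1, 2], replacement=True)    # doctest: +SKIP
+#   [2, 0, 2, 1]
+#   >>> _multinomial(3, [1, 1, 2], replacement=False)   # doctest: +SKIP
+#   [2, 0, 1]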
+
+
+# -------------------------- sampler methods ----------------------------------
 
 
 class Sampler(object):
-    r"""Base class for all Samplers.
+    """Base class for all Samplers.
+
     Every Sampler subclass has to provide an :meth:`__iter__` method, providing a
     way to iterate over indices of dataset elements, and a :meth:`__len__` method
     that returns the length of the returned iterators.
-    .. note:: The :meth:`__len__` method isn't strictly required by
-        :class:`~torch.utils.data.DataLoader`, but is expected in any
-        calculation involving the length of a :class:`~torch.utils.data.DataLoader`.
+
+    .. note::
+
+        The :meth:`__len__` method isn't strictly required by
+        :class:`~torch.utils.data.DataLoader`, but is expected in any calculation
+        involving the length of a :class:`~torch.utils.data.DataLoader`.
     """
 
     def __init__(self, data_source):
@@ -61,134 +122,198 @@ def __iter__(self):
 
 
 class SequentialSampler(Sampler):
-    r"""Samples elements sequentially, always in the same order.
-    Arguments:
-        data_source (Dataset): dataset to sample from
+    """Samples elements sequentially, always in the same order.
+
+    Order of keys is numeric first, then lexicographically sorted strings.
+
+    Parameters
+    ----------
+    data_source : ArraysetDataReader
+        arrayset to derive sample names from.
+
+    TODO
+    ----
+    - Discussion: ordering holds so long as the ArraysetDataReader is not
+      write enabled (the _sspecs dict is not able to mutate). Even at that,
+      this is only guaranteed due to the implicitly ordered dictionary
+      behavior of Python 3.6+. However, this could potentially change in the
+      future if we ever decided to store the sample keys in a different
+      format or change up how the backend actually stores data. Is sorted
+      order something we want to guarantee?
     """
 
-    def __init__(self, data_source):
+    def __init__(self, data_source: ArraysetDataReader):
         self.data_source = data_source
 
-    def __iter__(self):
+    def __iter__(self) -> Iterable[ArraysetSampleNames]:
         return iter(self.data_source.keys())
 
-    def __len__(self):
+    def __len__(self) -> int:
         return len(self.data_source)
 
 
 class RandomSampler(Sampler):
-    r"""Samples elements randomly. If without replacement, then sample from a shuffled dataset.
-    If with replacement, then user can specify :attr:`num_samples` to draw.
-    Arguments:
-        data_source (Dataset): dataset to sample from
-        replacement (bool): samples are drawn with replacement if ``True``, default=``False``
-        num_samples (int): number of samples to draw, default=`len(dataset)`. This argument
-            is supposed to be specified only when `replacement` is ``True``.
+    """Sample names randomly from an arrayset.
+
+    If without replacement, then sample from a shuffled set of names. If with
+    replacement, then the user can specify :attr:`num_samples` to draw.
+
+    Parameters
+    ----------
+    data_source : ArraysetDataReader
+        arrayset to sample names from
+    replacement : bool, optional
+        Samples are drawn with replacement if ``True``, by default ``False``
+    num_samples : int, optional
+        number of samples to draw, default=`len(dataset)`. This argument is
+        supposed to be specified only when `replacement` is ``True``, by
+        default ``None``
     """
 
-    def __init__(self, data_source, replacement=False, num_samples=None):
+    def __init__(self,
+                 data_source: ArraysetDataReader,
+                 replacement: bool = False,
+                 num_samples: int = None):
+
         self.data_source = data_source
         self.replacement = replacement
         self._num_samples = num_samples
 
         if not isinstance(self.replacement, bool):
-            raise ValueError("replacement should be a boolean value, but got "
-                             "replacement={}".format(self.replacement))
-
+            raise ValueError(f"replacement must be boolean. Not {self.replacement}")
         if self._num_samples is not None and not replacement:
             raise ValueError("With replacement=False, num_samples should not be specified, "
                              "since a random permutation will be performed.")
-
         if not isinstance(self.num_samples, int) or self.num_samples <= 0:
-            raise ValueError("num_samples should be a positive integer "
-                             "value, but got num_samples={}".format(self.num_samples))
+            raise ValueError(f"num_samples should be int >= 1, not {self.num_samples}")
 
     @property
-    def num_samples(self):
+    def num_samples(self) -> int:
         # dataset size might change at runtime
         if self._num_samples is None:
             return len(self.data_source)
         return self._num_samples
 
-    def __iter__(self):
-        n = len(self.data_source)
-        keys = list(self.data_source.keys())
+    def __iter__(self) -> Iterable[ArraysetSampleNames]:
         if self.replacement:
-            choose = np.random.randint(low=0, high=n, size=(self.num_samples,), dtype=np.int64).tolist()
-            return (keys[x] for x in choose)
-        choose = np.random.permutation(self.num_samples)
-        return (keys[x] for x in choose)
+            n = len(self.data_source)
+            indices = np.random.randint(0, high=n, size=(self.num_samples,), dtype=np.int64)
+        else:
+            indices = np.random.permutation(self.num_samples)
+        keys = list(self.data_source.keys())
+        return (keys[idx] for idx in indices.tolist())
 
-    def __len__(self):
+    def __len__(self) -> int:
         return self.num_samples
 
 
 class SubsetRandomSampler(Sampler):
-    r"""Samples elements randomly from a given list of indices, without replacement.
-    Arguments:
-        indices (sequence): a sequence of indices
+    """Samples elements randomly from a given list of sample names, without replacement.
+
+    Parameters
+    ----------
+    sample_names : ArraysetSampleNames
+        a sequence of sample names
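+
+    Examples
+    --------
+    Illustrative sketch with hypothetical sample names; actual order is random:
+
+    >>> sampler = SubsetRandomSampler(['a', 'b', 'c'])
+    >>> list(sampler)   # doctest: +SKIP
+    ['b', 'a', 'c']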
+    """
+
+    def __init__(self, sample_names: ArraysetSampleNames):
+        self.sample_names = sample_names
 
-    def __init__(self, indices):
-        self.indices = indices
+    def __iter__(self) -> Iterable[ArraysetSampleNames]:
+        indices = np.random.permutation(len(self.sample_names))
+        return (self.sample_names[idx] for idx in indices)
 
-    def __iter__(self):
-        choose = np.random.permutation(len(self.indices))
-        return (self.indices[x] for x in choose)
-
-    def __len__(self):
-        return len(self.indices)
+    def __len__(self) -> int:
+        return len(self.sample_names)
 
 
 class WeightedRandomSampler(Sampler):
-    r"""Samples elements from ``[0,..,len(weights)-1]`` with given probabilities (weights).
-    Args:
-        weights (sequence): a sequence of weights, not necessarily summing up to one
-        num_samples (int): number of samples to draw
-        replacement (bool): if ``True``, samples are drawn with replacement.
-            If not, they are drawn without replacement, which means that when a
-            sample index is drawn for a row, it cannot be drawn again for that row.
-    Example:
+    """Samples elements from ``[0,..,len(weights)-1]`` with given probabilities (weights).
+
+    Examples
+    --------
+
     >>> list(WeightedRandomSampler([0.1, 0.9, 0.4, 0.7, 3.0, 0.6], 5, replacement=True))
     [0, 0, 0, 1, 0]
     >>> list(WeightedRandomSampler([0.9, 0.4, 0.05, 0.2, 0.3, 0.1], 5, replacement=False))
     [0, 1, 4, 3, 2]
+
+    Parameters
+    ----------
+    weights : Sequence[float]
+        a sequence of weights, not necessarily summing up to one
+    num_samples : int
+        number of samples to draw
+    replacement : bool
+        if ``True``, samples are drawn with replacement. If not, they are
+        drawn without replacement, meaning that when a sample name is drawn
+        once in a row, it cannot be drawn again for that same row
+    group_names : List[np.ndarray], optional
+        If provided, iteration across the sampler will return the
+        corresponding arrayset group value rather than a generic positional
+        index identifying the selected probability/weight. If set,
+        ``len(group_names)`` must exactly equal ``len(weights)``. If not
+        specified, or set to ``None``, returns the positional index
+        corresponding to the weight selected, by default ``None``.
     """
 
-    def __init__(self, weights, num_samples):
-        if not isinstance(num_samples, int) or isinstance(num_samples, bool) or \
-                num_samples <= 0:
+    def __init__(self,
+                 weights: Sequence[float],
+                 num_samples: int,
+                 replacement: bool,
+                 group_names: List[np.ndarray] = None):
+
+        if not isinstance(num_samples, int) or isinstance(num_samples, bool) or num_samples <= 0:
             raise ValueError("num_samples should be a positive integer "
                              "value, but got num_samples={}".format(num_samples))
+        if not isinstance(replacement, bool):
+            raise ValueError("replacement should be a boolean value, but got "
+                             "replacement={}".format(replacement))
+        if group_names is not None:
+            if not isinstance(group_names, Sequence) or not all(
+                    (isinstance(item, np.ndarray) for item in group_names)) or (
+                    len(group_names) != len(weights)):
+                raise ValueError(f'if provided, group_names must be a list of `numpy.ndarray` '
+                                 f'with len(group_names) == len(weights), not {group_names}')
         self.weights = tuple(weights)
         self.num_samples = num_samples
+        self.replacement = replacement
+        self.group_names = group_names
 
-    def __iter__(self):
-        return iter(multinomial(self.num_samples, self.weights))
+    def __iter__(self) -> Iterable[int]:
+        indices = _multinomial(self.num_samples, self.weights, self.replacement)
+        if self.group_names:
+            indices = (self.group_names[idx] for idx in indices)
+        return iter(indices)
 
-    def __len__(self):
+    def __len__(self) -> int:
         return self.num_samples
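+
+
+# Illustrative sketch of the ``group_names`` option above (hypothetical group
+# values; comments only). When provided, iteration yields the selected group's
+# array value instead of its positional index:
+#
+#   >>> groups = [np.array([0]), np.array([1])]
+#   >>> sampler = WeightedRandomSampler([0.25, 0.75], 3, True, group_names=groups)
+#   >>> list(sampler)   # doctest: +SKIP
+#   [array([1]), array([1]), array([0])]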
 
 
 class BatchSampler(Sampler):
-    r"""Wraps another sampler to yield a mini-batch of indices.
-    Args:
-        sampler (Sampler): Base sampler.
-        batch_size (int): Size of mini-batch.
-        drop_last (bool): If ``True``, the sampler will drop the last batch if
-            its size would be less than ``batch_size``
-    Example:
+    """Wraps another sampler to yield a mini-batch of sample names.
+
+    Parameters
+    ----------
+    sampler : Sampler
+        sampler instance inheriting from :class:`.Sampler`
+    batch_size : int
+        size of the mini-batch
+    drop_last : bool
+        If ``True``, the sampler will drop the last batch if its size would
+        be less than ``batch_size``.
+
+    Examples
+    --------
+
     >>> list(BatchSampler(SequentialSampler(range(10)), batch_size=3, drop_last=False))
     [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
     >>> list(BatchSampler(SequentialSampler(range(10)), batch_size=3, drop_last=True))
     [[0, 1, 2], [3, 4, 5], [6, 7, 8]]
     """
 
-    def __init__(self, sampler, batch_size, drop_last):
+    def __init__(self, sampler: Sampler, batch_size: int, drop_last: bool):
         if not isinstance(sampler, Sampler):
-            raise ValueError("sampler should be an instance of "
-                             "torch.utils.data.Sampler, but got sampler={}"
-                             .format(sampler))
+            raise ValueError(
+                f"sampler must be a hangar.dataloaders.sampler.Sampler instance. Not {sampler}")
         if not isinstance(batch_size, int) or isinstance(batch_size, bool) or \
                 batch_size <= 0:
             raise ValueError("batch_size should be a positive integer value, "
                              "but got batch_size={}".format(batch_size))
         if not isinstance(drop_last, bool):
             raise ValueError("drop_last should be a boolean value, but got "
                              "drop_last={}".format(drop_last))
         self.sampler = sampler
         self.batch_size = batch_size
         self.drop_last = drop_last
 
-    def __iter__(self):
+    def __iter__(self) -> Iterable[List[ArraysetSampleNames]]:
         batch = []
-        for idx in self.sampler:
-            batch.append(idx)
+        for sample_name in self.sampler:
+            batch.append(sample_name)
             if len(batch) == self.batch_size:
                 yield batch
                 batch = []
         if len(batch) > 0 and not self.drop_last:
             yield batch
 
-    def __len__(self):
+    def __len__(self) -> int:
         if self.drop_last:
             return len(self.sampler) // self.batch_size
         else:
             return (len(self.sampler) + self.batch_size - 1) // self.batch_size

From e63339fe9b3c170dfa820fddb1fb44b76fe43f21 Mon Sep 17 00:00:00 2001
From: Rick Izzo
Date: Tue, 26 Nov 2019 16:16:19 -0500
Subject: [PATCH 3/3] updates

---
 src/hangar/dataloaders/grouper.py | 23 ++++-------
 src/hangar/dataloaders/sampler.py | 57 +++++++++++++++++--------------
 2 files changed, 38 insertions(+), 42 deletions(-)

diff --git a/src/hangar/dataloaders/grouper.py b/src/hangar/dataloaders/grouper.py
index 0ba618ac..e6c16241 100644
--- a/src/hangar/dataloaders/grouper.py
+++ b/src/hangar/dataloaders/grouper.py
@@ -1,11 +1,10 @@
 import numpy as np
 
 from ..arrayset import ArraysetDataReader
+from ..records.hashmachine import array_hash_digest
 
 from collections import defaultdict
-import hashlib
-from typing import Sequence, Union, Iterable, NamedTuple
-import struct
+from typing import Sequence, Union, Iterable, NamedTuple, Tuple
 
 
 # -------------------------- typehints ---------------------------------------
@@ -21,13 +20,6 @@
 # ------------------------------------------------------------------------------
 
 
-def _calculate_hash_digest(data: np.ndarray) -> str:
-    hasher = hashlib.blake2b(data, digest_size=20)
-    hasher.update(struct.pack(f'<{len(data.shape)}QB', *data.shape, data.dtype.num))
-    digest = hasher.hexdigest()
-    return digest
-
-
 class FakeNumpyKeyDict(object):
     def __init__(self, group_spec_samples, group_spec_value, group_digest_spec):
         self._group_spec_samples = group_spec_samples
@@ -35,7 +27,7 @@ def __init__(self, group_spec_samples, group_spec_value, group_digest_spec):
         self._group_digest_spec = group_digest_spec
 
     def __getitem__(self, key: np.ndarray) -> ArraysetSampleNames:
-        digest = _calculate_hash_digest(key)
+        digest = array_hash_digest(key)
         spec = self._group_digest_spec[digest]
         samples = self._group_spec_samples[spec]
         return samples
@@ -53,7 +45,7 @@ def __len__(self) -> int:
         return len(self._group_digest_spec)
 
     def __contains__(self, key: np.ndarray) -> bool:
-        digest = _calculate_hash_digest(key)
+        digest = array_hash_digest(key)
         res = True if digest in self._group_digest_spec else False
         return res
 
@@ -69,7 +61,7 @@ def values(self) -> Iterable[ArraysetSampleNames]:
         for spec in self._group_digest_spec.values():
             yield self._group_spec_samples[spec]
 
-    def items(self) -> Iterable[ArraysetSampleNames]:
+    def items(self) -> Iterable[Tuple[np.ndarray, ArraysetSampleNames]]:
         for spec in self._group_digest_spec.values():
             yield (self._group_spec_value[spec], self._group_spec_samples[spec])
 
@@ -81,11 +73,10 @@ def __repr__(self):
     def _repr_pretty_(self, p, cycle):
         res = f'Mapping: Group Data Value -> Sample Name \n'
         for k, v in self.items():
-            res += f'\n {k} :: {v}'
+            res += f'\n {k} :: {v} \n'
         p.text(res)
-
 
 # ---------------------------- MAIN METHOD ------------------------------------
 
@@ -112,7 +103,7 @@ def _setup(self):
         for spec, names in self._group_spec_samples.items():
             data = self.__arrayset._fs[spec.backend].read_data(spec)
             self._group_spec_value[spec] = data
-            digest = _calculate_hash_digest(data)
+            digest = array_hash_digest(data)
             self._group_digest_spec[digest] = spec
 
     @property
diff --git a/src/hangar/dataloaders/sampler.py b/src/hangar/dataloaders/sampler.py
index b152ab0e..38797ed2 100644
--- a/src/hangar/dataloaders/sampler.py
+++ b/src/hangar/dataloaders/sampler.py
@@ -1,6 +1,7 @@
 from typing import Sequence, Union, List, Iterable
 
 import numpy as np
+import numpy.random
 
 from ..arrayset import ArraysetDataReader
 
@@ -93,32 +94,36 @@ def __init__(self, data_source):
     def __iter__(self):
         raise NotImplementedError
 
-    # NOTE [ Lack of Default `__len__` in Python Abstract Base Classes ]
-    #
-    # Many times we have an abstract class representing a collection/iterable of
-    # data, e.g., `torch.utils.data.Sampler`, with its subclasses optionally
-    # implementing a `__len__` method. In such cases, we must make sure to not
-    # provide a default implementation, because both straightforward default
-    # implementations have their issues:
-    #
-    #   + `return NotImplemented`:
-    #     Calling `len(subclass_instance)` raises:
-    #       TypeError: 'NotImplementedType' object cannot be interpreted as an integer
-    #
-    #   + `raise NotImplementedError()`:
-    #     This prevents triggering some fallback behavior. E.g., the built-in
-    #     `list(X)` tries to call `len(X)` first, and executes a different code
-    #     path if the method is not found or `NotImplemented` is returned, while
-    #     raising a `NotImplementedError` will propagate and make the call
-    #     fail where it could have used `__iter__` to complete the call.
-    #
-    # Thus, the only two sensible things to do are
-    #
-    #   + **not** provide a default `__len__`.
-    #
-    #   + raise a `TypeError` instead, which is what Python uses when users call
-    #     a method that is not defined on an object.
-    #     (@ssnl verifies that this works on at least Python 3.7.)
+    def __len__(self):
+        """
+        # NOTE [ Lack of Default `__len__` in Python Abstract Base Classes ]
+        #
+        # Many times we have an abstract class representing a collection/iterable of
+        # data, e.g., `torch.utils.data.Sampler`, with its subclasses optionally
+        # implementing a `__len__` method. In such cases, we must make sure to not
+        # provide a default implementation, because both straightforward default
+        # implementations have their issues:
+        #
+        #   + `return NotImplemented`:
+        #     Calling `len(subclass_instance)` raises:
+        #       TypeError: 'NotImplementedType' object cannot be interpreted as an integer
+        #
+        #   + `raise NotImplementedError()`:
+        #     This prevents triggering some fallback behavior. E.g., the built-in
+        #     `list(X)` tries to call `len(X)` first, and executes a different code
+        #     path if the method is not found or `NotImplemented` is returned, while
+        #     raising a `NotImplementedError` will propagate and make the call
+        #     fail where it could have used `__iter__` to complete the call.
+        #
+        # Thus, the only two sensible things to do are
+        #
+        #   + **not** provide a default `__len__`.
+        #
+        #   + raise a `TypeError` instead, which is what Python uses when users call
+        #     a method that is not defined on an object.
+        #     (@ssnl verifies that this works on at least Python 3.7.)
+        """
+        raise TypeError
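+
+
+# Illustrative sketch of the fallback described in the NOTE above (hypothetical
+# subclass; comments only). `list()` probes `len()` first, and the `TypeError`
+# raised by the base `__len__` lets it fall back to `__iter__` instead of
+# failing outright:
+#
+#   >>> class TwoNames(Sampler):
+#   ...     def __iter__(self):
+#   ...         return iter(['a', 'b'])
+#   >>> list(TwoNames(None))
+#   ['a', 'b']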
 
 
 class SequentialSampler(Sampler):