
Commit 92d3b4a

Authored by denkle, github-actions[bot], and mikeheddes
Add Adult and Abalone datasets (#96)
* Create the first attempt to integrate datasets from "Do we need 100s.."
* [github-action] formatting fixes
* Faster download and extraction
* [github-action] formatting fixes
* Move dataset to Google Drive and add download progress bar
* [github-action] formatting fixes
* Add tqdm dependency
* Revisiting logic of assigning data w.r.t. variables
* [github-action] formatting fixes
* Fix Google Drive download link extraction
* Rework classes to streamline inclusion of new datasets from the collections
* [github-action] formatting fixes
* Revised some logic of classes, resolving the merge conflict
* [github-action] formatting fixes
* Delete collection_datasets.py
* Removed .DS_Store
* Delete __init__.py
* Refactor datasets
* [github-action] formatting fixes
* Update workflow Python version
* Refactor data loading
* Update docs

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: mikeheddes <[email protected]>
Parent: 1fd1344 · Commit: 92d3b4a

File tree: 9 files changed (+472, -9 lines)


.github/workflows/test.yml

Lines changed: 3 additions & 3 deletions
@@ -11,12 +11,12 @@ permissions:
 jobs:
   test:
     name: Test with Python ${{ matrix.python-version }} on ${{ matrix.os }}
-    runs-on: ubuntu-latest
+    runs-on: ${{ matrix.os }}
     timeout-minutes: 10
     strategy:
       matrix:
-        python-version: ['3.6', '3.8', '3.10']
-        os: [ubuntu-latest, windows-latest, macOS-latest]
+        python-version: ['3.8', '3.9', '3.10']
+        os: [ubuntu-latest, windows-latest, macos-latest]

     steps:
       - uses: actions/checkout@v3

dev-requirements.txt

Lines changed: 2 additions & 1 deletion
@@ -5,4 +5,5 @@ requests
 numpy
 flake8
 pytest
-black
+black
+tqdm

docs/datasets.rst

Lines changed: 14 additions & 0 deletions
@@ -19,3 +19,17 @@ The Torchhd library provides many popular built-in datasets to work with.
     EMGHandGestures
     PAMAP
     CyclePowerPlant
+    Abalone
+    Adult
+
+
+Base classes
+------------------------
+
+.. autosummary::
+    :toctree: generated/
+    :template: class_dataset.rst
+
+    CollectionDataset
+    DatasetFourFold
+    DatasetTrainTest
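The newly documented base classes are what streamline adding further datasets from this collection: as the Abalone and Adult files below show, a concrete dataset only declares its ``name`` and ``classes``. The following is a minimal sketch of how another four-fold collection dataset might be added, assuming the same ``DatasetFourFold`` interface; the dataset name and labels here are hypothetical and not part of this commit.

```python
from typing import List

from torchhd.datasets import DatasetFourFold


class CarEvaluation(DatasetFourFold):
    """Hypothetical collection dataset, added by subclassing DatasetFourFold.

    Downloading, fold selection, and tensor conversion are inherited from the
    base class; the subclass only declares which dataset to load and its labels.
    """

    # Directory/key of this dataset inside the downloaded collection (assumed).
    name = "car_evaluation"

    # Class labels in the order used by the collection's label encoding (assumed).
    classes: List[str] = [
        "unacc",
        "acc",
        "good",
        "vgood",
    ]
```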

setup.py

Lines changed: 1 addition & 0 deletions
@@ -23,6 +23,7 @@
         "pandas",
         "numpy",
         "requests",
+        "tqdm",
     ],
     packages=find_packages(exclude=["docs", "torchhd.tests", "examples"]),
     python_requires=">=3.6, <4",
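The new ``tqdm`` dependency backs the download progress bar mentioned in the commit message. Below is a minimal sketch of how a streamed ``requests`` download can be wrapped with a ``tqdm`` bar; the helper name and chunk size are illustrative assumptions, not Torchhd's actual implementation.

```python
import requests
from tqdm import tqdm


def download_with_progress(url: str, destination: str, chunk_size: int = 32 * 1024) -> None:
    """Stream a file to disk while showing a tqdm progress bar (illustrative sketch)."""
    response = requests.get(url, stream=True)
    response.raise_for_status()

    # Some servers omit Content-Length; tqdm then falls back to an open-ended bar.
    total = int(response.headers.get("content-length", 0))

    with open(destination, "wb") as file, tqdm(
        total=total, unit="B", unit_scale=True, desc=destination
    ) as progress:
        for chunk in response.iter_content(chunk_size=chunk_size):
            file.write(chunk)
            progress.update(len(chunk))
```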

torchhd/datasets/__init__.py

Lines changed: 11 additions & 0 deletions
@@ -6,6 +6,12 @@
 from torchhd.datasets.emg_hand_gestures import EMGHandGestures
 from torchhd.datasets.pamap import PAMAP
 from torchhd.datasets.ccpp import CyclePowerPlant
+from torchhd.datasets.dataset import CollectionDataset
+from torchhd.datasets.dataset import DatasetFourFold
+from torchhd.datasets.dataset import DatasetTrainTest
+from torchhd.datasets.abalone import Abalone
+from torchhd.datasets.adult import Adult
+

 __all__ = [
     "BeijingAirQuality",
@@ -16,4 +22,9 @@
     "EMGHandGestures",
     "PAMAP",
     "CyclePowerPlant",
+    "CollectionDataset",
+    "DatasetFourFold",
+    "DatasetTrainTest",
+    "Abalone",
+    "Adult",
 ]

torchhd/datasets/abalone.py

Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
+from typing import List
+from torchhd.datasets import DatasetFourFold
+
+
+class Abalone(DatasetFourFold):
+    """`Abalone <https://archive.ics.uci.edu/ml/datasets/abalone>`_ dataset.
+
+    Args:
+        root (string): Root directory containing the files of the dataset.
+        train (bool, optional): If True, returns the training (sub)set from the file storing training data, as further determined by the ``fold`` and ``hyper_search`` arguments.
+            Otherwise, if hyperparameter search is performed (``hyper_search = True``), returns a subset of the training dataset; if not (``hyper_search = False``), returns the subset of the training dataset
+            specified in ``conxuntos_kfold.dat``, provided the fold number is valid. Otherwise an error is raised.
+        fold (int, optional): Specifies which fold number to use. The default value of -1 returns all the training data from the corresponding file.
+            Values between 0 and 3 specify which fold in ``conxuntos_kfold.dat`` to use. Relevant only if ``hyper_search`` is set to False and ``0 <= fold <= 3``.
+            Indices in even rows (zero indexing) of ``conxuntos_kfold.dat`` correspond to train subsets, while indices in odd rows correspond to test subsets.
+        hyper_search (bool, optional): If True, creates the dataset using the indices in ``conxuntos.dat``. This split is used for hyperparameter search. The first row corresponds to train indices (used if ``train = True``),
+            while the second row corresponds to test indices (used if ``train = False``).
+        transform (callable, optional): A function/transform that takes in a torch.FloatTensor
+            and returns a transformed version.
+        target_transform (callable, optional): A function/transform that takes in the
+            target and transforms it.
+        download (bool, optional): If True, downloads the dataset from the internet and
+            puts it in the root directory. If the dataset is already downloaded, it is not
+            downloaded again.
+    """
+
+    name = "abalone"
+    classes: List[str] = [
+        "0",
+        "1",
+        "2",
+    ]
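A short usage sketch for the new class, following the constructor arguments documented above; the root path and batch size are arbitrary, and the DataLoader usage assumes each item is a (features, target) pair as with the other Torchhd datasets.

```python
import torch.utils.data as data
from torchhd.datasets import Abalone

# Arbitrary root directory for this sketch; download=True fetches the files on first use.
train_ds = Abalone("data", train=True, download=True)
test_ds = Abalone("data", train=False)

# Optionally restrict the training data to one of the four folds in conxuntos_kfold.dat.
fold_0 = Abalone("data", train=True, fold=0)

loader = data.DataLoader(train_ds, batch_size=32, shuffle=True)
features, targets = next(iter(loader))  # assumed (FloatTensor, class index) batches
```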

torchhd/datasets/adult.py

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
+from typing import List
+from torchhd.datasets import DatasetTrainTest
+
+
+class Adult(DatasetTrainTest):
+    """`Adult <https://archive.ics.uci.edu/ml/datasets/adult>`_ dataset.
+
+    Args:
+        root (string): Root directory containing the files of the dataset.
+        train (bool, optional): If True, returns the training (sub)set from the file storing training data, as further determined by the ``hyper_search`` argument.
+            Otherwise, if hyperparameter search is performed (``hyper_search = True``), returns a subset of the training dataset; if not (``hyper_search = False``), returns the test set.
+        hyper_search (bool, optional): If True, creates the dataset using the indices in ``conxuntos.dat``. This split is used for hyperparameter search. The first row corresponds to train indices (used if ``train = True``),
+            while the second row corresponds to test indices (used if ``train = False``).
+        transform (callable, optional): A function/transform that takes in a torch.FloatTensor
+            and returns a transformed version.
+        target_transform (callable, optional): A function/transform that takes in the
+            target and transforms it.
+        download (bool, optional): If True, downloads the dataset from the internet and
+            puts it in the root directory. If the dataset is already downloaded, it is not
+            downloaded again.
+    """
+
+    name = "adult"
+    classes: List[str] = [
+        ">50K",
+        "<=50K",
+    ]
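A companion sketch for the train/test variant, including the hyperparameter-search split described in the docstring; the root path is arbitrary.

```python
from torchhd.datasets import Adult

# Regular split: the dataset's own train and test partitions.
train_ds = Adult("data", train=True, download=True)
test_ds = Adult("data", train=False)

# Hyperparameter-search split: both subsets come from conxuntos.dat,
# the first row providing "train" indices and the second row "test" indices.
search_train = Adult("data", train=True, hyper_search=True)
search_val = Adult("data", train=False, hyper_search=True)

print(len(train_ds), len(test_ds), Adult.classes)  # classes: [">50K", "<=50K"]
```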
