# -*- coding: utf-8 -*-
"""
=====================
Parallel Training
=====================

Larger datasets require more time for training.
By default, the models in HiClass are trained using a single core;
however, it is possible to train each local classifier in parallel by leveraging the Ray library [1]_.
In this example, we demonstrate how to train a hierarchical classifier in parallel,
using all available cores, on a mock dataset from Kaggle [2]_.

.. [1] https://www.ray.io/
.. [2] https://www.kaggle.com/datasets/kashnitsky/hierarchical-text-classification
"""
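# Note: parallel training relies on Ray, which is an optional dependency and
# is not imported directly in this script. If Ray is not already available in
# your environment, installing it with pip is assumed to be sufficient for
# this example (the exact package extras may vary):
#
#     pip install ray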
import sys
from os import cpu_count

import pandas as pd
import requests
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

from hiclass import LocalClassifierPerParentNode


def download(url: str, path: str) -> None:
    """
    Download a file from the internet.

    Parameters
    ----------
    url : str
        The address of the file to be downloaded.
    path : str
        The path to store the downloaded file.
    """
    response = requests.get(url)
    with open(path, "wb") as file:
        file.write(response.content)


# Download training data
training_data_url = "https://zenodo.org/record/6657410/files/train_40k.csv?download=1"
training_data_path = "train_40k.csv"
download(training_data_url, training_data_path)

# Load training data into pandas dataframe
training_data = pd.read_csv(training_data_path).fillna(" ")

# We will use logistic regression classifiers for every parent node
lr = LogisticRegression(max_iter=1000)

pipeline = Pipeline(
    [
        ("count", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        (
            "lcppn",
            LocalClassifierPerParentNode(local_classifier=lr, n_jobs=cpu_count()),
        ),
    ]
)
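# Note: n_jobs controls how many local classifiers are trained in parallel;
# HiClass leverages Ray to distribute this work across workers, and with the
# default setting training runs on a single core, as mentioned above.
# Here, cpu_count() requests one worker per available CPU core.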

# Select training data
X_train = training_data["Title"]
Y_train = training_data[["Cat1", "Cat2", "Cat3"]]

# Workaround for "AttributeError: '_LoggingTee' object has no attribute 'fileno'",
# which only happens when building the documentation,
# so you don't actually need it for your own code to work
sys.stdout.fileno = lambda: False

# Now, let's train the local classifier per parent node
pipeline.fit(X_train, Y_train)
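# As a quick usage sketch (not part of the original example), the fitted
# pipeline can also produce hierarchical predictions through the standard
# scikit-learn predict API; each row contains the predicted labels for
# Cat1, Cat2 and Cat3.
predictions = pipeline.predict(X_train)
print(predictions[:5])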