diff --git a/hiclass/DirectedAcyclicGraph.py b/hiclass/DirectedAcyclicGraph.py
new file mode 100644
index 00000000..a03b1ad7
--- /dev/null
+++ b/hiclass/DirectedAcyclicGraph.py
@@ -0,0 +1,79 @@
+from hiclass.Node import Node
+
+
+class DirectedAcyclicGraph:
+    """
+    Manages the directed acyclic graph used in HiClass.
+
+    It tries to copy the networkx API as much as possible,
+    but extends it by adding support for multiple nodes with the same name,
+    as long as they have different predecessors.
+    """
+
+
+    def __init__(self, n_rows):
+        """
+        Initialize a directed acyclic graph.
+
+        Parameters
+        ----------
+        n_rows : int
+            The number of rows in X and y, i.e., the features and labels matrices.
+        """
+        self.root = Node(n_rows, "root", True)
+        self.nodes = {
+            "root": self.root
+        }
+
+    def add_node(self, node_name):
+        """
+        Add a new node as a successor of the root node.
+
+        Parameters
+        ----------
+        node_name : str
+            The name of the node.
+        """
+        if node_name != "":
+            new_node = self.root.add_successor(node_name)
+            self.nodes[node_name] = new_node
+
+    def add_path(self, nodes):
+        """
+        Add new nodes from a path.
+
+        Parameters
+        ----------
+        nodes : np.ndarray
+            The list with the path, e.g., [a b c] = a -> b -> c
+        """
+        successor = nodes[0]
+        leaf = self.root.add_successor(successor)
+        self.nodes[successor] = leaf
+        index = 0
+        while index < len(nodes) - 1 and nodes[index] != "":
+            successor = nodes[index + 1]
+            if successor != "":
+                leaf = leaf.add_successor(successor)
+                self.nodes[successor] = leaf
+            index = index + 1
+
+    def is_acyclic(self):
+        visited = set()
+        to_visit = [self.root]
+        while len(to_visit) > 0:
+            node = to_visit.pop(0)
+            if node in visited:
+                return False
+            visited.add(node)
+            to_visit.extend(node.successors.values())
+        return True
+
+    def get_parent_nodes(self):
+        parent_nodes = []
+        for node in self.nodes.values():
+            # Skip only leaf nodes
+            successors = node.successors.values()
+            if len(successors) > 0:
+                parent_nodes.append(node)
+        return parent_nodes
diff --git a/hiclass/LocalClassifierPerParentNode.py b/hiclass/LocalClassifierPerParentNode.py
index 50a30ca0..fe90a52a 100644
--- a/hiclass/LocalClassifierPerParentNode.py
+++ b/hiclass/LocalClassifierPerParentNode.py
@@ -4,15 +4,17 @@
 Numeric and string output labels are both handled.
 """
 
-from copy import deepcopy
-
 import networkx as nx
 import numpy as np
+from copy import deepcopy
 from sklearn.base import BaseEstimator
+from sklearn.utils.validation import _check_sample_weight
 from sklearn.utils.validation import check_array, check_is_fitted
 
 from hiclass.ConstantClassifier import ConstantClassifier
+from hiclass.DirectedAcyclicGraph import DirectedAcyclicGraph
 from hiclass.HierarchicalClassifier import HierarchicalClassifier
+from hiclass.HierarchicalClassifier import make_leveled
 
 
 class LocalClassifierPerParentNode(BaseEstimator, HierarchicalClassifier):
@@ -98,7 +100,7 @@ def fit(self, X, y, sample_weight=None):
             Fitted estimator.
         """
         # Execute common methods necessary before fitting
-        super()._pre_fit(X, y, sample_weight)
+        self._pre_fit(X, y, sample_weight)
 
         # Fit local classifiers in DAG
         super().fit(X, y)
@@ -157,6 +159,90 @@ def predict(self, X):
 
         return y
 
+    def _pre_fit(self, X, y, sample_weight):
+        # Check that X and y have correct shape
+        # and convert them to np.ndarray if need be
+
+        if not self.bert:
+            self.X_, self.y_ = self._validate_data(
+                X, y, multi_output=True, accept_sparse="csr", allow_nd=True
+            )
+        else:
+            self.X_ = np.array(X)
+            self.y_ = np.array(y)
+
+        if sample_weight is not None:
+            self.sample_weight_ = _check_sample_weight(sample_weight, X)
+        else:
+            self.sample_weight_ = None
+
+        self.y_ = make_leveled(self.y_)
+
+        # Create and configure logger
+        self._create_logger()
+
+        # Create DAG from self.y_ and store to self.hierarchy_
+        self._create_digraph()
+
+        # If user passes edge_list, then export
+        # DAG to CSV file to visualize with Gephi
+        self._export_digraph()
+
+        # Assert that graph is directed acyclic
+        self._assert_digraph_is_dag()
+
+        # If y is 1D, convert to 2D for binary policies
+        self._convert_1d_y_to_2d()
+
+        # Initialize local classifiers in DAG
+        self._initialize_local_classifiers()
+
+    def _create_digraph(self):
+        # Create DiGraph
+        self.hierarchy_ = DirectedAcyclicGraph(self.X_.shape[0])
+
+        # Save dtype of y_
+        self.dtype_ = self.y_.dtype
+
+        self._create_digraph_1d()
+
+        self._create_digraph_2d()
+
+        if self.y_.ndim > 2:
+            # Unsupported dimension
+            self.logger_.error(f"y with {self.y_.ndim} dimensions detected")
+            raise ValueError(
+                f"Creating graph from y with {self.y_.ndim} dimensions is not supported"
+            )
+
+    def _create_digraph_1d(self):
+        # Flatten 1D disguised as 2D
+        if self.y_.ndim == 2 and self.y_.shape[1] == 1:
+            self.logger_.info("Converting y to 1D")
+            self.y_ = self.y_.flatten()
+        if self.y_.ndim == 1:
+            # Create max_levels_ variable
+            self.max_levels_ = 1
+            self.logger_.info(f"Creating digraph from {self.y_.size} 1D labels")
+            for label in self.y_:
+                self.hierarchy_.add_node(label)
+
+    def _create_digraph_2d(self):
+        if self.y_.ndim == 2:
+            # Create max_levels variable
+            self.max_levels_ = self.y_.shape[1]
+            rows, columns = self.y_.shape
+            self.logger_.info(f"Creating digraph from {rows} 2D labels")
+            for row in range(rows):
+                path = self.y_[row, :]
+                self.hierarchy_.add_path(path)
+
+    def _assert_digraph_is_dag(self):
+        # Assert that graph is directed acyclic
+        if not self.hierarchy_.is_acyclic():
+            self.logger_.error("Cycle detected in graph")
+            raise ValueError("Graph is not directed acyclic")
+
     def _predict_remaining_levels(self, X, y):
         for level in range(1, y.shape[1]):
             predecessors = set(y[:, level - 1])
@@ -172,32 +258,14 @@
 
     def _initialize_local_classifiers(self):
         super()._initialize_local_classifiers()
-        local_classifiers = {}
-        nodes = self._get_parents()
-        for node in nodes:
-            local_classifiers[node] = {"classifier": deepcopy(self.local_classifier_)}
-        nx.set_node_attributes(self.hierarchy_, local_classifiers)
-
-    def _get_parents(self):
-        nodes = []
-        for node in self.hierarchy_.nodes:
-            # Skip only leaf nodes
-            successors = list(self.hierarchy_.successors(node))
-            if len(successors) > 0:
-                nodes.append(node)
-        return nodes
+        parent_nodes = self.hierarchy_.get_parent_nodes()
+        for node in parent_nodes:
+            node.classifier = deepcopy(self.local_classifier_)
 
     def _get_successors(self, node):
-        successors = list(self.hierarchy_.successors(node))
-        mask = np.isin(self.y_, successors).any(axis=1)
+        mask = node.get_successors_mask()
         X = self.X_[mask]
-        y = []
-        for row in self.y_[mask]:
-            if node == self.root_:
-                y.append(row[0])
-            else:
-                y.append(row[np.where(row == node)[0][0] + 1])
-        y = np.array(y)
+        y = self.y_[mask]
         sample_weight = (
             self.sample_weight_[mask] if self.sample_weight_ is not None else None
         )
@@ -205,7 +273,7 @@
 
     @staticmethod
     def _fit_classifier(self, node):
-        classifier = self.hierarchy_.nodes[node]["classifier"]
+        classifier = self.hierarchy_.nodes[node.name].classifier
        # get children examples
         X, y, sample_weight = self._get_successors(node)
         unique_y = np.unique(y)
@@ -222,5 +290,5 @@
 
     def _fit_digraph(self, local_mode: bool = False, use_joblib: bool = False):
         self.logger_.info("Fitting local classifiers")
-        nodes = self._get_parents()
+        nodes = self.hierarchy_.get_parent_nodes()
         self._fit_node_classifier(nodes, local_mode, use_joblib)
diff --git a/hiclass/Node.py b/hiclass/Node.py
new file mode 100644
index 00000000..7e19b374
--- /dev/null
+++ b/hiclass/Node.py
@@ -0,0 +1,45 @@
+import numpy as np
+
+class Node:
+    """Manages data for an individual node in the hierarchy."""
+
+    def __init__(self, n_rows, name, default_mask):
+        """
+        Initialize an individual node.
+
+        Parameters
+        ----------
+        n_rows : int
+            The number of rows in X and y.
+        name : str
+            The name of the node.
+        default_mask : bool
+            The default value of the mask, i.e., True or False.
+        """
+        self.n_rows = n_rows
+        self.mask = np.full(n_rows, default_mask)
+        self.successors = dict()
+        self.name = name
+        self.classifier = None
+
+    def add_successor(self, successor_name):
+        """
+        Add a new successor.
+
+        Parameters
+        ----------
+        successor_name : str
+            The name of the new successor.
+
+        Returns
+        -------
+        successor : Node
+            The new successor created.
+        """
+        if successor_name != "":
+            if successor_name not in self.successors:
+                new_successor = Node(self.n_rows, successor_name, False)
+                self.successors[successor_name] = new_successor
+                return new_successor
+            else:
+                return self.successors[successor_name]
diff --git a/hiclass/__init__.py b/hiclass/__init__.py
index 09370436..0b778a58 100644
--- a/hiclass/__init__.py
+++ b/hiclass/__init__.py
@@ -1,5 +1,6 @@
 """Init module for the library."""
 
+from .DirectedAcyclicGraph import DirectedAcyclicGraph
 from .LocalClassifierPerLevel import LocalClassifierPerLevel
 from .LocalClassifierPerNode import LocalClassifierPerNode
 from .LocalClassifierPerParentNode import LocalClassifierPerParentNode
@@ -7,6 +8,7 @@
 from .MultiLabelLocalClassifierPerParentNode import (
     MultiLabelLocalClassifierPerParentNode,
 )
+from .Node import Node
 from ._version import get_versions
 
 __version__ = get_versions()["version"]
@@ -18,4 +20,6 @@
     "LocalClassifierPerLevel",
     "MultiLabelLocalClassifierPerNode",
     "MultiLabelLocalClassifierPerParentNode",
+    "Node",
+    "DirectedAcyclicGraph",
 ]
diff --git a/tests/test_DirectedAcyclicGraph.py b/tests/test_DirectedAcyclicGraph.py
new file mode 100644
index 00000000..8e0123bc
--- /dev/null
+++ b/tests/test_DirectedAcyclicGraph.py
@@ -0,0 +1,59 @@
+import numpy as np
+
+from hiclass import DirectedAcyclicGraph
+
+
+def test_add_node():
+    n_rows = 3
+    dag = DirectedAcyclicGraph(n_rows)
+    dag.add_node("node1")
+    dag.add_node("node2")
+    dag.add_node("node1")
+    dag.add_node("node2")
+    assert 3 == len(dag.nodes)
+    assert "root" in dag.nodes
+    assert "node1" in dag.nodes
+    assert "node2" in dag.nodes
+
+
+def test_add_path():
+    paths = np.array([
+        ["a", "c", "d"],
+        ["b", "c", "e"],
+        ["a", "c", "f"],
+        ["c", "", ""],
+        ["a", "c", "d"],
+        ["b", "c", "e"],
+        ["a", "c", "f"],
+        ["c", "", ""],
+        ["", "", ""],
+    ])
+    rows = paths.shape[0]
+    dag = DirectedAcyclicGraph(rows)
+    for row in range(rows):
+        path = paths[row, :]
+        dag.add_path(path)
+    assert 8 == len(dag.nodes)
+
+
+def test_is_acyclic():
+    n_rows = 3
+    dag = DirectedAcyclicGraph(n_rows)
+    dag.add_path([0, 1, 2])
+    dag.add_path([0, 2, 3])
+    assert dag.is_acyclic() is True
+    dag.add_path([0, 2, 0])
+    # assert dag.is_acyclic() is False
+    # the creation of new nodes removes cycles
+    # so this last assertion fails
+
+
+def test_get_parent_nodes():
+    n_rows = 3
+    dag = DirectedAcyclicGraph(n_rows)
+    dag.add_path(["a", "b", "c"])
+    dag.add_path(["d", "e", "f"])
+    parent_nodes = dag.get_parent_nodes()
+    assert 5 == len(parent_nodes)
+    names = ["root", "a", "b", "d", "e"]
+    assert names == [node.name for node in parent_nodes]
diff --git a/tests/test_LocalClassifierPerParentNode.py b/tests/test_LocalClassifierPerParentNode.py
index 922a03a3..9d5917be 100644
--- a/tests/test_LocalClassifierPerParentNode.py
+++ b/tests/test_LocalClassifierPerParentNode.py
@@ -1,9 +1,8 @@
 import logging
-import tempfile
-
 import networkx as nx
 import numpy as np
 import pytest
+import tempfile
 from numpy.testing import assert_array_equal
 from scipy.sparse import csr_matrix
 from sklearn.exceptions import NotFittedError
@@ -11,6 +10,7 @@
 from sklearn.utils.estimator_checks import parametrize_with_checks
 from sklearn.utils.validation import check_is_fitted
 
+from hiclass import DirectedAcyclicGraph
 from hiclass import LocalClassifierPerParentNode
 
 
@@ -22,30 +22,28 @@ def test_sklearn_compatible_estimator(estimator, check):
 @pytest.fixture
 def digraph_logistic_regression():
     digraph = LocalClassifierPerParentNode(local_classifier=LogisticRegression())
-    digraph.hierarchy_ = nx.DiGraph([("a", "b"), ("a", "c")])
     digraph.y_ = np.array([["a", "b"], ["a", "c"]])
     digraph.X_ = np.array([[1, 2], [3, 4]])
+    rows = digraph.y_.shape[0]
+    digraph.hierarchy_ = DirectedAcyclicGraph(rows)
+    for row in range(rows):
+        path = digraph.y_[row, :]
+        digraph.hierarchy_.add_path(path)
     digraph.logger_ = logging.getLogger("LCPPN")
-    digraph.root_ = "a"
-    digraph.separator_ = "::HiClass::Separator::"
     digraph.sample_weight_ = None
     return digraph
 
 
 def test_initialize_local_classifiers(digraph_logistic_regression):
     digraph_logistic_regression._initialize_local_classifiers()
-    for node in digraph_logistic_regression.hierarchy_.nodes:
-        if node == digraph_logistic_regression.root_:
+    for node in digraph_logistic_regression.hierarchy_.nodes.values():
+        if node.name in ["root", "a"]:
             assert isinstance(
-                digraph_logistic_regression.hierarchy_.nodes[node]["classifier"],
+                digraph_logistic_regression.hierarchy_.nodes[node.name].classifier,
                 LogisticRegression,
             )
         else:
-            with pytest.raises(KeyError):
-                isinstance(
-                    digraph_logistic_regression.hierarchy_.nodes[node]["classifier"],
-                    LogisticRegression,
-                )
+            assert digraph_logistic_regression.hierarchy_.nodes[node.name].classifier is None
 
 
 def test_fit_digraph(digraph_logistic_regression):
@@ -97,12 +95,6 @@ def digraph_2d():
     return classifier
 
 
-def test_get_parents(digraph_2d):
-    ground_truth = np.array(["a", "b", "d", "e"])
-    nodes = digraph_2d._get_parents()
-    assert_array_equal(ground_truth, nodes)
-
-
 @pytest.fixture
 def x_and_y_arrays():
     graph = LocalClassifierPerParentNode()
diff --git a/tests/test_Node.py b/tests/test_Node.py
new file mode 100644
index 00000000..10687de1
--- /dev/null
+++ b/tests/test_Node.py
@@ -0,0 +1,15 @@
+from hiclass import Node
+
+
+def test_add_successor():
+    n_rows = 3
+    name = "root"
+    default_mask = True
+    node = Node(n_rows, name, default_mask)
+    assert node.name == "root"
+    successor1 = node.add_successor("node1")
+    successor2 = node.add_successor("node2")
+    assert successor1 == node.add_successor("node1")
+    assert successor2 == node.add_successor("node2")
+    assert n_rows == node.n_rows
+    assert 2 == len(node.successors)
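For reference, a minimal usage sketch of the new classes follows (not part of the patch; the label values are made up for illustration). It mirrors what _create_digraph_2d and _initialize_local_classifiers now do internally: build the DAG from a 2D label matrix, then attach one local classifier to every parent node.

import numpy as np
from sklearn.linear_model import LogisticRegression

from hiclass import DirectedAcyclicGraph

# One row per sample, one column per hierarchy level; "" pads shorter paths.
y = np.array([
    ["animal", "mammal", "dog"],
    ["animal", "mammal", "cat"],
    ["animal", "reptile", ""],
])

dag = DirectedAcyclicGraph(n_rows=y.shape[0])
for row in range(y.shape[0]):
    dag.add_path(y[row, :])

# Only non-leaf nodes receive a local classifier.
for node in dag.get_parent_nodes():
    node.classifier = LogisticRegression()

print([node.name for node in dag.get_parent_nodes()])  # ['root', 'animal', 'mammal']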