From ff63f8eb7ac4b3660f454f494eb0394410dbc24b Mon Sep 17 00:00:00 2001 From: "Michael Ludvig (CyberCX)" Date: Tue, 10 Jun 2025 15:23:03 +1200 Subject: [PATCH] Optimize find_indirect_clusters() --- ragas/src/ragas/testset/graph.py | 82 ++++++++++++++++---------------- 1 file changed, 40 insertions(+), 42 deletions(-) diff --git a/ragas/src/ragas/testset/graph.py b/ragas/src/ragas/testset/graph.py index 4694bbcfd..336b05469 100644 --- a/ragas/src/ragas/testset/graph.py +++ b/ragas/src/ragas/testset/graph.py @@ -256,61 +256,59 @@ def find_indirect_clusters( depth_limit: int = 3, ) -> t.List[t.Set[Node]]: """ - Finds indirect clusters of nodes in the knowledge graph based on a relationship condition. - Here if A -> B -> C -> D, then A, B, C, and D form a cluster. If there's also a path A -> B -> C -> E, - it will form a separate cluster. + Finds clusters (connected components) of nodes in the knowledge graph reachable within + a limited number of hops, according to a relationship condition. + + A cluster is defined as a set of nodes such that each node in the cluster can be reached + from any other node in the same cluster by traversing up to `depth_limit` relationships + (edges), following only relationships that satisfy the given `relationship_condition`. + + For example, if A → B → C → D exists (and the relationships match the condition), + then {A, B, C, D} will be grouped in the same cluster if all are within `depth_limit` + of each other. Nodes that are not connected within this limit will be in separate clusters. Parameters ---------- relationship_condition : Callable[[Relationship], bool], optional - A function that takes a Relationship and returns a boolean, by default lambda _: True + A function that takes a Relationship and returns a boolean indicating if it should be + considered during clustering. By default, all relationships are considered. + + depth_limit : int, optional + The maximum number of hops to use for clustering. Default is 3. Returns ------- List[Set[Node]] - A list of sets, where each set contains nodes that form a cluster. + A list of clusters: each cluster is a set of nodes, and no node appears in more than one cluster. """ clusters = [] - visited_paths = set() - - relationships = [ - rel for rel in self.relationships if relationship_condition(rel) - ] + assigned = set() + relationships = [rel for rel in self.relationships if relationship_condition(rel)] - def dfs(node: Node, cluster: t.Set[Node], depth: int, path: t.Tuple[Node, ...]): - if depth >= depth_limit or path in visited_paths: - return - visited_paths.add(path) - cluster.add(node) - - for rel in relationships: - neighbor = None - if rel.source == node and rel.target not in cluster: - neighbor = rel.target - elif ( - rel.bidirectional - and rel.target == node - and rel.source not in cluster - ): - neighbor = rel.source - - if neighbor is not None: - dfs(neighbor, cluster.copy(), depth + 1, path + (neighbor,)) - - # Add completed path-based cluster - if len(cluster) > 1: - clusters.append(cluster) + # Build adjacency list + adjacency = {node: set() for node in self.nodes} + for rel in relationships: + adjacency[rel.source].add(rel.target) + if rel.bidirectional: + adjacency[rel.target].add(rel.source) for node in self.nodes: - initial_cluster = set() - dfs(node, initial_cluster, 0, (node,)) - - # Remove duplicates by converting clusters to frozensets - unique_clusters = [ - set(cluster) for cluster in set(frozenset(c) for c in clusters) - ] - - return unique_clusters + if node not in assigned: + # BFS for all nodes within depth_limit + cluster = set([node]) + q = [(node, 0)] + while q: + curr, depth = q.pop(0) + if depth == depth_limit: + continue + for neighbor in adjacency.get(curr, []): + if neighbor not in cluster: + cluster.add(neighbor) + q.append((neighbor, depth + 1)) + if len(cluster) > 1: + clusters.append(cluster) + assigned.update(cluster) + return clusters def remove_node( self, node: Node, inplace: bool = True