diff --git a/ragas/pyproject.toml b/ragas/pyproject.toml index d93f76ab3..c8fe5e3eb 100644 --- a/ragas/pyproject.toml +++ b/ragas/pyproject.toml @@ -64,6 +64,7 @@ dev = [ "haystack-ai", "sacrebleu", "r2r", + "scipy", "pytest", "pytest-xdist[psutil]", "pytest-asyncio", diff --git a/ragas/src/ragas/testset/transforms/relationship_builders/cosine.py b/ragas/src/ragas/testset/transforms/relationship_builders/cosine.py index 0492ca1ed..834b179f0 100644 --- a/ragas/src/ragas/testset/transforms/relationship_builders/cosine.py +++ b/ragas/src/ragas/testset/transforms/relationship_builders/cosine.py @@ -12,24 +12,40 @@ class CosineSimilarityBuilder(RelationshipBuilder): property_name: str = "embedding" new_property_name: str = "cosine_similarity" threshold: float = 0.9 + block_size: int = 1024 + + def _block_cosine_similarity(self, i: np.ndarray, j: np.ndarray): + """Calculate cosine similarity matrix between two sets of embeddings.""" + i_norm = i / np.linalg.norm(i, axis=1, keepdims=True) + j_norm = j / np.linalg.norm(j, axis=1, keepdims=True) + return np.dot(i_norm, j_norm.T) + + async def _find_similar_embedding_pairs( + self, embeddings: np.ndarray, threshold: float, block_size: int = 1024 + ) -> t.Set[t.Tuple[int, int, float]]: + """Sharded computation of cosine similarity to find similar pairs.""" + + def process_block(i: int, j: int) -> t.Set[t.Tuple[int, int, float]]: + end_i = min(i + block_size, n_embeddings) + end_j = min(j + block_size, n_embeddings) + block = self._block_cosine_similarity( + embeddings[i:end_i, :], embeddings[j:end_j, :] + ) + similar_idx = np.argwhere(block >= threshold) + return { + (int(i + ii), int(j + jj), float(block[ii, jj])) + for ii, jj in similar_idx + if int(i + ii) < int(j + jj) + } - def _find_similar_embedding_pairs( - self, embeddings: np.ndarray, threshold: float - ) -> t.List[t.Tuple[int, int, float]]: - # Normalize the embeddings - normalized = embeddings / np.linalg.norm(embeddings, axis=1)[:, np.newaxis] + n_embeddings, 
async def transform(self, kg: KnowledgeGraph) -> t.List[Relationship]:
    """Build cosine-similarity relationships between the nodes of ``kg``.

    Reads ``self.property_name`` from every node, hands the collected
    embeddings to ``_validate_embedding_shapes`` and then to
    ``_find_similar_embedding_pairs`` (using ``self.threshold`` and
    ``self.block_size``), and wraps every returned pair in a bidirectional
    ``Relationship`` whose type and property key are both
    ``self.new_property_name``.

    Args:
        kg: Knowledge graph whose nodes carry the embedding property.

    Returns:
        One relationship per similar pair, ordered by
        ``(source index, target index)``.

    Raises:
        ValueError: If a node has no value for ``self.property_name``.
    """
    embeddings = []
    for node in kg.nodes:
        embedding = node.get_property(self.property_name)
        if embedding is None:
            raise ValueError(f"Node {node.id} has no {self.property_name}")
        embeddings.append(embedding)
    self._validate_embedding_shapes(embeddings)

    similar_pairs = await self._find_similar_embedding_pairs(
        np.array(embeddings), self.threshold, self.block_size
    )
    # The pair finder returns a set, whose iteration order is arbitrary.
    # Sorting the (i, j, similarity) triplets restores the index order the
    # list-based implementation used to produce.
    return [
        Relationship(
            source=kg.nodes[i],
            target=kg.nodes[j],
            type=self.new_property_name,
            properties={self.new_property_name: similarity_float},
            bidirectional=True,
        )
        for i, j, similarity_float in sorted(similar_pairs)
    ]
+ """ + filtered_kg = self.filter(kg) + + embeddings = [] + for node in filtered_kg.nodes: + embedding = node.get_property(self.property_name) + if embedding is None: + raise ValueError(f"Node {node.id} has no {self.property_name}") + embeddings.append(embedding) + self._validate_embedding_shapes(embeddings) + + async def find_and_add_relationships(): + similar_pairs = await self._find_similar_embedding_pairs( + np.array(embeddings), self.threshold, self.block_size + ) + for i, j, similarity_float in similar_pairs: + rel = Relationship( + source=filtered_kg.nodes[i], + target=filtered_kg.nodes[j], + type=self.new_property_name, + properties={self.new_property_name: similarity_float}, + bidirectional=True, + ) + kg.relationships.append(rel) + + return [find_and_add_relationships()] + @dataclass class SummaryCosineSimilarityBuilder(CosineSimilarityBuilder): property_name: str = "summary_embedding" new_property_name: str = "summary_cosine_similarity" threshold: float = 0.1 + block_size: int = 1024 - def filter(self, kg: KnowledgeGraph) -> KnowledgeGraph: + def _document_summary_filter(self, kg: KnowledgeGraph) -> KnowledgeGraph: """ Filters the knowledge graph to only include nodes with a summary embedding. 
""" @@ -90,22 +132,22 @@ def filter(self, kg: KnowledgeGraph) -> KnowledgeGraph: return KnowledgeGraph(nodes=nodes) async def transform(self, kg: KnowledgeGraph) -> t.List[Relationship]: + filtered_kg = self._document_summary_filter(kg) embeddings = [ node.get_property(self.property_name) - for node in kg.nodes + for node in filtered_kg.nodes if node.get_property(self.property_name) is not None ] if not embeddings: raise ValueError(f"No nodes have a valid {self.property_name}") - self._validate_embedding_shapes(embeddings) - similar_pairs = self._find_similar_embedding_pairs( - np.array(embeddings), self.threshold + similar_pairs = await self._find_similar_embedding_pairs( + np.array(embeddings), self.threshold, self.block_size ) return [ Relationship( - source=kg.nodes[i], - target=kg.nodes[j], - type="summary_cosine_similarity", + source=filtered_kg.nodes[i], + target=filtered_kg.nodes[j], + type=self.new_property_name, properties={self.new_property_name: similarity_float}, bidirectional=True, ) diff --git a/ragas/src/ragas/testset/transforms/relationship_builders/traditional.py b/ragas/src/ragas/testset/transforms/relationship_builders/traditional.py index ad33ea42f..5b1a7d6f8 100644 --- a/ragas/src/ragas/testset/transforms/relationship_builders/traditional.py +++ b/ragas/src/ragas/testset/transforms/relationship_builders/traditional.py @@ -1,3 +1,4 @@ +import itertools import typing as t from collections import Counter from dataclasses import dataclass @@ -19,39 +20,62 @@ def _jaccard_similarity(self, set1: t.Set[str], set2: t.Set[str]) -> float: union = len(set1.union(set2)) return intersection / union if union > 0 else 0.0 - async def transform(self, kg: KnowledgeGraph) -> t.List[Relationship]: - if self.property_name is None: - self.property_name - - similar_pairs = [] - for i, node1 in enumerate(kg.nodes): - for j, node2 in enumerate(kg.nodes): - if i >= j: - continue - items1 = node1.get_property(self.property_name) - items2 = 
async def _find_similar_embedding_pairs(
    self, kg: KnowledgeGraph
) -> t.Set[t.Tuple[int, int, float]]:
    """Find node index pairs whose Jaccard similarity meets the threshold.

    The method name mirrors the cosine builder; no embeddings are involved.
    Each node contributes the collection stored under ``self.property_name``
    (or, when ``self.key_name`` is set, the value under that key inside it,
    defaulting to an empty list).

    Args:
        kg: Knowledge graph whose nodes are compared pairwise.

    Returns:
        A set of ``(i, j, similarity)`` tuples with ``i < j`` and
        ``similarity >= self.threshold``.

    Raises:
        ValueError: If either node of a compared pair has no value for
            ``self.property_name``.
    """
    # Per-node caches: each property lookup and set construction happens once
    # per node instead of once per pair. Values are filled lazily so errors
    # still surface at the first pair that touches the offending node.
    raw_items: t.Dict[int, t.Any] = {}
    item_sets: t.Dict[int, t.Set[t.Any]] = {}

    def _raw(index: int, node: t.Any) -> t.Any:
        if index not in raw_items:
            raw_items[index] = node.get_property(self.property_name)
        return raw_items[index]

    def _as_set(index: int) -> t.Set[t.Any]:
        if index not in item_sets:
            items = raw_items[index]
            if self.key_name is not None:
                items = items.get(self.key_name, [])
            item_sets[index] = set(items)
        return item_sets[index]

    similar_pairs: t.Set[t.Tuple[int, int, float]] = set()
    # combinations() over enumerate() yields every unordered pair once, i < j.
    for (i, node1), (j, node2) in itertools.combinations(enumerate(kg.nodes), 2):
        items1 = _raw(i, node1)
        items2 = _raw(j, node2)
        if items1 is None or items2 is None:
            raise ValueError(
                f"Node {node1.id} or {node2.id} has no {self.property_name}"
            )
        # NOTE(review): the cached sets are shared across calls; this assumes
        # _jaccard_similarity does not mutate its arguments - confirm.
        similarity = self._jaccard_similarity(_as_set(i), _as_set(j))
        if similarity >= self.threshold:
            similar_pairs.add((i, j, similarity))
    return similar_pairs
def generate_test_vectors(
    n: int = 16,
    d: int = 32,
    min_similarity: float = 0.5,
    similar_fraction: float = 0.3,
    seed: Optional[int] = None,
) -> np.ndarray:
    """
    Generate `n` unit vectors of dimension `d` in shuffled order.

    A group of "similar" vectors is built around one random base vector: the
    base itself plus vectors rotated away from it by 90% of
    `arccos(min_similarity)`, so each has cosine similarity of at least
    `min_similarity` with the base. Similarity between two non-base members
    of the group is not guaranteed. The remaining vectors are independent
    random unit vectors.

    Parameters:
    - n (int): Total number of vectors to generate.
    - d (int): Dimensionality of each vector.
    - min_similarity (float): Minimum cosine similarity to the base vector.
    - similar_fraction (float): Fraction (0-1) of vectors in the similar
      group; the group has at least two members whenever `n` allows it.
    - seed (int): Optional random seed for reproducibility.

    Returns:
    - np.ndarray: Array of shape (n, d) of unit vectors.

    Raises:
    - ValueError: If `n` is negative, `d` is too small, or `min_similarity`
      is outside [-1, 1].
    """
    if n < 0:
        raise ValueError("n must be non-negative.")
    if d < 1:
        raise ValueError("d must be at least 1.")
    if not (-1.0 <= min_similarity <= 1.0):
        raise ValueError("min_similarity must be between -1 and 1.")

    # A private generator keeps the helper reproducible without reseeding the
    # process-wide NumPy / stdlib random state used by other tests.
    rng = np.random.default_rng(seed)

    # Never build more "similar" vectors than were requested in total.
    num_similar = min(n, max(2, int(n * similar_fraction)))
    num_random = n - num_similar
    if num_similar > 1 and d < 2:
        raise ValueError("d must be at least 2 to build distinct similar vectors.")
    if n == 0:
        return np.empty((0, d))

    def random_unit_vector() -> np.ndarray:
        v = rng.standard_normal(d)
        return v / np.linalg.norm(v)

    # Step 1: Create a base vector
    base = random_unit_vector()

    # Step 2: Generate similar vectors, rotated by slightly less than the
    # limiting angle so they stay inside the similarity cone.
    angle = 0.9 * np.arccos(min_similarity)
    similar_vectors = [base]
    for _ in range(num_similar - 1):
        perturbation = rng.standard_normal(d)
        perturbation -= perturbation.dot(base) * base  # make orthogonal
        perturbation /= np.linalg.norm(perturbation)

        similar_vec = np.cos(angle) * base + np.sin(angle) * perturbation
        similar_vectors.append(similar_vec / np.linalg.norm(similar_vec))

    # Step 3: Generate additional random unit vectors
    random_vectors = [random_unit_vector() for _ in range(num_random)]

    # Step 4: Combine and shuffle the rows
    vectors = np.stack(similar_vectors + random_vectors)
    return vectors[rng.permutation(n)]
def vector_cosine_similarity(a, b):
    """Return the cosine similarity between two 1-D vectors.

    If either vector has zero norm the similarity is undefined; ``0.0`` is
    returned instead of dividing by zero and producing ``NaN``.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator
# Test for the internal _find_similar_embedding_pairs method
@pytest.mark.parametrize(
    "n_test_embeddings, threshold, block_size",
    [
        (16, 0.5, 16),
        (16, 0.7, 16),
        (16, 0.9, 16),
        (16, 0.7, 32),  # block size > n_test_embeddings (single block)
        (16, 0.7, 37),  # block size > n_test_embeddings, not a multiple of it
        (32, 0.7, 16),  # block size 1/2 n_test_embeddings
        (37, 0.7, 4),  # block size doesn't shard evenly
    ],
)
def test__find_similar_embedding_pairs(n_test_embeddings, threshold, block_size):
    """Validate _find_similar_embedding_pairs against the scipy-based reference.

    Every pair returned by the builder must be ordered, meet the threshold,
    be present in the reference result, and carry a matching similarity.
    """

    embeddings = generate_test_vectors(
        n=n_test_embeddings,
        d=64,
        min_similarity=min(threshold + 0.025, 1.0),
        similar_fraction=0.3,
    )
    expected = cosine_similarity_pair(embeddings, threshold)
    # Index the reference by pair so each lookup is O(1) and a pair missing
    # from the reference is detected instead of silently skipped.
    expected_by_pair = {(x, y): similarity for x, y, similarity in expected}

    builder = CosineSimilarityBuilder(property_name="embedding", threshold=threshold)
    result = asyncio.run(
        builder._find_similar_embedding_pairs(
            embeddings, threshold=threshold, block_size=block_size
        )
    )

    assert len(result) == len(expected)

    for i, j, similarity_float in result:
        assert i < j, "Pairs should be ordered (i < j)"
        assert (
            similarity_float >= threshold
        ), f"Similarity {similarity_float} should be >= {threshold}"
        assert (i, j) in expected_by_pair, f"Unexpected pair ({i}, {j})"
        assert similarity_float == pytest.approx(
            expected_by_pair[(i, j)]
        ), "Cosine similarity does not match expected value"
properties={"embedding": [1, 0, 0]}) + node3 = Node(type=NodeType.CHUNK, properties={"embedding": [1, 0, 0]}) + kg = KnowledgeGraph(nodes=[node1, node2, node3]) + builder = CosineSimilarityBuilder(property_name="embedding", threshold=0.9) + relationships = await builder.transform(kg) + assert len(relationships) == 3 + + @pytest.mark.asyncio + async def test_malformed_embedding_raises(self): + node1 = Node(type=NodeType.CHUNK, properties={"embedding": [1, 0, 0]}) + node2 = Node(type=NodeType.CHUNK, properties={"embedding": ["a", 0, 0]}) + kg = KnowledgeGraph(nodes=[node1, node2]) + builder = CosineSimilarityBuilder(property_name="embedding", threshold=0.5) + with pytest.raises(Exception): + await builder.transform(kg) + + @pytest.mark.asyncio + async def test_cosine_similarity_builder_empty_graph(self): + kg = KnowledgeGraph(nodes=[]) + builder = CosineSimilarityBuilder(property_name="embedding") + with pytest.raises(ValueError): + await builder.transform(kg) + + @pytest.mark.asyncio + async def test_cosine_similarity_builder_basic(self, simple_kg): + # Act + builder = CosineSimilarityBuilder(property_name="embedding", threshold=0.5) + relationships = await builder.transform(simple_kg) + # Assert + assert all(isinstance(r, Relationship) for r in relationships) + assert all(r.type == "cosine_similarity" for r in relationships) + # 2 <-> 3 (~0.6520 similarity) + assert any( + str(r.source.id) == "f353e5c2-e432-4d1e-84a8-d750c93d4edf" + and str(r.target.id) == "437c8c08-cef6-4ebf-a35f-93d6168b61a4" + for r in relationships + ) + # 1 <-> 3 (~0.8258 similarity) + assert any( + str(r.source.id) == "4da47a69-539c-49a2-b289-01780989d82c" + and str(r.target.id) == "437c8c08-cef6-4ebf-a35f-93d6168b61a4" + for r in relationships + ) + + @pytest.mark.asyncio + async def test_cosine_similarity_builder_no_embeddings(self): + kg = KnowledgeGraph( + nodes=[ + Node(type=NodeType.DOCUMENT, properties={}), + Node(type=NodeType.DOCUMENT, properties={}), + ] + ) + builder = 
CosineSimilarityBuilder(property_name="embedding") + with pytest.raises(ValueError, match="has no embedding"): + await builder.transform(kg) + + @pytest.mark.asyncio + async def test_cosine_similarity_builder_shape_validation(self): + kg = KnowledgeGraph( + nodes=[ + Node(type=NodeType.DOCUMENT, properties={"embedding": [1.0, 0.0]}), + Node( + type=NodeType.DOCUMENT, + properties={"embedding": [0.0, 1.0, 2.0]}, + ), + ] + ) + builder = CosineSimilarityBuilder(property_name="embedding") + with pytest.raises( + ValueError, match="Embedding at index 1 has length 3, expected 2" + ): + await builder.transform(kg) + + @pytest.mark.asyncio + async def test_apply_transforms_cosine_similarity_builder(self, simple_kg): + from ragas.run_config import RunConfig + from ragas.testset.transforms.engine import apply_transforms + + # CosineSimilarityBuilder should add relationships to the graph + builder = CosineSimilarityBuilder(property_name="embedding", threshold=0.5) + kg = simple_kg + # Should mutate kg in-place + apply_transforms(kg, builder, run_config=RunConfig(max_workers=2)) + # Check that relationships were added + assert any( + r.type == "cosine_similarity" for r in kg.relationships + ), "No cosine_similarity relationships found after apply_transforms" + # Check that expected relationship exists + assert any( + str(r.source.id) == "f353e5c2-e432-4d1e-84a8-d750c93d4edf" + and str(r.target.id) == "437c8c08-cef6-4ebf-a35f-93d6168b61a4" + for r in kg.relationships + ) + # 1 <-> 3 (~0.8258 similarity) + assert any( + str(r.source.id) == "4da47a69-539c-49a2-b289-01780989d82c" + and str(r.target.id) == "437c8c08-cef6-4ebf-a35f-93d6168b61a4" + for r in kg.relationships + ) + + +class TestSummaryCosineSimilarityBuilder: + @pytest.mark.asyncio + async def test_summary_cosine_similarity_builder_basic(self, simple_kg): + builder = SummaryCosineSimilarityBuilder( + property_name="summary_embedding", threshold=0.5 + ) + relationships = await builder.transform(simple_kg) + assert 
all(isinstance(r, Relationship) for r in relationships) + assert all(r.type == "summary_cosine_similarity" for r in relationships) + assert any( + str(r.source.id) == "f353e5c2-e432-4d1e-84a8-d750c93d4edf" + and str(r.target.id) == "437c8c08-cef6-4ebf-a35f-93d6168b61a4" + for r in relationships + ) + assert any( + str(r.source.id) == "4da47a69-539c-49a2-b289-01780989d82c" + and str(r.target.id) == "437c8c08-cef6-4ebf-a35f-93d6168b61a4" + for r in relationships + ) + + @pytest.mark.asyncio + async def test_summary_cosine_similarity_only_document_nodes(self): + node1 = Node( + type=NodeType.DOCUMENT, properties={"summary_embedding": [1, 0, 0]} + ) + node2 = Node(type=NodeType.CHUNK, properties={"summary_embedding": [1, 0, 0]}) + kg = KnowledgeGraph(nodes=[node1, node2]) + builder = SummaryCosineSimilarityBuilder( + property_name="summary_embedding", threshold=0.5 + ) + relationships = await builder.transform(kg) + assert len(relationships) == 0 + + @pytest.mark.asyncio + async def test_summary_cosine_similarity_builder_filter_and_error(self): + kg = KnowledgeGraph(nodes=[Node(type=NodeType.DOCUMENT, properties={})]) + builder = SummaryCosineSimilarityBuilder(property_name="summary_embedding") + with pytest.raises(ValueError, match="has no summary_embedding"): + await builder.transform(kg) + + +@pytest.mark.asyncio +async def test_apply_transforms_summary_cosine_similarity_builder(simple_kg): + from ragas.run_config import RunConfig + from ragas.testset.transforms.engine import apply_transforms + + builder = SummaryCosineSimilarityBuilder( + property_name="summary_embedding", threshold=0.5 + ) + kg = simple_kg + apply_transforms(kg, builder, run_config=RunConfig(max_workers=2)) + assert any( + r.type == "summary_cosine_similarity" for r in kg.relationships + ), "No summary_cosine_similarity relationships found after apply_transforms" + assert any( + str(r.source.id) == "f353e5c2-e432-4d1e-84a8-d750c93d4edf" + and str(r.target.id) == 
def generate_test_sets(
    n: int = 16,
    max_len: int = 32,
    min_similarity: float = 0.5,
    similar_fraction: float = 0.3,
) -> List[Set[str]]:
    """
    Generate `n` sets of at most `max_len` random entities, where at least
    `similar_fraction` of all possible pairs have Jaccard similarity >=
    `min_similarity`. The result is shuffled.

    The similar pairs come from a clique of sets that share a common core and
    each add a few private entities; the remaining sets are drawn from
    disjoint entities.

    Parameters:
    - n (int): Total number of sets to generate.
    - max_len (int): Maximum length of each set.
    - min_similarity (float): Minimum Jaccard similarity for similar pairs.
    - similar_fraction (float): Fraction (0-1) of all pairs that should be
      similar.

    Returns:
    - list: List of generated sets.

    Raises:
    - ValueError: If an argument is out of range or the constraints cannot
      be satisfied.
    - AssertionError: If the generated sets contain too few similar pairs.
    """

    if not (0 < min_similarity <= 1):
        raise ValueError("min_similarity must be between 0 and 1.")
    if not (0 <= similar_fraction <= 1):
        raise ValueError("similar_fraction must be between 0 and 1.")

    def generate_entity(k: int = 5) -> str:
        """Generate a random entity of length k."""
        return "".join(random.choices(string.ascii_lowercase, k=k))

    def jaccard(a: Set[str], b: Set[str]) -> float:
        """Jaccard similarity; 0.0 when both sets are empty."""
        union = a | b
        return len(a & b) / len(union) if union else 0.0

    def draw(pool: Set[str], count: int) -> Set[str]:
        """Remove `count` randomly chosen entities from `pool` and return them."""
        # Sorting gives random.sample a hash-seed-independent population, so
        # the output is reproducible after random.seed(...).
        picked = set(random.sample(sorted(pool), count))
        pool.difference_update(picked)
        return picked

    total_pairs = n * (n - 1) // 2
    if total_pairs == 0:
        return [set() for _ in range(n)]

    target_similar_pairs = math.ceil(total_pairs * similar_fraction)

    if target_similar_pairs == 0:
        # Generate n random, dissimilar sets
        pool = {generate_entity() for _ in range(n * max_len)}
        sets = []
        for _ in range(n):
            length = random.randint(0, max_len)
            sets.append(draw(pool, min(length, len(pool))))
        random.shuffle(sets)
        return sets

    # Calculate the size of a clique of similar sets needed:
    # n_clique * (n_clique - 1) / 2 >= target_similar_pairs
    n_clique = math.ceil((1 + math.sqrt(1 + 8 * target_similar_pairs)) / 2)
    n_clique = min(n, n_clique)
    n_dissimilar = n - n_clique

    # Two clique members with core size c and u private entities each have
    # similarity c / (c + 2u). Requiring that to be >= min_similarity while
    # c + u <= max_len gives c <= 2 * max_len * s / (1 + s).
    core_size = math.floor((2 * max_len * min_similarity) / (1 + min_similarity))
    if core_size <= 0:
        raise ValueError(
            "Cannot generate sets with these constraints. "
            "Try increasing max_len or decreasing min_similarity."
        )

    if min_similarity == 1.0:
        max_additional_elements = 0
    else:
        # This is the max number of elements that can be non-core across TWO sets
        max_additional_elements = math.floor(core_size * (1 / min_similarity - 1))

    # Max private elements per set to guarantee similarity, also respecting
    # max_len.
    max_unique_for_set = min(max_additional_elements // 2, max_len - core_size)

    core = {generate_entity() for _ in range(core_size)}

    # A large pool of entities to draw from
    pool_size = (n * max_len) * 2  # just to be safe
    pool = {generate_entity() for _ in range(pool_size)} - core

    similar_sets = []
    for _ in range(n_clique):
        s = core.copy()
        if max_unique_for_set > 0:
            num_unique = random.randint(0, max_unique_for_set)
            # Replenish pool if needed
            while len(pool) < num_unique:
                pool.update({generate_entity() for _ in range(num_unique * 2)} - core)
            s.update(draw(pool, num_unique))
        similar_sets.append(s)

    # --- Generate the dissimilar sets ---
    dissimilar_sets = []
    for _ in range(n_dissimilar):
        length = min(random.randint(0, max_len), len(pool))
        dissimilar_sets.append(draw(pool, length))

    sets = similar_sets + dissimilar_sets
    random.shuffle(sets)

    # --- Verify the result ---
    actual_similar_pairs = sum(
        1
        for i in range(n)
        for j in range(i + 1, n)
        if jaccard(sets[i], sets[j]) >= min_similarity
    )

    # Raised explicitly so the check survives `python -O`.
    if actual_similar_pairs < target_similar_pairs:
        raise AssertionError(
            f"Failed to generate the required number of similar pairs. "
            f"Target: {target_similar_pairs}, Actual: {actual_similar_pairs}"
        )

    return sets
@pytest.fixture
def simple_kg():
    """Return a three-node knowledge graph whose nodes carry ``entities`` sets."""
    # Arrange: create a simple knowledge graph with entity sets.
    # Expected pairwise Jaccard similarities (nodes numbered in list order):
    # 1 <-> 2 (0.0 similarity)
    # 2 <-> 3 (0.1667 similarity)
    # 1 <-> 3 (0.25 similarity)
    nodes = [
        Node(
            id=UUID("4da47a69-539c-49a2-b289-01780989d82c"),
            type=NodeType.DOCUMENT,
            properties={
                "entities": {"cat", "dog", "fish", "fox", "bird"},
            },
        ),
        Node(
            id=UUID("f353e5c2-e432-4d1e-84a8-d750c93d4edf"),
            type=NodeType.DOCUMENT,
            properties={
                "entities": {"apple", "banana"},
            },
        ),
        Node(
            id=UUID("437c8c08-cef6-4ebf-a35f-93d6168b61a4"),
            type=NodeType.DOCUMENT,
            properties={
                "entities": {"cat", "banana", "dog", "rock", "tree"},
            },
        ),
    ]
    return KnowledgeGraph(nodes=nodes)


# node order
# UUID("4da47a69-539c-49a2-b289-01780989d82c")
# UUID("f353e5c2-e432-4d1e-84a8-d750c93d4edf")
# UUID("437c8c08-cef6-4ebf-a35f-93d6168b61a4")


@pytest.mark.parametrize(
    "n_test_sets, max_len, threshold",
    [
        (8, 100, 0.2),
        (16, 8, 0.1),
        (16, 16, 0.5),
        (32, 5, 0.3),
    ],
)
def test__find_similar_embedding_pairs_jaccard(n_test_sets, max_len, threshold):
    """
    Validate that _find_similar_embedding_pairs returns exactly the pairs found
    by the brute-force reference ``jaccard_similarity_pair``.
    """
    sets = generate_test_sets(
        n=n_test_sets,
        max_len=max_len,
        min_similarity=min(threshold + 0.05, 1.0),
        similar_fraction=0.3,
    )
    expected = jaccard_similarity_pair(sets, threshold)
    # Index the reference by pair so every returned pair can be looked up
    # directly and a pair missing from the reference fails the test.
    expected_by_pair = {(x, y): similarity for x, y, similarity in expected}

    kg = KnowledgeGraph(
        nodes=[Node(type=NodeType.DOCUMENT, properties={"entities": s}) for s in sets]
    )
    builder = JaccardSimilarityBuilder(property_name="entities", threshold=threshold)
    result = list(asyncio.run(builder._find_similar_embedding_pairs(kg)))

    assert len(result) == len(expected)
    # Equal length + unique pairs + every pair present in the reference
    # together imply the two pair sets are identical.
    assert len({(i, j) for i, j, _ in result}) == len(
        result
    ), "Duplicate pairs returned"
    for i, j, similarity_float in result:
        assert i < j, "Pairs should be ordered (i < j)"
        assert (
            similarity_float >= threshold
        ), f"Similarity {similarity_float} should be >= {threshold}"
        assert (i, j) in expected_by_pair, f"Unexpected pair ({i}, {j})"
        assert similarity_float == pytest.approx(expected_by_pair[(i, j)])
class TestJaccardSimilarityBuilder:
    """Tests for ``JaccardSimilarityBuilder.transform`` and its use through ``apply_transforms``."""

    @pytest.mark.asyncio
    async def test_no_self_similarity_relationships(self, simple_kg):
        """No relationship may have the same node as source and target."""
        graph = copy.deepcopy(simple_kg)
        builder = JaccardSimilarityBuilder(property_name="entities", threshold=0.1)
        relationships = await builder.transform(graph)
        assert all(
            rel.source.id != rel.target.id for rel in relationships
        ), "Self-relationships should not be created"

    @pytest.mark.asyncio
    async def test_no_duplicate_relationships(self, simple_kg):
        """Each unordered node pair appears at most once."""
        graph = copy.deepcopy(simple_kg)
        builder = JaccardSimilarityBuilder(property_name="entities", threshold=0.1)
        relationships = await builder.transform(graph)
        # Sort the ids so (a, b) and (b, a) collapse to the same key.
        pairs = [
            tuple(sorted([rel.source.id, rel.target.id])) for rel in relationships
        ]
        assert len(set(pairs)) == len(pairs), "Duplicate relationships found"

    @pytest.mark.asyncio
    async def test_similarity_at_threshold(self):
        """Two identical sets with threshold 1.0 yield one relationship."""
        nodes = [
            Node(type=NodeType.DOCUMENT, properties={"entities": {"a", "b", "c"}})
            for _ in range(2)
        ]
        builder = JaccardSimilarityBuilder(property_name="entities", threshold=1.0)
        relationships = await builder.transform(KnowledgeGraph(nodes=nodes))
        assert len(relationships) == 1, "Should create relationship at threshold"

    @pytest.mark.asyncio
    async def test_all_below_threshold(self):
        """Disjoint sets produce no relationships."""
        nodes = [
            Node(type=NodeType.DOCUMENT, properties={"entities": entities})
            for entities in ({"a", "b", "c"}, {"x", "y", "z"})
        ]
        builder = JaccardSimilarityBuilder(property_name="entities", threshold=0.1)
        relationships = await builder.transform(KnowledgeGraph(nodes=nodes))
        assert (
            len(relationships) == 0
        ), "No relationships should be created below threshold"

    @pytest.mark.asyncio
    async def test_all_above_threshold(self):
        """Three identical sets produce one relationship per pair (3 in total)."""
        nodes = [
            Node(type=NodeType.DOCUMENT, properties={"entities": {"a", "b", "c"}})
            for _ in range(3)
        ]
        builder = JaccardSimilarityBuilder(property_name="entities", threshold=0.9)
        relationships = await builder.transform(KnowledgeGraph(nodes=nodes))
        assert len(relationships) == 3

    @pytest.mark.asyncio
    async def test_malformed_entities_raises(self):
        """A node whose ``entities`` property is None makes transform raise ValueError."""
        nodes = [
            Node(type=NodeType.DOCUMENT, properties={"entities": {"a", "b", "c"}}),
            Node(type=NodeType.DOCUMENT, properties={"entities": None}),
        ]
        builder = JaccardSimilarityBuilder(property_name="entities", threshold=0.5)
        with pytest.raises(ValueError):
            await builder.transform(KnowledgeGraph(nodes=nodes))

    @pytest.mark.asyncio
    async def test_jaccard_similarity_builder_empty_graph(self):
        """An empty graph yields an empty relationship list."""
        builder = JaccardSimilarityBuilder(property_name="entities")
        relationships = await builder.transform(KnowledgeGraph(nodes=[]))
        assert relationships == []

    @pytest.mark.asyncio
    async def test_jaccard_similarity_builder_basic(self, simple_kg):
        """With threshold 0.15 the fixture yields the 2->3 and 1->3 relationships."""
        first = "4da47a69-539c-49a2-b289-01780989d82c"
        second = "f353e5c2-e432-4d1e-84a8-d750c93d4edf"
        third = "437c8c08-cef6-4ebf-a35f-93d6168b61a4"

        builder = JaccardSimilarityBuilder(property_name="entities", threshold=0.15)
        relationships = await builder.transform(simple_kg)

        for rel in relationships:
            assert isinstance(rel, Relationship)
        for rel in relationships:
            assert rel.type == "jaccard_similarity"

        edges = {(str(rel.source.id), str(rel.target.id)) for rel in relationships}
        # 2 <-> 3 (~0.1667 similarity)
        assert (second, third) in edges
        # 1 <-> 3 (~0.25 similarity)
        assert (first, third) in edges

    @pytest.mark.asyncio
    async def test_jaccard_similarity_builder_no_entities(self):
        """Nodes without the ``entities`` property make transform raise ValueError."""
        nodes = [Node(type=NodeType.DOCUMENT, properties={}) for _ in range(2)]
        builder = JaccardSimilarityBuilder(property_name="entities")
        with pytest.raises(ValueError, match="has no entities"):
            await builder.transform(KnowledgeGraph(nodes=nodes))

    @pytest.mark.asyncio
    async def test_apply_transforms_cosine_similarity_builder(self, simple_kg):
        """Running the Jaccard builder through ``apply_transforms`` adds relationships to the graph."""
        from ragas.run_config import RunConfig
        from ragas.testset.transforms.engine import apply_transforms

        first = "4da47a69-539c-49a2-b289-01780989d82c"
        second = "f353e5c2-e432-4d1e-84a8-d750c93d4edf"
        third = "437c8c08-cef6-4ebf-a35f-93d6168b61a4"

        # The fixture graph itself is passed in; the assertions below read
        # the relationships back from that same object.
        kg = simple_kg
        builder = JaccardSimilarityBuilder(property_name="entities", threshold=0.15)
        apply_transforms(kg, builder, run_config=RunConfig(max_workers=2))

        relationship_types = {rel.type for rel in kg.relationships}
        assert (
            "jaccard_similarity" in relationship_types
        ), "No jaccard_similarity relationships found after apply_transforms"

        edges = {(str(rel.source.id), str(rel.target.id)) for rel in kg.relationships}
        # 2 <-> 3 (~0.1667 similarity)
        assert (second, third) in edges
        # 1 <-> 3 (~0.25 similarity)
        assert (first, third) in edges