From ce7ab5adcdab205404eea79342587bc86beecbb1 Mon Sep 17 00:00:00 2001 From: ahgraber Date: Thu, 12 Jun 2025 15:17:42 -0400 Subject: [PATCH 01/11] feat: use blockwise calculation for cosine similarity to reduce memory requirements --- .../relationship_builders/cosine.py | 113 +++++-- .../unit/test_cosine_relationship_builders.py | 304 ++++++++++++++++++ 2 files changed, 386 insertions(+), 31 deletions(-) create mode 100644 ragas/tests/unit/test_cosine_relationship_builders.py diff --git a/ragas/src/ragas/testset/transforms/relationship_builders/cosine.py b/ragas/src/ragas/testset/transforms/relationship_builders/cosine.py index 0492ca1ed..3ea1b1479 100644 --- a/ragas/src/ragas/testset/transforms/relationship_builders/cosine.py +++ b/ragas/src/ragas/testset/transforms/relationship_builders/cosine.py @@ -12,24 +12,51 @@ class CosineSimilarityBuilder(RelationshipBuilder): property_name: str = "embedding" new_property_name: str = "cosine_similarity" threshold: float = 0.9 + block_size: int = 1024 - def _find_similar_embedding_pairs( - self, embeddings: np.ndarray, threshold: float - ) -> t.List[t.Tuple[int, int, float]]: - # Normalize the embeddings - normalized = embeddings / np.linalg.norm(embeddings, axis=1)[:, np.newaxis] + def _validate_embedding_shapes(self, embeddings: t.List[t.Any]): + if not embeddings: + raise ValueError(f"No nodes have a valid {self.property_name}") + first_len = len(embeddings[0]) + for idx, emb in enumerate(embeddings): + if len(emb) != first_len: + raise ValueError( + f"Embedding at index {idx} has length {len(emb)}, expected {first_len}. " + "All embeddings must have the same length." + ) - # Calculate cosine similarity matrix - similarity_matrix = np.dot(normalized, normalized.T) - # Find pairs with similarity >= threshold - similar_pairs = np.argwhere(similarity_matrix >= threshold) + def _block_cosine_similarity(self, i: np.ndarray, j: np.ndarray): + """Calculate cosine similarity matrix between two sets of embeddings.""" + i_norm = i / np.linalg.norm(i, axis=1, keepdims=True) + j_norm = j / np.linalg.norm(j, axis=1, keepdims=True) + return np.dot(i_norm, j_norm.T) + + async def _find_similar_embedding_pairs( + self, embeddings: np.ndarray, threshold: float, block_size: int = 1024 + ) -> t.Set[t.Tuple[int, int, float]]: + """Sharded computation of cosine similarity to find similar pairs.""" + + def process_block(i: int, j: int) -> t.Set[t.Tuple[int, int, float]]: + end_i = min(i + block_size, n_embeddings) + end_j = min(j + block_size, n_embeddings) + block = self._block_cosine_similarity( + embeddings[i:end_i, :], embeddings[j:end_j, :] + ) + similar_idx = np.argwhere(block >= threshold) + return { + (int(i + ii), int(j + jj), float(block[ii, jj])) + for ii, jj in similar_idx + if int(i + ii) < int(j + jj) + } - # Filter out self-comparisons and duplicate pairs - return [ - (pair[0], pair[1], similarity_matrix[pair[0], pair[1]]) - for pair in similar_pairs - if pair[0] < pair[1] - ] + n_embeddings, _dimension = embeddings.shape + triplets = set() + + for i in range(0, n_embeddings, block_size): + for j in range(i, n_embeddings, block_size): + triplets.update(process_block(i, j)) + + return triplets def _validate_embedding_shapes(self, embeddings: t.List[t.Any]): if not embeddings: @@ -43,40 +70,64 @@ def _validate_embedding_shapes(self, embeddings: t.List[t.Any]): ) async def transform(self, kg: KnowledgeGraph) -> t.List[Relationship]: - if self.property_name is None: - self.property_name = "embedding" - embeddings = [] for node in kg.nodes: embedding = 
node.get_property(self.property_name) if embedding is None: raise ValueError(f"Node {node.id} has no {self.property_name}") embeddings.append(embedding) - self._validate_embedding_shapes(embeddings) - similar_pairs = self._find_similar_embedding_pairs( - np.array(embeddings), self.threshold + similar_pairs = await self._find_similar_embedding_pairs( + np.array(embeddings), self.threshold, self.block_size ) - return [ Relationship( source=kg.nodes[i], target=kg.nodes[j], - type="cosine_similarity", + type=self.new_property_name, properties={self.new_property_name: similarity_float}, bidirectional=True, ) for i, j, similarity_float in similar_pairs ] + def generate_execution_plan(self, kg: KnowledgeGraph) -> t.List[t.Coroutine]: + """ + Generates a coroutine task for finding similar embedding pairs, which can be scheduled/executed by an Executor. + """ + embeddings = [] + for node in kg.nodes: + embedding = node.get_property(self.property_name) + if embedding is None: + raise ValueError(f"Node {node.id} has no {self.property_name}") + embeddings.append(embedding) + self._validate_embedding_shapes(embeddings) + + async def find_and_add_relationships(): + similar_pairs = await self._find_similar_embedding_pairs( + np.array(embeddings), self.threshold, self.block_size + ) + for i, j, similarity_float in similar_pairs: + rel = Relationship( + source=kg.nodes[i], + target=kg.nodes[j], + type=self.new_property_name, + properties={self.new_property_name: similarity_float}, + bidirectional=True, + ) + kg.relationships.append(rel) + + return [find_and_add_relationships()] + @dataclass class SummaryCosineSimilarityBuilder(CosineSimilarityBuilder): property_name: str = "summary_embedding" new_property_name: str = "summary_cosine_similarity" threshold: float = 0.1 + block_size: int = 1024 - def filter(self, kg: KnowledgeGraph) -> KnowledgeGraph: + def _document_summary_filter(self, kg: KnowledgeGraph) -> KnowledgeGraph: """ Filters the knowledge graph to only include nodes with a summary embedding. 
""" @@ -90,22 +141,22 @@ def filter(self, kg: KnowledgeGraph) -> KnowledgeGraph: return KnowledgeGraph(nodes=nodes) async def transform(self, kg: KnowledgeGraph) -> t.List[Relationship]: + filtered_kg = self._document_summary_filter(kg) embeddings = [ node.get_property(self.property_name) - for node in kg.nodes + for node in filtered_kg.nodes if node.get_property(self.property_name) is not None ] if not embeddings: raise ValueError(f"No nodes have a valid {self.property_name}") - self._validate_embedding_shapes(embeddings) - similar_pairs = self._find_similar_embedding_pairs( - np.array(embeddings), self.threshold + similar_pairs = await self._find_similar_embedding_pairs( + np.array(embeddings), self.threshold, self.block_size ) return [ Relationship( - source=kg.nodes[i], - target=kg.nodes[j], - type="summary_cosine_similarity", + source=filtered_kg.nodes[i], + target=filtered_kg.nodes[j], + type=self.new_property_name, properties={self.new_property_name: similarity_float}, bidirectional=True, ) diff --git a/ragas/tests/unit/test_cosine_relationship_builders.py b/ragas/tests/unit/test_cosine_relationship_builders.py new file mode 100644 index 000000000..5bf6d33a2 --- /dev/null +++ b/ragas/tests/unit/test_cosine_relationship_builders.py @@ -0,0 +1,304 @@ +import asyncio +import random +from uuid import UUID + +import numpy as np +import pytest + +from ragas.testset.graph import KnowledgeGraph, Node, NodeType, Relationship +from ragas.testset.transforms.relationship_builders.cosine import ( + CosineSimilarityBuilder, + SummaryCosineSimilarityBuilder, +) + + +def generate_test_vectors( + n=16, d=32, min_similarity=0.5, similar_fraction=0.3, seed=None +): + """ + Generate `n` unit vectors of dimension `d`, where at least `similar_fraction` of them + are similar to each other (cosine similarity > `min_similarity`), and the result is shuffled. + + Parameters: + - n (int): Total number of vectors to generate. + - d (int): Dimensionality of each vector. + - min_similarity (float): Minimum cosine similarity for similar pairs. + - similar_fraction (float): Fraction (0-1) of vectors that should be similar. + - seed (int): Optional random seed for reproducibility. + + Returns: + - np.ndarray: Array of shape (n, d) of unit vectors. 
+ """ + + if seed is not None: + np.random.seed(seed) + random.seed(seed) + + num_similar = max(2, int(n * similar_fraction)) # at least two similar vectors + num_random = n - num_similar + + # Step 1: Create a base vector + base = np.random.randn(d) + base /= np.linalg.norm(base) + + # Step 2: Generate similar vectors + similar_vectors = [base] + angle = np.arccos(min_similarity) + + for _ in range(num_similar - 1): + perturbation = np.random.randn(d) + perturbation -= perturbation.dot(base) * base # make orthogonal + perturbation /= np.linalg.norm(perturbation) + + similar_vec = np.cos(angle * 0.9) * base + np.sin(angle * 0.9) * perturbation + similar_vec /= np.linalg.norm(similar_vec) + similar_vectors.append(similar_vec) + + # Step 3: Generate additional random unit vectors + random_vectors = [] + for _ in range(num_random): + v = np.random.randn(d) + v /= np.linalg.norm(v) + random_vectors.append(v) + + # Step 4: Combine and shuffle + all_vectors = similar_vectors + random_vectors + random.shuffle(all_vectors) + + return np.stack(all_vectors) + + +def cosine_similarity(embeddings: np.ndarray): + from scipy.spatial.distance import cdist + + similarity = 1 - cdist(embeddings, embeddings, metric="cosine") + + # normalized = embeddings / np.linalg.norm(embeddings, axis=1)[:, np.newaxis] + # similarity = np.dot(normalized, normalized.T) + return similarity + + +def cosine_similarity_pair(embeddings: np.ndarray, threshold: float): + # Find pairs with similarity >= threshold + similarity_matrix = cosine_similarity(embeddings) + similar_pairs = np.argwhere(similarity_matrix >= threshold) + + # Filter out self-comparisons and duplicate pairs + return [ + (int(pair[0]), int(pair[1]), float(similarity_matrix[pair[0], pair[1]])) + for pair in similar_pairs + if pair[0] < pair[1] + ] + + +def vector_cosine_similarity(a, b): + return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)) + + +@pytest.fixture +def simple_kg(): + # Arrange: create a simple knowledge graph with embeddings + # roughly, we expect the following relationships: + # 1 <-> 2 (0.1928 similarity) + # 2 <-> 3 (0.6520 similarity) + # 1 <-> 3 (0.8258 similarity) + nodes = [ + Node( + id=UUID("4da47a69-539c-49a2-b289-01780989d82c"), + type=NodeType.DOCUMENT, + properties={ + "embedding": [0.2313, -0.362, 0.5875, -0.0526, -0.0954], + "summary_embedding": [0.2313, -0.362, 0.5875, -0.0526, -0.0954], + }, + ), + Node( + id=UUID("f353e5c2-e432-4d1e-84a8-d750c93d4edf"), + type=NodeType.DOCUMENT, + properties={ + "embedding": [0.9066, 0.786, 0.6925, 0.8022, 0.5297], + "summary_embedding": [0.9066, 0.786, 0.6925, 0.8022, 0.5297], + }, + ), + Node( + id=UUID("437c8c08-cef6-4ebf-a35f-93d6168b61a4"), + type=NodeType.DOCUMENT, + properties={ + "embedding": [0.5555, -0.1074, 0.8454, 0.3499, -0.1669], + "summary_embedding": [0.5555, -0.1074, 0.8454, 0.3499, -0.1669], + }, + ), + ] + return KnowledgeGraph(nodes=nodes) + + +# node order +# UUID("4da47a69-539c-49a2-b289-01780989d82c") +# UUID("f353e5c2-e432-4d1e-84a8-d750c93d4edf") +# UUID("437c8c08-cef6-4ebf-a35f-93d6168b61a4") + + +@pytest.mark.parametrize( + "n_test_embeddings", + [ + (16), + (256), + (1024), + ], +) +def test__cosine_similarity(n_test_embeddings): + """ + Validate that the cosine similarity function correctly computes pairwise similarities + and that the results match expected values. 
+ """ + + threshold = 0.7 + embeddings = generate_test_vectors( + n=n_test_embeddings, + d=64, + min_similarity=min(threshold + 0.025, 1.0), + similar_fraction=0.3, + ) + expected = cosine_similarity(embeddings) + + builder = CosineSimilarityBuilder(property_name="embedding", threshold=threshold) + result = builder._block_cosine_similarity(embeddings, embeddings) + + assert result.shape == expected.shape, "Result shape does not match expected shape" + assert np.allclose(result, expected, atol=1e-5), ( + "Cosine similarity does not match expected values" + ) + + +# Test for the internal _find_similar_embedding_pairs method +@pytest.mark.parametrize( + "n_test_embeddings, threshold, block_size", + [ + (16, 0.5, 16), + (16, 0.7, 16), + (16, 0.9, 16), + (16, 0.7, 32), # block size >> n_test_embeddings + (16, 0.7, 37), # block size >> n_test_embeddings + (32, 0.7, 16), # block size 1/2 n_test_embeddings + (37, 0.7, 4), # block size doesn't shard evenly + ], +) +def test__find_similar_embedding_pairs(n_test_embeddings, threshold, block_size): + """Validate that _find_similar_embedding_pairs correctly identifies pairs when compared with scipy's cosine distance.""" + + embeddings = generate_test_vectors( + n=n_test_embeddings, + d=64, + min_similarity=min(threshold + 0.025, 1.0), + similar_fraction=0.3, + ) + expected = cosine_similarity_pair(embeddings, threshold) + + builder = CosineSimilarityBuilder(property_name="embedding", threshold=threshold) + result = asyncio.run( + builder._find_similar_embedding_pairs( + embeddings, threshold=threshold, block_size=block_size + ) + ) + + assert len(result) == len(expected) + + for i, j, similarity_float in result: + assert i < j, "Pairs should be ordered (i < j)" + assert similarity_float >= threshold, ( + f"Similarity {similarity_float} should be >= {threshold}" + ) + for x, y, expected_similarity in expected: + if i == x and j == y: + assert similarity_float == pytest.approx(expected_similarity), ( + "Cosine similarity does not match expected value" + ) + + break + + +@pytest.mark.asyncio +async def test_cosine_similarity_builder_basic(simple_kg): + # Act + builder = CosineSimilarityBuilder(property_name="embedding", threshold=0.5) + relationships = await builder.transform(simple_kg) + # Assert + assert all(isinstance(r, Relationship) for r in relationships) + assert all(r.type == "cosine_similarity" for r in relationships) + # 2 <-> 3 (~0.6520 similarity) + assert any( + str(r.source.id) == "f353e5c2-e432-4d1e-84a8-d750c93d4edf" + and str(r.target.id) == "437c8c08-cef6-4ebf-a35f-93d6168b61a4" + for r in relationships + ) + # 1 <-> 3 (~0.8258 similarity) + assert any( + str(r.source.id) == "4da47a69-539c-49a2-b289-01780989d82c" + and str(r.target.id) == "437c8c08-cef6-4ebf-a35f-93d6168b61a4" + for r in relationships + ) + + +@pytest.mark.asyncio +async def test_summary_cosine_similarity_builder_basic(simple_kg): + # Act + builder = SummaryCosineSimilarityBuilder( + property_name="summary_embedding", threshold=0.5 + ) + relationships = await builder.transform(simple_kg) + # Assert + assert all(isinstance(r, Relationship) for r in relationships) + assert all(r.type == "summary_cosine_similarity" for r in relationships) + # 2 <-> 3 (~0.6520 similarity) + assert any( + str(r.source.id) == "f353e5c2-e432-4d1e-84a8-d750c93d4edf" + and str(r.target.id) == "437c8c08-cef6-4ebf-a35f-93d6168b61a4" + for r in relationships + ) + # 1 <-> 3 (~0.8258 similarity) + assert any( + str(r.source.id) == "4da47a69-539c-49a2-b289-01780989d82c" + and str(r.target.id) == 
"437c8c08-cef6-4ebf-a35f-93d6168b61a4" + for r in relationships + ) + + +@pytest.mark.asyncio +async def test_cosine_similarity_builder_no_embeddings(): + kg = KnowledgeGraph(nodes=[Node(type=NodeType.DOCUMENT, properties={})]) + builder = CosineSimilarityBuilder(property_name="embedding") + with pytest.raises(ValueError, match="has no embedding"): + await builder.transform(kg) + + +@pytest.mark.asyncio +async def test_summary_cosine_similarity_builder_filter_and_error(): + kg = KnowledgeGraph(nodes=[Node(type=NodeType.DOCUMENT, properties={})]) + builder = SummaryCosineSimilarityBuilder(property_name="summary_embedding") + with pytest.raises(ValueError, match="has no summary_embedding"): + await builder.transform(kg) + + +@pytest.mark.asyncio +async def test_cosine_similarity_builder_shape_validation(): + kg = KnowledgeGraph( + nodes=[ + Node(type=NodeType.DOCUMENT, properties={"embedding": [1.0, 0.0]}), + Node( + type=NodeType.DOCUMENT, + properties={"embedding": [0.0, 1.0, 2.0]}, + ), + ] + ) + builder = CosineSimilarityBuilder(property_name="embedding") + with pytest.raises( + ValueError, match="Embedding at index 1 has length 3, expected 2" + ): + await builder.transform(kg) + + +@pytest.mark.asyncio +async def test_cosine_similarity_builder_empty_graph(): + kg = KnowledgeGraph(nodes=[]) + builder = CosineSimilarityBuilder(property_name="embedding") + with pytest.raises(ValueError, match="No nodes have a valid embedding"): + await builder.transform(kg) From b24b9a9216ad8cb027ecb973920cebb04f1abfb1 Mon Sep 17 00:00:00 2001 From: ahgraber Date: Thu, 12 Jun 2025 15:29:20 -0400 Subject: [PATCH 02/11] feat: test edge cases and organize tests --- .../unit/test_cosine_relationship_builders.py | 261 +++++++++++++----- 1 file changed, 190 insertions(+), 71 deletions(-) diff --git a/ragas/tests/unit/test_cosine_relationship_builders.py b/ragas/tests/unit/test_cosine_relationship_builders.py index 5bf6d33a2..ab08211c9 100644 --- a/ragas/tests/unit/test_cosine_relationship_builders.py +++ b/ragas/tests/unit/test_cosine_relationship_builders.py @@ -1,4 +1,5 @@ import asyncio +import copy import random from uuid import UUID @@ -216,89 +217,207 @@ def test__find_similar_embedding_pairs(n_test_embeddings, threshold, block_size) break -@pytest.mark.asyncio -async def test_cosine_similarity_builder_basic(simple_kg): - # Act - builder = CosineSimilarityBuilder(property_name="embedding", threshold=0.5) - relationships = await builder.transform(simple_kg) - # Assert - assert all(isinstance(r, Relationship) for r in relationships) - assert all(r.type == "cosine_similarity" for r in relationships) - # 2 <-> 3 (~0.6520 similarity) - assert any( - str(r.source.id) == "f353e5c2-e432-4d1e-84a8-d750c93d4edf" - and str(r.target.id) == "437c8c08-cef6-4ebf-a35f-93d6168b61a4" - for r in relationships - ) - # 1 <-> 3 (~0.8258 similarity) - assert any( - str(r.source.id) == "4da47a69-539c-49a2-b289-01780989d82c" - and str(r.target.id) == "437c8c08-cef6-4ebf-a35f-93d6168b61a4" - for r in relationships - ) +class TestCosineSimilarityBuilder: + @pytest.mark.asyncio + async def test_no_self_similarity_relationships(self, simple_kg): + builder = CosineSimilarityBuilder(property_name="embedding", threshold=0.1) + relationships = await builder.transform(copy.deepcopy(simple_kg)) + for r in relationships: + assert r.source.id != r.target.id, ( + "Self-relationships should not be created" + ) + + @pytest.mark.asyncio + async def test_no_duplicate_relationships(self, simple_kg): + builder = 
CosineSimilarityBuilder(property_name="embedding", threshold=0.1) + relationships = await builder.transform(copy.deepcopy(simple_kg)) + seen = set() + for r in relationships: + pair = tuple(sorted([r.source.id, r.target.id])) + assert pair not in seen, "Duplicate relationships found" + seen.add(pair) + + @pytest.mark.asyncio + async def test_similarity_at_threshold(self): + node1 = Node(type=NodeType.CHUNK, properties={"embedding": [1, 0, 0]}) + node2 = Node(type=NodeType.CHUNK, properties={"embedding": [1, 0, 0]}) + kg = KnowledgeGraph(nodes=[node1, node2]) + builder = CosineSimilarityBuilder(property_name="embedding", threshold=1.0) + relationships = await builder.transform(kg) + assert len(relationships) == 1, "Should create relationship at threshold" + + @pytest.mark.asyncio + async def test_all_below_threshold(self): + node1 = Node(type=NodeType.CHUNK, properties={"embedding": [1, 0, 0]}) + node2 = Node(type=NodeType.CHUNK, properties={"embedding": [-1, 0, 0]}) + kg = KnowledgeGraph(nodes=[node1, node2]) + builder = CosineSimilarityBuilder(property_name="embedding", threshold=0.5) + relationships = await builder.transform(kg) + assert len(relationships) == 0, ( + "No relationships should be created below threshold" + ) + + @pytest.mark.asyncio + async def test_all_above_threshold(self): + node1 = Node(type=NodeType.CHUNK, properties={"embedding": [1, 0, 0]}) + node2 = Node(type=NodeType.CHUNK, properties={"embedding": [1, 0, 0]}) + node3 = Node(type=NodeType.CHUNK, properties={"embedding": [1, 0, 0]}) + kg = KnowledgeGraph(nodes=[node1, node2, node3]) + builder = CosineSimilarityBuilder(property_name="embedding", threshold=0.9) + relationships = await builder.transform(kg) + assert len(relationships) == 3 + + @pytest.mark.asyncio + async def test_malformed_embedding_raises(self): + node1 = Node(type=NodeType.CHUNK, properties={"embedding": [1, 0, 0]}) + node2 = Node(type=NodeType.CHUNK, properties={"embedding": ["a", 0, 0]}) + kg = KnowledgeGraph(nodes=[node1, node2]) + builder = CosineSimilarityBuilder(property_name="embedding", threshold=0.5) + with pytest.raises(Exception): + await builder.transform(kg) + + @pytest.mark.asyncio + async def test_cosine_similarity_builder_empty_graph(self): + kg = KnowledgeGraph(nodes=[]) + builder = CosineSimilarityBuilder(property_name="embedding") + with pytest.raises(ValueError, match="No nodes have a valid embedding"): + await builder.transform(kg) + + @pytest.mark.asyncio + async def test_cosine_similarity_builder_basic(self, simple_kg): + # Act + builder = CosineSimilarityBuilder(property_name="embedding", threshold=0.5) + relationships = await builder.transform(simple_kg) + # Assert + assert all(isinstance(r, Relationship) for r in relationships) + assert all(r.type == "cosine_similarity" for r in relationships) + # 2 <-> 3 (~0.6520 similarity) + assert any( + str(r.source.id) == "f353e5c2-e432-4d1e-84a8-d750c93d4edf" + and str(r.target.id) == "437c8c08-cef6-4ebf-a35f-93d6168b61a4" + for r in relationships + ) + # 1 <-> 3 (~0.8258 similarity) + assert any( + str(r.source.id) == "4da47a69-539c-49a2-b289-01780989d82c" + and str(r.target.id) == "437c8c08-cef6-4ebf-a35f-93d6168b61a4" + for r in relationships + ) + + @pytest.mark.asyncio + async def test_cosine_similarity_builder_no_embeddings(self): + kg = KnowledgeGraph(nodes=[Node(type=NodeType.DOCUMENT, properties={})]) + builder = CosineSimilarityBuilder(property_name="embedding") + with pytest.raises(ValueError, match="has no embedding"): + await builder.transform(kg) + + 
@pytest.mark.asyncio + async def test_cosine_similarity_builder_shape_validation(self): + kg = KnowledgeGraph( + nodes=[ + Node(type=NodeType.DOCUMENT, properties={"embedding": [1.0, 0.0]}), + Node( + type=NodeType.DOCUMENT, + properties={"embedding": [0.0, 1.0, 2.0]}, + ), + ] + ) + builder = CosineSimilarityBuilder(property_name="embedding") + with pytest.raises( + ValueError, match="Embedding at index 1 has length 3, expected 2" + ): + await builder.transform(kg) + + @pytest.mark.asyncio + async def test_apply_transforms_cosine_similarity_builder(self, simple_kg): + from ragas.run_config import RunConfig + from ragas.testset.transforms.engine import apply_transforms + + # CosineSimilarityBuilder should add relationships to the graph + builder = CosineSimilarityBuilder(property_name="embedding", threshold=0.5) + kg = simple_kg + # Should mutate kg in-place + apply_transforms(kg, builder, run_config=RunConfig(max_workers=2)) + # Check that relationships were added + assert any(r.type == "cosine_similarity" for r in kg.relationships), ( + "No cosine_similarity relationships found after apply_transforms" + ) + # Check that expected relationship exists + assert any( + str(r.source.id) == "f353e5c2-e432-4d1e-84a8-d750c93d4edf" + and str(r.target.id) == "437c8c08-cef6-4ebf-a35f-93d6168b61a4" + for r in kg.relationships + ) + # 1 <-> 3 (~0.8258 similarity) + assert any( + str(r.source.id) == "4da47a69-539c-49a2-b289-01780989d82c" + and str(r.target.id) == "437c8c08-cef6-4ebf-a35f-93d6168b61a4" + for r in kg.relationships + ) + + +class TestSummaryCosineSimilarityBuilder: + @pytest.mark.asyncio + async def test_summary_cosine_similarity_builder_basic(self, simple_kg): + builder = SummaryCosineSimilarityBuilder( + property_name="summary_embedding", threshold=0.5 + ) + relationships = await builder.transform(simple_kg) + assert all(isinstance(r, Relationship) for r in relationships) + assert all(r.type == "summary_cosine_similarity" for r in relationships) + assert any( + str(r.source.id) == "f353e5c2-e432-4d1e-84a8-d750c93d4edf" + and str(r.target.id) == "437c8c08-cef6-4ebf-a35f-93d6168b61a4" + for r in relationships + ) + assert any( + str(r.source.id) == "4da47a69-539c-49a2-b289-01780989d82c" + and str(r.target.id) == "437c8c08-cef6-4ebf-a35f-93d6168b61a4" + for r in relationships + ) + + @pytest.mark.asyncio + async def test_summary_cosine_similarity_only_document_nodes(self): + node1 = Node( + type=NodeType.DOCUMENT, properties={"summary_embedding": [1, 0, 0]} + ) + node2 = Node(type=NodeType.CHUNK, properties={"summary_embedding": [1, 0, 0]}) + kg = KnowledgeGraph(nodes=[node1, node2]) + builder = SummaryCosineSimilarityBuilder( + property_name="summary_embedding", threshold=0.5 + ) + relationships = await builder.transform(kg) + assert len(relationships) == 0 + + @pytest.mark.asyncio + async def test_summary_cosine_similarity_builder_filter_and_error(self): + kg = KnowledgeGraph(nodes=[Node(type=NodeType.DOCUMENT, properties={})]) + builder = SummaryCosineSimilarityBuilder(property_name="summary_embedding") + with pytest.raises(ValueError, match="has no summary_embedding"): + await builder.transform(kg) @pytest.mark.asyncio -async def test_summary_cosine_similarity_builder_basic(simple_kg): - # Act +async def test_apply_transforms_summary_cosine_similarity_builder(simple_kg): + from ragas.run_config import RunConfig + from ragas.testset.transforms.engine import apply_transforms + builder = SummaryCosineSimilarityBuilder( property_name="summary_embedding", threshold=0.5 ) - relationships = 
await builder.transform(simple_kg) - # Assert - assert all(isinstance(r, Relationship) for r in relationships) - assert all(r.type == "summary_cosine_similarity" for r in relationships) - # 2 <-> 3 (~0.6520 similarity) + kg = simple_kg + apply_transforms(kg, builder, run_config=RunConfig(max_workers=2)) + assert any(r.type == "summary_cosine_similarity" for r in kg.relationships), ( + "No summary_cosine_similarity relationships found after apply_transforms" + ) assert any( str(r.source.id) == "f353e5c2-e432-4d1e-84a8-d750c93d4edf" and str(r.target.id) == "437c8c08-cef6-4ebf-a35f-93d6168b61a4" - for r in relationships + for r in kg.relationships ) # 1 <-> 3 (~0.8258 similarity) assert any( str(r.source.id) == "4da47a69-539c-49a2-b289-01780989d82c" and str(r.target.id) == "437c8c08-cef6-4ebf-a35f-93d6168b61a4" - for r in relationships + for r in kg.relationships ) - - -@pytest.mark.asyncio -async def test_cosine_similarity_builder_no_embeddings(): - kg = KnowledgeGraph(nodes=[Node(type=NodeType.DOCUMENT, properties={})]) - builder = CosineSimilarityBuilder(property_name="embedding") - with pytest.raises(ValueError, match="has no embedding"): - await builder.transform(kg) - - -@pytest.mark.asyncio -async def test_summary_cosine_similarity_builder_filter_and_error(): - kg = KnowledgeGraph(nodes=[Node(type=NodeType.DOCUMENT, properties={})]) - builder = SummaryCosineSimilarityBuilder(property_name="summary_embedding") - with pytest.raises(ValueError, match="has no summary_embedding"): - await builder.transform(kg) - - -@pytest.mark.asyncio -async def test_cosine_similarity_builder_shape_validation(): - kg = KnowledgeGraph( - nodes=[ - Node(type=NodeType.DOCUMENT, properties={"embedding": [1.0, 0.0]}), - Node( - type=NodeType.DOCUMENT, - properties={"embedding": [0.0, 1.0, 2.0]}, - ), - ] - ) - builder = CosineSimilarityBuilder(property_name="embedding") - with pytest.raises( - ValueError, match="Embedding at index 1 has length 3, expected 2" - ): - await builder.transform(kg) - - -@pytest.mark.asyncio -async def test_cosine_similarity_builder_empty_graph(): - kg = KnowledgeGraph(nodes=[]) - builder = CosineSimilarityBuilder(property_name="embedding") - with pytest.raises(ValueError, match="No nodes have a valid embedding"): - await builder.transform(kg) From b422c50e705ff961a7ee1d977d0b70315fd22ad3 Mon Sep 17 00:00:00 2001 From: ahgraber Date: Fri, 13 Jun 2025 15:02:42 -0400 Subject: [PATCH 03/11] feat: add scipy to development dependencies --- ragas/pyproject.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ragas/pyproject.toml b/ragas/pyproject.toml index d93f76ab3..8201c39d3 100644 --- a/ragas/pyproject.toml +++ b/ragas/pyproject.toml @@ -64,6 +64,9 @@ dev = [ "haystack-ai", "sacrebleu", "r2r", + "scipy", +] +test = [ "pytest", "pytest-xdist[psutil]", "pytest-asyncio", From 60bb6e83840825860b76c62e291ed5126f4d515e Mon Sep 17 00:00:00 2001 From: ahgraber Date: Fri, 13 Jun 2025 15:04:17 -0400 Subject: [PATCH 04/11] feat: enhance JaccardSimilarityBuilder with async processing and new methods - Refactored the JaccardSimilarityBuilder to use async methods for finding similar embedding pairs. - Introduced a new method `generate_execution_plan` to generate coroutines of comparisons for better tracking and potential concurrency - Updated the `transform` method to utilize the new async functionality. - Added comprehensive test coverage for the new features in the JaccardSimilarityBuilder. 
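
A minimal usage sketch (not part of this diff) of how the coroutines returned by
`generate_execution_plan` can be awaited directly; in practice they are scheduled by the
Executor via `apply_transforms`, as exercised in the tests below:

    import asyncio

    from ragas.testset.graph import KnowledgeGraph, Node, NodeType
    from ragas.testset.transforms.relationship_builders.traditional import (
        JaccardSimilarityBuilder,
    )

    kg = KnowledgeGraph(
        nodes=[
            Node(type=NodeType.DOCUMENT, properties={"entities": {"cat", "dog"}}),
            Node(type=NodeType.DOCUMENT, properties={"entities": {"cat", "fish"}}),
        ]
    )
    builder = JaccardSimilarityBuilder(property_name="entities", threshold=0.3)

    async def main() -> None:
        # each coroutine finds similar node pairs and appends Relationships to kg in place
        await asyncio.gather(*builder.generate_execution_plan(kg))

    asyncio.run(main())
    # kg.relationships now holds one bidirectional "jaccard_similarity" relationship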
--- .../relationship_builders/traditional.py | 71 ++-- .../unit/test_cosine_relationship_builders.py | 24 +- .../test_traditional_relationship_builders.py | 396 ++++++++++++++++++ 3 files changed, 461 insertions(+), 30 deletions(-) create mode 100644 ragas/tests/unit/test_traditional_relationship_builders.py diff --git a/ragas/src/ragas/testset/transforms/relationship_builders/traditional.py b/ragas/src/ragas/testset/transforms/relationship_builders/traditional.py index ad33ea42f..5b1a7d6f8 100644 --- a/ragas/src/ragas/testset/transforms/relationship_builders/traditional.py +++ b/ragas/src/ragas/testset/transforms/relationship_builders/traditional.py @@ -1,3 +1,4 @@ +import itertools import typing as t from collections import Counter from dataclasses import dataclass @@ -19,39 +20,62 @@ def _jaccard_similarity(self, set1: t.Set[str], set2: t.Set[str]) -> float: union = len(set1.union(set2)) return intersection / union if union > 0 else 0.0 - async def transform(self, kg: KnowledgeGraph) -> t.List[Relationship]: - if self.property_name is None: - self.property_name - - similar_pairs = [] - for i, node1 in enumerate(kg.nodes): - for j, node2 in enumerate(kg.nodes): - if i >= j: - continue - items1 = node1.get_property(self.property_name) - items2 = node2.get_property(self.property_name) - if items1 is None or items2 is None: - raise ValueError( - f"Node {node1.id} or {node2.id} has no {self.property_name}" - ) - if self.key_name is not None: - items1 = items1.get(self.key_name, []) - items2 = items2.get(self.key_name, []) - similarity = self._jaccard_similarity(set(items1), set(items2)) - if similarity >= self.threshold: - similar_pairs.append((i, j, similarity)) + async def _find_similar_embedding_pairs( + self, kg: KnowledgeGraph + ) -> t.Set[t.Tuple[int, int, float]]: + """ + Finds all node index pairs with Jaccard similarity above the threshold. + Returns a set of (i, j, similarity) tuples. + """ + + similar_pairs = set() + for (i, node1), (j, node2) in itertools.combinations(enumerate(kg.nodes), 2): + items1 = node1.get_property(self.property_name) + items2 = node2.get_property(self.property_name) + if items1 is None or items2 is None: + raise ValueError( + f"Node {node1.id} or {node2.id} has no {self.property_name}" + ) + if self.key_name is not None: + items1 = items1.get(self.key_name, []) + items2 = items2.get(self.key_name, []) + similarity = self._jaccard_similarity(set(items1), set(items2)) + if similarity >= self.threshold: + similar_pairs.add((i, j, similarity)) + return similar_pairs + async def transform(self, kg: KnowledgeGraph) -> t.List[Relationship]: + similar_pairs = await self._find_similar_embedding_pairs(kg) return [ Relationship( source=kg.nodes[i], target=kg.nodes[j], - type="jaccard_similarity", + type=self.new_property_name, properties={self.new_property_name: similarity_float}, bidirectional=True, ) for i, j, similarity_float in similar_pairs ] + def generate_execution_plan(self, kg: KnowledgeGraph) -> t.List[t.Coroutine]: + """ + Generates a coroutine task for finding similar pairs, which can be scheduled/executed by an Executor. 
+ """ + + async def find_and_add_relationships(): + similar_pairs = await self._find_similar_embedding_pairs(kg) + for i, j, similarity_float in similar_pairs: + rel = Relationship( + source=kg.nodes[i], + target=kg.nodes[j], + type=self.new_property_name, + properties={self.new_property_name: similarity_float}, + bidirectional=True, + ) + kg.relationships.append(rel) + + return [find_and_add_relationships()] + @dataclass class OverlapScoreBuilder(RelationshipBuilder): @@ -65,6 +89,7 @@ class OverlapScoreBuilder(RelationshipBuilder): def __post_init__(self): try: from rapidfuzz import distance + except ImportError: raise ImportError( "rapidfuzz is required for string distance. Please install it using `pip install rapidfuzz`" @@ -78,13 +103,11 @@ def __post_init__(self): } def _overlap_score(self, overlaps: t.List[bool]) -> float: - return sum(overlaps) / len(overlaps) if len(overlaps) > 0 else 0.0 def _get_noisy_items( self, nodes: t.List[Node], property_name: str, percent_cut_off: float = 0.05 ) -> t.List[str]: - all_items = [] for node in nodes: items = node.get_property(property_name) diff --git a/ragas/tests/unit/test_cosine_relationship_builders.py b/ragas/tests/unit/test_cosine_relationship_builders.py index ab08211c9..ea048086c 100644 --- a/ragas/tests/unit/test_cosine_relationship_builders.py +++ b/ragas/tests/unit/test_cosine_relationship_builders.py @@ -14,8 +14,12 @@ def generate_test_vectors( - n=16, d=32, min_similarity=0.5, similar_fraction=0.3, seed=None -): + n: int = 16, + d: int = 32, + min_similarity: float = 0.5, + similar_fraction: float = 0.3, + seed: int | None = None, +) -> np.ndarray: """ Generate `n` unit vectors of dimension `d`, where at least `similar_fraction` of them are similar to each other (cosine similarity > `min_similarity`), and the result is shuffled. 
@@ -69,7 +73,8 @@ def generate_test_vectors( return np.stack(all_vectors) -def cosine_similarity(embeddings: np.ndarray): +def cosine_similarity_matrix(embeddings: np.ndarray): + """Calculate cosine similarity matrix for a set of embeddings.""" from scipy.spatial.distance import cdist similarity = 1 - cdist(embeddings, embeddings, metric="cosine") @@ -80,8 +85,9 @@ def cosine_similarity(embeddings: np.ndarray): def cosine_similarity_pair(embeddings: np.ndarray, threshold: float): + """Find pairs of embeddings with cosine similarity >= threshold.""" # Find pairs with similarity >= threshold - similarity_matrix = cosine_similarity(embeddings) + similarity_matrix = cosine_similarity_matrix(embeddings) similar_pairs = np.argwhere(similarity_matrix >= threshold) # Filter out self-comparisons and duplicate pairs @@ -93,6 +99,7 @@ def cosine_similarity_pair(embeddings: np.ndarray, threshold: float): def vector_cosine_similarity(a, b): + """Find pairwise cosine similarity between two vectors.""" return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)) @@ -159,7 +166,7 @@ def test__cosine_similarity(n_test_embeddings): min_similarity=min(threshold + 0.025, 1.0), similar_fraction=0.3, ) - expected = cosine_similarity(embeddings) + expected = cosine_similarity_matrix(embeddings) builder = CosineSimilarityBuilder(property_name="embedding", threshold=threshold) result = builder._block_cosine_similarity(embeddings, embeddings) @@ -306,7 +313,12 @@ async def test_cosine_similarity_builder_basic(self, simple_kg): @pytest.mark.asyncio async def test_cosine_similarity_builder_no_embeddings(self): - kg = KnowledgeGraph(nodes=[Node(type=NodeType.DOCUMENT, properties={})]) + kg = KnowledgeGraph( + nodes=[ + Node(type=NodeType.DOCUMENT, properties={}), + Node(type=NodeType.DOCUMENT, properties={}), + ] + ) builder = CosineSimilarityBuilder(property_name="embedding") with pytest.raises(ValueError, match="has no embedding"): await builder.transform(kg) diff --git a/ragas/tests/unit/test_traditional_relationship_builders.py b/ragas/tests/unit/test_traditional_relationship_builders.py new file mode 100644 index 000000000..15d37e7a0 --- /dev/null +++ b/ragas/tests/unit/test_traditional_relationship_builders.py @@ -0,0 +1,396 @@ +import asyncio +import copy +import math +import random +import string +from typing import List, Set, Tuple +from uuid import UUID + +import numpy as np +import pytest + +from ragas.testset.graph import KnowledgeGraph, Node, NodeType, Relationship +from ragas.testset.transforms.relationship_builders.traditional import ( + JaccardSimilarityBuilder, +) + + +def generate_test_sets( + n: int = 16, + max_len: int = 32, + min_similarity: float = 0.5, + similar_fraction: float = 0.3, +) -> List[Set[str]]: + """ + Generate `n` sets up to `max_len`, where at least `similar_fraction` of all possible + pairs have Jaccard similarity >= `min_similarity`. The result is shuffled. + + Parameters: + - n (int): Total number of sets to generate. + - max_len (int): Maximum length of each set. + - min_similarity (float): Minimum Jaccard similarity for similar pairs. + - similar_fraction (float): Fraction (0-1) of sets that should be similar. + + Returns: + - list: List of generated sets. 
+ """ + + def generate_entity(k: int = 5) -> str: + """Generate a random entity of length k.""" + return "".join(random.choices(string.ascii_lowercase, k=k)) + + def jaccard(a: set[str], b: set[str]) -> float: + from scipy.spatial.distance import jaccard as jaccard_dist + + # union of elements -> boolean indicator vectors + elems = sorted(a | b) + va = np.array([e in a for e in elems], dtype=bool) + vb = np.array([e in b for e in elems], dtype=bool) + # SciPy returns the Jaccard distance; similarity = 1 - distance + return 1.0 - jaccard_dist(va, vb) + + # bias toward shorter lengths (expovariate with λ=1.0) + def sample_length() -> int: + length = int(random.expovariate(1.0)) + return min(length, max_len) + + total_pairs = n * (n - 1) // 2 + target_similar = math.ceil(total_pairs * similar_fraction) + + # Initialize all sets with random, ragged lengths + sets = [{generate_entity() for _ in range(sample_length())} for _ in range(n)] + + # Count how many pairs are “similar” right now + current_similar = len(jaccard_similarity_pair(sets, min_similarity)) + + # Iteratively fix random non‐similar pairs until we hit target + max_attempts = target_similar * 10 + attempts = 0 + + while current_similar < target_similar and attempts < max_attempts: + # pick a non‐similar pair + bad_pairs = [ + (i, j) + for i in range(n) + for j in range(i + 1, n) + if jaccard(sets[i], sets[j]) < min_similarity + ] + if not bad_pairs: + break + i, j = random.choice(bad_pairs) + + # decide new lengths + Li, Lj = sample_length(), sample_length() + # solve for needed intersection size I such that + # I / (Li + Lj - I) >= min_similarity + I = math.ceil(min_similarity * (Li + Lj) / (1 + min_similarity)) + + # build new similar pair + shared = {generate_entity() for _ in range(I)} + Ai = shared | {generate_entity() for _ in range(Li - I)} + Bj = shared | {generate_entity() for _ in range(Lj - I)} + + sets[i], sets[j] = Ai, Bj + + current_similar = len(jaccard_similarity_pair(sets, min_similarity)) + attempts += 1 + else: + raise ValueError( + f"Could not generate enough similar pairs after {max_attempts} attempts." + ) + + # Create a core set of shared elements for similar sets + core_size = max(1, int(max_len * min_similarity)) + core = {generate_entity() for _ in range(core_size)} + + # Create a set of unique elements to draw from + base_pool = {generate_entity() for _ in range(n * max_len * 8)} + base_pool -= core + + n_similar = int(n * similar_fraction) + n_dissimilar = n - n_similar + + # Pre-calculate max add'l unique elements that can be added to core while still guaranteeing min_similarity + max_unique = int(core_size * ((1 - min_similarity) / min_similarity)) + if max_unique > max_len: + raise ValueError( + "max_unique exceeds max_len, cannot guarantee min_similarity with given parameters." 
+ ) + + # Generate similar sets + similar = [] + for _ in range(n_similar): + # Random size for this set, at least the core size + set_len = core_size + random.randint(0, max_unique) + s = core.copy() + # Add random elements from the base pool until we reach set_len + while len(s) < set_len: + if not base_pool: + raise ValueError("Base pool is empty, cannot generate more sets.") + element = base_pool.pop() + if element not in s: + s.add(element) + similar.append(s) + + # Generate dissimilar sets + dissimilar = [] + for _ in range(n_dissimilar): + set_len = random.randint(0, max_len) + s = set() + while len(s) < set_len: + if not base_pool: + raise ValueError("Base pool is empty, cannot generate more sets.") + element = base_pool.pop() + if element not in s: + s.add(element) + dissimilar.append(s) + + sets = similar + dissimilar + random.shuffle(sets) + return sets + + +def validate_sets(sets: list[set[str]], min_similarity: float, similar_fraction: float): + n = len(sets) + n_similar_needed = int(n * similar_fraction) + + similar_pairs = jaccard_similarity_pair(sets, min_similarity) + n_similar_pairs = len(similar_pairs) + actual_similar_fraction = n_similar_pairs / (n * (n - 1) // 2) + + print(f"Expected similar pairs: {n_similar_needed}") + print(f"Actual similar pairs: {n_similar_pairs}") + print(f"Actual similar fraction: {actual_similar_fraction:.2f}") + print(f"Similarity threshold: {min_similarity}") + + +def jaccard_similarity_matrix(sets: List[Set[str]]) -> np.ndarray: + """Calculate Jaccard similarity matrix for a list of string sets.""" + n = len(sets) + similarity = np.zeros((n, n), dtype=float) + + for i in range(n): + for j in range(i, n): + intersection = sets[i].intersection(sets[j]) + union = sets[i].union(sets[j]) + score = len(intersection) / len(union) if union else 0.0 + similarity[i, j] = similarity[j, i] = score + + return similarity + + +def jaccard_similarity_pair( + sets: List[Set[str]], threshold: float +) -> List[Tuple[int, int, float]]: + """Find pairs of sets with Jaccard similarity >= threshold.""" + similarity_matrix = jaccard_similarity_matrix(sets) + similar_pairs = np.argwhere(similarity_matrix >= threshold) + + return [ + (int(i), int(j), float(similarity_matrix[i, j])) + for i, j in similar_pairs + if i < j # avoid self-pairs and duplicates + ] + + +@pytest.fixture +def simple_kg(): + # Arrange: create a simple knowledge graph with embeddings + # roughly, we expect the following relationships: + # 1 <-> 2 (0.0 similarity) + # 2 <-> 3 (0.1667 similarity) + # 1 <-> 3 (0.25 similarity) + nodes = [ + Node( + id=UUID("4da47a69-539c-49a2-b289-01780989d82c"), + type=NodeType.DOCUMENT, + properties={ + "entities": {"cat", "dog", "fish", "fox", "bird"}, + }, + ), + Node( + id=UUID("f353e5c2-e432-4d1e-84a8-d750c93d4edf"), + type=NodeType.DOCUMENT, + properties={ + "entities": {"apple", "banana"}, + }, + ), + Node( + id=UUID("437c8c08-cef6-4ebf-a35f-93d6168b61a4"), + type=NodeType.DOCUMENT, + properties={ + "entities": {"cat", "banana", "dog", "rock", "tree"}, + }, + ), + ] + return KnowledgeGraph(nodes=nodes) + + +# node order +# UUID("4da47a69-539c-49a2-b289-01780989d82c") +# UUID("f353e5c2-e432-4d1e-84a8-d750c93d4edf") +# UUID("437c8c08-cef6-4ebf-a35f-93d6168b61a4") + + +@pytest.mark.parametrize( + "n_test_sets, max_len, threshold", + [ + (8, 100, 0.2), + (16, 8, 0.1), + (16, 16, 0.5), + (32, 5, 0.3), + ], +) +def test__find_similar_embedding_pairs_jaccard(n_test_sets, max_len, threshold): + """ + Validate that _find_similar_embedding_pairs correctly identifies 
pairs when compared with scipy's jaccard distance. + """ + sets = generate_test_sets( + n=n_test_sets, + max_len=max_len, + min_similarity=min(threshold + 0.05, 1.0), + similar_fraction=0.3, + ) + expected = jaccard_similarity_pair(sets, threshold) + + kg = KnowledgeGraph( + nodes=[Node(type=NodeType.DOCUMENT, properties={"entities": s}) for s in sets] + ) + builder = JaccardSimilarityBuilder(property_name="entities", threshold=threshold) + result = list(asyncio.run(builder._find_similar_embedding_pairs(kg))) + + assert len(result) == len(expected) + for i, j, similarity_float in result: + assert i < j, "Pairs should be ordered (i < j)" + assert similarity_float >= threshold, ( + f"Similarity {similarity_float} should be >= {threshold}" + ) + for x, y, expected_similarity in expected: + if i == x and j == y: + assert similarity_float == pytest.approx(expected_similarity) + break + + +class TestJaccardSimilarityBuilder: + @pytest.mark.asyncio + async def test_no_self_similarity_relationships(self, simple_kg): + builder = JaccardSimilarityBuilder(property_name="entities", threshold=0.1) + relationships = await builder.transform(copy.deepcopy(simple_kg)) + for r in relationships: + assert r.source.id != r.target.id, ( + "Self-relationships should not be created" + ) + + @pytest.mark.asyncio + async def test_no_duplicate_relationships(self, simple_kg): + builder = JaccardSimilarityBuilder(property_name="entities", threshold=0.1) + relationships = await builder.transform(copy.deepcopy(simple_kg)) + seen = set() + for r in relationships: + pair = tuple(sorted([r.source.id, r.target.id])) + assert pair not in seen, "Duplicate relationships found" + seen.add(pair) + + @pytest.mark.asyncio + async def test_similarity_at_threshold(self): + node1 = Node(type=NodeType.DOCUMENT, properties={"entities": {"a", "b", "c"}}) + node2 = Node(type=NodeType.DOCUMENT, properties={"entities": {"a", "b", "c"}}) + kg = KnowledgeGraph(nodes=[node1, node2]) + builder = JaccardSimilarityBuilder(property_name="entities", threshold=1.0) + relationships = await builder.transform(kg) + assert len(relationships) == 1, "Should create relationship at threshold" + + @pytest.mark.asyncio + async def test_all_below_threshold(self): + node1 = Node(type=NodeType.DOCUMENT, properties={"entities": {"a", "b", "c"}}) + node2 = Node(type=NodeType.DOCUMENT, properties={"entities": {"x", "y", "z"}}) + kg = KnowledgeGraph(nodes=[node1, node2]) + builder = JaccardSimilarityBuilder(property_name="entities", threshold=0.1) + relationships = await builder.transform(kg) + assert len(relationships) == 0, ( + "No relationships should be created below threshold" + ) + + @pytest.mark.asyncio + async def test_all_above_threshold(self): + node1 = Node(type=NodeType.DOCUMENT, properties={"entities": {"a", "b", "c"}}) + node2 = Node(type=NodeType.DOCUMENT, properties={"entities": {"a", "b", "c"}}) + node3 = Node(type=NodeType.DOCUMENT, properties={"entities": {"a", "b", "c"}}) + kg = KnowledgeGraph(nodes=[node1, node2, node3]) + builder = JaccardSimilarityBuilder(property_name="entities", threshold=0.9) + relationships = await builder.transform(kg) + assert len(relationships) == 3 + + @pytest.mark.asyncio + async def test_malformed_entities_raises(self): + node1 = Node(type=NodeType.DOCUMENT, properties={"entities": {"a", "b", "c"}}) + node2 = Node(type=NodeType.DOCUMENT, properties={"entities": None}) + kg = KnowledgeGraph(nodes=[node1, node2]) + builder = JaccardSimilarityBuilder(property_name="entities", threshold=0.5) + with 
pytest.raises(ValueError): + await builder.transform(kg) + + @pytest.mark.asyncio + async def test_jaccard_similarity_builder_empty_graph(self): + kg = KnowledgeGraph(nodes=[]) + builder = JaccardSimilarityBuilder(property_name="entities") + relationships = await builder.transform(kg) + assert relationships == [] + + @pytest.mark.asyncio + async def test_jaccard_similarity_builder_basic(self, simple_kg): + builder = JaccardSimilarityBuilder(property_name="entities", threshold=0.15) + relationships = await builder.transform(simple_kg) + assert all(isinstance(r, Relationship) for r in relationships) + assert all(r.type == "jaccard_similarity" for r in relationships) + # 2 <-> 3 (~0.1667 similarity) + assert any( + str(r.source.id) == "f353e5c2-e432-4d1e-84a8-d750c93d4edf" + and str(r.target.id) == "437c8c08-cef6-4ebf-a35f-93d6168b61a4" + for r in relationships + ) + # 1 <-> 3 (~0.25 similarity) + assert any( + str(r.source.id) == "4da47a69-539c-49a2-b289-01780989d82c" + and str(r.target.id) == "437c8c08-cef6-4ebf-a35f-93d6168b61a4" + for r in relationships + ) + + @pytest.mark.asyncio + async def test_jaccard_similarity_builder_no_entities(self): + kg = KnowledgeGraph( + nodes=[ + Node(type=NodeType.DOCUMENT, properties={}), + Node(type=NodeType.DOCUMENT, properties={}), + ] + ) + builder = JaccardSimilarityBuilder(property_name="entities") + with pytest.raises(ValueError, match="has no entities"): + await builder.transform(kg) + + @pytest.mark.asyncio + async def test_apply_transforms_cosine_similarity_builder(self, simple_kg): + from ragas.run_config import RunConfig + from ragas.testset.transforms.engine import apply_transforms + + # JaccardSimilarityBuilder should add relationships to the graph + builder = JaccardSimilarityBuilder(property_name="entities", threshold=0.15) + kg = simple_kg + # Should mutate kg in-place + apply_transforms(kg, builder, run_config=RunConfig(max_workers=2)) + # Check that relationships were added + assert any(r.type == "jaccard_similarity" for r in kg.relationships), ( + "No jaccard_similarity relationships found after apply_transforms" + ) + # Check that expected relationship exists + assert any( + str(r.source.id) == "f353e5c2-e432-4d1e-84a8-d750c93d4edf" + and str(r.target.id) == "437c8c08-cef6-4ebf-a35f-93d6168b61a4" + for r in kg.relationships + ) + # 1 <-> 3 (~0.8258 similarity) + assert any( + str(r.source.id) == "4da47a69-539c-49a2-b289-01780989d82c" + and str(r.target.id) == "437c8c08-cef6-4ebf-a35f-93d6168b61a4" + for r in kg.relationships + ) From f043959189b94e3a3bc16fb6fff014b1d748e644 Mon Sep 17 00:00:00 2001 From: ahgraber Date: Sun, 22 Jun 2025 15:43:26 -0400 Subject: [PATCH 05/11] chore: format --- .../unit/test_cosine_relationship_builders.py | 42 +++++++++---------- .../test_traditional_relationship_builders.py | 24 +++++------ 2 files changed, 33 insertions(+), 33 deletions(-) diff --git a/ragas/tests/unit/test_cosine_relationship_builders.py b/ragas/tests/unit/test_cosine_relationship_builders.py index ea048086c..25333ed8c 100644 --- a/ragas/tests/unit/test_cosine_relationship_builders.py +++ b/ragas/tests/unit/test_cosine_relationship_builders.py @@ -172,9 +172,9 @@ def test__cosine_similarity(n_test_embeddings): result = builder._block_cosine_similarity(embeddings, embeddings) assert result.shape == expected.shape, "Result shape does not match expected shape" - assert np.allclose(result, expected, atol=1e-5), ( - "Cosine similarity does not match expected values" - ) + assert np.allclose( + result, expected, atol=1e-5 + ), "Cosine 
similarity does not match expected values"
 # Test for the internal _find_similar_embedding_pairs method
@@ -212,14 +212,14 @@ def test__find_similar_embedding_pairs(n_test_embeddings, threshold, block_size)
     for i, j, similarity_float in result:
         assert i < j, "Pairs should be ordered (i < j)"
-        assert similarity_float >= threshold, (
-            f"Similarity {similarity_float} should be >= {threshold}"
-        )
+        assert (
+            similarity_float >= threshold
+        ), f"Similarity {similarity_float} should be >= {threshold}"
         for x, y, expected_similarity in expected:
             if i == x and j == y:
-                assert similarity_float == pytest.approx(expected_similarity), (
-                    "Cosine similarity does not match expected value"
-                )
+                assert similarity_float == pytest.approx(
+                    expected_similarity
+                ), "Cosine similarity does not match expected value"
                 break
@@ -230,9 +230,9 @@ async def test_no_self_similarity_relationships(self, simple_kg):
         builder = CosineSimilarityBuilder(property_name="embedding", threshold=0.1)
         relationships = await builder.transform(copy.deepcopy(simple_kg))
         for r in relationships:
-            assert r.source.id != r.target.id, (
-                "Self-relationships should not be created"
-            )
+            assert (
+                r.source.id != r.target.id
+            ), "Self-relationships should not be created"

     @pytest.mark.asyncio
     async def test_no_duplicate_relationships(self, simple_kg):
@@ -260,9 +260,9 @@ async def test_all_below_threshold(self):
         kg = KnowledgeGraph(nodes=[node1, node2])
         builder = CosineSimilarityBuilder(property_name="embedding", threshold=0.5)
         relationships = await builder.transform(kg)
-        assert len(relationships) == 0, (
-            "No relationships should be created below threshold"
-        )
+        assert (
+            len(relationships) == 0
+        ), "No relationships should be created below threshold"

     @pytest.mark.asyncio
     async def test_all_above_threshold(self):
@@ -351,9 +351,9 @@ async def test_apply_transforms_cosine_similarity_builder(self, simple_kg):
         # Should mutate kg in-place
         apply_transforms(kg, builder, run_config=RunConfig(max_workers=2))
         # Check that relationships were added
-        assert any(r.type == "cosine_similarity" for r in kg.relationships), (
-            "No cosine_similarity relationships found after apply_transforms"
-        )
+        assert any(
+            r.type == "cosine_similarity" for r in kg.relationships
+        ), "No cosine_similarity relationships found after apply_transforms"
         # Check that expected relationship exists
         assert any(
             str(r.source.id) == "f353e5c2-e432-4d1e-84a8-d750c93d4edf"
@@ -419,9 +419,9 @@ async def test_apply_transforms_summary_cosine_similarity_builder(simple_kg):
     )
     kg = simple_kg
     apply_transforms(kg, builder, run_config=RunConfig(max_workers=2))
-    assert any(r.type == "summary_cosine_similarity" for r in kg.relationships), (
-        "No summary_cosine_similarity relationships found after apply_transforms"
-    )
+    assert any(
+        r.type == "summary_cosine_similarity" for r in kg.relationships
+    ), "No summary_cosine_similarity relationships found after apply_transforms"
     assert any(
         str(r.source.id) == "f353e5c2-e432-4d1e-84a8-d750c93d4edf"
         and str(r.target.id) == "437c8c08-cef6-4ebf-a35f-93d6168b61a4"
diff --git a/ragas/tests/unit/test_traditional_relationship_builders.py b/ragas/tests/unit/test_traditional_relationship_builders.py
index 15d37e7a0..47fbffc93 100644
--- a/ragas/tests/unit/test_traditional_relationship_builders.py
+++ b/ragas/tests/unit/test_traditional_relationship_builders.py
@@ -262,9 +262,9 @@ def test__find_similar_embedding_pairs_jaccard(n_test_sets, max_len, threshold):
     assert len(result) == len(expected)
     for i, j, similarity_float in result:
         assert i < j, "Pairs should be ordered (i < j)"
-        assert similarity_float >= threshold, (
-            f"Similarity {similarity_float} should be >= {threshold}"
-        )
+        assert (
+            similarity_float >= threshold
+        ), f"Similarity {similarity_float} should be >= {threshold}"
         for x, y, expected_similarity in expected:
             if i == x and j == y:
                 assert similarity_float == pytest.approx(expected_similarity)
@@ -277,9 +277,9 @@ async def test_no_self_similarity_relationships(self, simple_kg):
         builder = JaccardSimilarityBuilder(property_name="entities", threshold=0.1)
         relationships = await builder.transform(copy.deepcopy(simple_kg))
         for r in relationships:
-            assert r.source.id != r.target.id, (
-                "Self-relationships should not be created"
-            )
+            assert (
+                r.source.id != r.target.id
+            ), "Self-relationships should not be created"

     @pytest.mark.asyncio
     async def test_no_duplicate_relationships(self, simple_kg):
@@ -307,9 +307,9 @@ async def test_all_below_threshold(self):
         kg = KnowledgeGraph(nodes=[node1, node2])
         builder = JaccardSimilarityBuilder(property_name="entities", threshold=0.1)
         relationships = await builder.transform(kg)
-        assert len(relationships) == 0, (
-            "No relationships should be created below threshold"
-        )
+        assert (
+            len(relationships) == 0
+        ), "No relationships should be created below threshold"

     @pytest.mark.asyncio
     async def test_all_above_threshold(self):
@@ -379,9 +379,9 @@ async def test_apply_transforms_cosine_similarity_builder(self, simple_kg):
         # Should mutate kg in-place
         apply_transforms(kg, builder, run_config=RunConfig(max_workers=2))
         # Check that relationships were added
-        assert any(r.type == "jaccard_similarity" for r in kg.relationships), (
-            "No jaccard_similarity relationships found after apply_transforms"
-        )
+        assert any(
+            r.type == "jaccard_similarity" for r in kg.relationships
+        ), "No jaccard_similarity relationships found after apply_transforms"
         # Check that expected relationship exists
         assert any(
             str(r.source.id) == "f353e5c2-e432-4d1e-84a8-d750c93d4edf"

From e765baef3e196d1830a7ae0a790b98b2342beb0e Mon Sep 17 00:00:00 2001
From: ahgraber
Date: Sun, 22 Jun 2025 15:45:40 -0400
Subject: [PATCH 06/11] chore: lint

---
 .../test_traditional_relationship_builders.py | 36 +++++++++----------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/ragas/tests/unit/test_traditional_relationship_builders.py b/ragas/tests/unit/test_traditional_relationship_builders.py
index 47fbffc93..4bd01d7b7 100644
--- a/ragas/tests/unit/test_traditional_relationship_builders.py
+++ b/ragas/tests/unit/test_traditional_relationship_builders.py
@@ -81,14 +81,14 @@ def sample_length() -> int:
         # decide new lengths
         Li, Lj = sample_length(), sample_length()
-        # solve for needed intersection size I such that
-        # I / (Li + Lj - I) >= min_similarity
-        I = math.ceil(min_similarity * (Li + Lj) / (1 + min_similarity))
+        # solve for needed intersection size intersection_size such that
+        # intersection_size / (Li + Lj - intersection_size) >= min_similarity
+        intersection_size = math.ceil(min_similarity * (Li + Lj) / (1 + min_similarity))

         # build new similar pair
-        shared = {generate_entity() for _ in range(I)}
-        Ai = shared | {generate_entity() for _ in range(Li - I)}
-        Bj = shared | {generate_entity() for _ in range(Lj - I)}
+        shared = {generate_entity() for _ in range(intersection_size)}
+        Ai = shared | {generate_entity() for _ in range(Li - intersection_size)}
+        Bj = shared | {generate_entity() for _ in range(Lj - intersection_size)}

         sets[i], sets[j] = Ai, Bj
@@ -262,9 +262,9 @@ def test__find_similar_embedding_pairs_jaccard(n_test_sets, max_len, threshold):
     assert len(result) == len(expected)
     for i, j, similarity_float in result:
         assert i < j, "Pairs should be ordered (i < j)"
-        assert (
-            similarity_float >= threshold
-        ), f"Similarity {similarity_float} should be >= {threshold}"
+        assert similarity_float >= threshold, (
+            f"Similarity {similarity_float} should be >= {threshold}"
+        )
         for x, y, expected_similarity in expected:
             if i == x and j == y:
                 assert similarity_float == pytest.approx(expected_similarity)
@@ -277,9 +277,9 @@ async def test_no_self_similarity_relationships(self, simple_kg):
         builder = JaccardSimilarityBuilder(property_name="entities", threshold=0.1)
         relationships = await builder.transform(copy.deepcopy(simple_kg))
         for r in relationships:
-            assert (
-                r.source.id != r.target.id
-            ), "Self-relationships should not be created"
+            assert r.source.id != r.target.id, (
+                "Self-relationships should not be created"
+            )

     @pytest.mark.asyncio
     async def test_no_duplicate_relationships(self, simple_kg):
@@ -307,9 +307,9 @@ async def test_all_below_threshold(self):
         kg = KnowledgeGraph(nodes=[node1, node2])
         builder = JaccardSimilarityBuilder(property_name="entities", threshold=0.1)
         relationships = await builder.transform(kg)
-        assert (
-            len(relationships) == 0
-        ), "No relationships should be created below threshold"
+        assert len(relationships) == 0, (
+            "No relationships should be created below threshold"
+        )

     @pytest.mark.asyncio
     async def test_all_above_threshold(self):
@@ -379,9 +379,9 @@ async def test_apply_transforms_cosine_similarity_builder(self, simple_kg):
         # Should mutate kg in-place
         apply_transforms(kg, builder, run_config=RunConfig(max_workers=2))
         # Check that relationships were added
-        assert any(
-            r.type == "jaccard_similarity" for r in kg.relationships
-        ), "No jaccard_similarity relationships found after apply_transforms"
+        assert any(r.type == "jaccard_similarity" for r in kg.relationships), (
+            "No jaccard_similarity relationships found after apply_transforms"
+        )
         # Check that expected relationship exists
         assert any(
             str(r.source.id) == "f353e5c2-e432-4d1e-84a8-d750c93d4edf"

From b1f6ecbad9bfab90012f198e968c7766e8e511fa Mon Sep 17 00:00:00 2001
From: ahgraber
Date: Fri, 18 Jul 2025 12:55:38 -0400
Subject: [PATCH 07/11] chore: remove empty test group from pyproject.toml

---
 ragas/pyproject.toml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/ragas/pyproject.toml b/ragas/pyproject.toml
index 8201c39d3..f54c4fdaa 100644
--- a/ragas/pyproject.toml
+++ b/ragas/pyproject.toml
@@ -72,7 +72,6 @@ test = [
     "pytest-asyncio",
     "nbmake",
 ]
-test = []

 [tool.setuptools]
 package-dir = {"" = "src"}

From 62546681e7db7704caf555cd2900ba7e076e3251 Mon Sep 17 00:00:00 2001
From: ahgraber
Date: Fri, 18 Jul 2025 12:58:37 -0400
Subject: [PATCH 08/11] chore: move test dependencies to dev so ci works

---
 ragas/pyproject.toml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/ragas/pyproject.toml b/ragas/pyproject.toml
index f54c4fdaa..c8fe5e3eb 100644
--- a/ragas/pyproject.toml
+++ b/ragas/pyproject.toml
@@ -65,13 +65,12 @@ dev = [
     "sacrebleu",
     "r2r",
     "scipy",
-]
-test = [
     "pytest",
     "pytest-xdist[psutil]",
     "pytest-asyncio",
     "nbmake",
 ]
+test = []

 [tool.setuptools]
 package-dir = {"" = "src"}

From d30e58d0b99576b125cef884163d3677201af5b1 Mon Sep 17 00:00:00 2001
From: ahgraber
Date: Fri, 18 Jul 2025 13:12:38 -0400
Subject: [PATCH 09/11] fix: format and tests

---
 .../relationship_builders/cosine.py           | 11 ---------
 .../unit/test_cosine_relationship_builders.py | 5 ++--
 .../test_traditional_relationship_builders.py | 24 +++++++++----------
 3 files changed, 15 insertions(+), 25 deletions(-)

diff --git a/ragas/src/ragas/testset/transforms/relationship_builders/cosine.py b/ragas/src/ragas/testset/transforms/relationship_builders/cosine.py
index 3ea1b1479..55ca1ccdd 100644
--- a/ragas/src/ragas/testset/transforms/relationship_builders/cosine.py
+++ b/ragas/src/ragas/testset/transforms/relationship_builders/cosine.py
@@ -14,17 +14,6 @@ class CosineSimilarityBuilder(RelationshipBuilder):
     threshold: float = 0.9
     block_size: int = 1024

-    def _validate_embedding_shapes(self, embeddings: t.List[t.Any]):
-        if not embeddings:
-            raise ValueError(f"No nodes have a valid {self.property_name}")
-        first_len = len(embeddings[0])
-        for idx, emb in enumerate(embeddings):
-            if len(emb) != first_len:
-                raise ValueError(
-                    f"Embedding at index {idx} has length {len(emb)}, expected {first_len}. "
-                    "All embeddings must have the same length."
-                )
-
     def _block_cosine_similarity(self, i: np.ndarray, j: np.ndarray):
         """Calculate cosine similarity matrix between two sets of embeddings."""
         i_norm = i / np.linalg.norm(i, axis=1, keepdims=True)
diff --git a/ragas/tests/unit/test_cosine_relationship_builders.py b/ragas/tests/unit/test_cosine_relationship_builders.py
index 25333ed8c..23c00aeb2 100644
--- a/ragas/tests/unit/test_cosine_relationship_builders.py
+++ b/ragas/tests/unit/test_cosine_relationship_builders.py
@@ -1,6 +1,7 @@
 import asyncio
 import copy
 import random
+from typing import Optional
 from uuid import UUID

 import numpy as np
@@ -18,7 +19,7 @@ def generate_test_vectors(
     d: int = 32,
     min_similarity: float = 0.5,
     similar_fraction: float = 0.3,
-    seed: int | None = None,
+    seed: Optional[int] = None,
 ) -> np.ndarray:
     """
     Generate `n` unit vectors of dimension `d`, where at least `similar_fraction` of them
@@ -287,7 +288,7 @@ async def test_malformed_embedding_raises(self):
     async def test_cosine_similarity_builder_empty_graph(self):
         kg = KnowledgeGraph(nodes=[])
         builder = CosineSimilarityBuilder(property_name="embedding")
-        with pytest.raises(ValueError, match="No nodes have a valid embedding"):
+        with pytest.raises(ValueError):
             await builder.transform(kg)

     @pytest.mark.asyncio
diff --git a/ragas/tests/unit/test_traditional_relationship_builders.py b/ragas/tests/unit/test_traditional_relationship_builders.py
index 4bd01d7b7..6dd8f01c9 100644
--- a/ragas/tests/unit/test_traditional_relationship_builders.py
+++ b/ragas/tests/unit/test_traditional_relationship_builders.py
@@ -262,9 +262,9 @@ def test__find_similar_embedding_pairs_jaccard(n_test_sets, max_len, threshold):
     assert len(result) == len(expected)
     for i, j, similarity_float in result:
         assert i < j, "Pairs should be ordered (i < j)"
-        assert similarity_float >= threshold, (
-            f"Similarity {similarity_float} should be >= {threshold}"
-        )
+        assert (
+            similarity_float >= threshold
+        ), f"Similarity {similarity_float} should be >= {threshold}"
         for x, y, expected_similarity in expected:
             if i == x and j == y:
                 assert similarity_float == pytest.approx(expected_similarity)
@@ -277,9 +277,9 @@ async def test_no_self_similarity_relationships(self, simple_kg):
         builder = JaccardSimilarityBuilder(property_name="entities", threshold=0.1)
         relationships = await builder.transform(copy.deepcopy(simple_kg))
         for r in relationships:
-            assert r.source.id != r.target.id, (
-                "Self-relationships should not be created"
-            )
+            assert (
+                r.source.id != r.target.id
+            ), "Self-relationships should not be created"

     @pytest.mark.asyncio
     async def test_no_duplicate_relationships(self, simple_kg):
@@ -307,9 +307,9 @@ async def test_all_below_threshold(self):
         kg = KnowledgeGraph(nodes=[node1, node2])
         builder = JaccardSimilarityBuilder(property_name="entities", threshold=0.1)
         relationships = await builder.transform(kg)
-        assert len(relationships) == 0, (
-            "No relationships should be created below threshold"
-        )
+        assert (
+            len(relationships) == 0
+        ), "No relationships should be created below threshold"

     @pytest.mark.asyncio
     async def test_all_above_threshold(self):
@@ -379,9 +379,9 @@ async def test_apply_transforms_cosine_similarity_builder(self, simple_kg):
         # Should mutate kg in-place
         apply_transforms(kg, builder, run_config=RunConfig(max_workers=2))
         # Check that relationships were added
-        assert any(r.type == "jaccard_similarity" for r in kg.relationships), (
-            "No jaccard_similarity relationships found after apply_transforms"
-        )
+        assert any(
+            r.type == "jaccard_similarity" for r in kg.relationships
+        ), "No jaccard_similarity relationships found after apply_transforms"
         # Check that expected relationship exists
         assert any(
             str(r.source.id) == "f353e5c2-e432-4d1e-84a8-d750c93d4edf"

From e153e99ae10cb2bcc930ae44363fb3dd443b9b1c Mon Sep 17 00:00:00 2001
From: ahgraber
Date: Fri, 18 Jul 2025 21:02:33 -0400
Subject: [PATCH 10/11] fix: validate input parameters in generate_test_sets

- Improved logic for generating similar and dissimilar sets based on input constraints.
---
 .../test_traditional_relationship_builders.py | 176 +++++++++---------
 1 file changed, 88 insertions(+), 88 deletions(-)

diff --git a/ragas/tests/unit/test_traditional_relationship_builders.py b/ragas/tests/unit/test_traditional_relationship_builders.py
index 6dd8f01c9..81e6201b2 100644
--- a/ragas/tests/unit/test_traditional_relationship_builders.py
+++ b/ragas/tests/unit/test_traditional_relationship_builders.py
@@ -35,6 +35,11 @@ def generate_test_sets(
     - list: List of generated sets.
     """
+    if not (0 < min_similarity <= 1):
+        raise ValueError("min_similarity must be between 0 and 1.")
+    if not (0 <= similar_fraction <= 1):
+        raise ValueError("similar_fraction must be between 0 and 1.")
+
     def generate_entity(k: int = 5) -> str:
         """Generate a random entity of length k."""
         return "".join(random.choices(string.ascii_lowercase, k=k))
@@ -49,104 +54,99 @@ def jaccard(a: set[str], b: set[str]) -> float:
         # SciPy returns the Jaccard distance; similarity = 1 - distance
         return 1.0 - jaccard_dist(va, vb)

-    # bias toward shorter lengths (expovariate with λ=1.0)
-    def sample_length() -> int:
-        length = int(random.expovariate(1.0))
-        return min(length, max_len)
-
     total_pairs = n * (n - 1) // 2
-    target_similar = math.ceil(total_pairs * similar_fraction)
-
-    # Initialize all sets with random, ragged lengths
-    sets = [{generate_entity() for _ in range(sample_length())} for _ in range(n)]
-
-    # Count how many pairs are “similar” right now
-    current_similar = len(jaccard_similarity_pair(sets, min_similarity))
-
-    # Iteratively fix random non‐similar pairs until we hit target
-    max_attempts = target_similar * 10
-    attempts = 0
-
-    while current_similar < target_similar and attempts < max_attempts:
-        # pick a non‐similar pair
-        bad_pairs = [
-            (i, j)
-            for i in range(n)
-            for j in range(i + 1, n)
-            if jaccard(sets[i], sets[j]) < min_similarity
-        ]
-        if not bad_pairs:
-            break
-        i, j = random.choice(bad_pairs)
-
-        # decide new lengths
-        Li, Lj = sample_length(), sample_length()
-        # solve for needed intersection size intersection_size such that
-        # intersection_size / (Li + Lj - intersection_size) >= min_similarity
-        intersection_size = math.ceil(min_similarity * (Li + Lj) / (1 + min_similarity))
-
-        # build new similar pair
-        shared = {generate_entity() for _ in range(intersection_size)}
-        Ai = shared | {generate_entity() for _ in range(Li - intersection_size)}
-        Bj = shared | {generate_entity() for _ in range(Lj - intersection_size)}
-
-        sets[i], sets[j] = Ai, Bj
-
-        current_similar = len(jaccard_similarity_pair(sets, min_similarity))
-        attempts += 1
-    else:
+    if total_pairs == 0:
+        return [set() for _ in range(n)]
+
+    target_similar_pairs = math.ceil(total_pairs * similar_fraction)
+
+    if target_similar_pairs == 0:
+        # Generate n random, dissimilar sets
+        sets = []
+        pool = {generate_entity() for _ in range(n * max_len)}
+        for _ in range(n):
+            length = random.randint(0, max_len)
+            s = set(random.sample(list(pool), min(length, len(pool))))
+            pool -= s
+            sets.append(s)
+        random.shuffle(sets)
+        return sets
+
+    # Calculate the size of a clique of similar sets needed
+    # n_clique * (n_clique - 1) / 2 >= target_similar_pairs
+    n_clique = math.ceil((1 + math.sqrt(1 + 8 * target_similar_pairs)) / 2)
+    n_clique = min(n, n_clique)
+    n_dissimilar = n - n_clique
+
+    # To guarantee a given similarity, the size of the core set
+    # and the number of unique elements added are constrained by the max_len.
+    # We need cs + unique_per_set <= max_len.
+    # And unique_per_set is a function of cs and min_similarity.
+    core_size = math.floor((2 * max_len * min_similarity) / (1 + min_similarity))
+    if core_size == 0 and max_len > 0 and min_similarity > 0:
         raise ValueError(
-            f"Could not generate enough similar pairs after {max_attempts} attempts."
+            "Cannot generate sets with these constraints. "
+            "Try increasing max_len or decreasing min_similarity."
         )

-    # Create a core set of shared elements for similar sets
-    core_size = max(1, int(max_len * min_similarity))
-    core = {generate_entity() for _ in range(core_size)}
+    if min_similarity == 1.0:
+        max_additional_elements = 0
+    else:
+        # This is the max number of elements that can be non-core across TWO sets
+        max_additional_elements = math.floor(core_size * (1 / min_similarity - 1))

-    # Create a set of unique elements to draw from
-    base_pool = {generate_entity() for _ in range(n * max_len * 8)}
-    base_pool -= core
+    core = {generate_entity() for _ in range(core_size)}

-    n_similar = int(n * similar_fraction)
-    n_dissimilar = n - n_similar
+    # A large pool of entities to draw from
+    pool_size = (n * max_len) * 2  # just to be safe
+    pool = {generate_entity() for _ in range(pool_size)} - core

-    # Pre-calculate max add'l unique elements that can be added to core while still guaranteeing min_similarity
-    max_unique = int(core_size * ((1 - min_similarity) / min_similarity))
-    if max_unique > max_len:
-        raise ValueError(
-            "max_unique exceeds max_len, cannot guarantee min_similarity with given parameters."
-        )
-
-    # Generate similar sets
-    similar = []
-    for _ in range(n_similar):
-        # Random size for this set, at least the core size
-        set_len = core_size + random.randint(0, max_unique)
+    similar_sets = []
+    for _ in range(n_clique):
         s = core.copy()
-        # Add random elements from the base pool until we reach set_len
-        while len(s) < set_len:
-            if not base_pool:
-                raise ValueError("Base pool is empty, cannot generate more sets.")
-            element = base_pool.pop()
-            if element not in s:
-                s.add(element)
-        similar.append(s)
-
-    # Generate dissimilar sets
-    dissimilar = []
+
+        # Max unique elements per set to guarantee similarity
+        max_unique_for_set = math.floor(max_additional_elements / 2)
+        # Also respect max_len
+        max_unique_for_set = min(max_unique_for_set, max_len - core_size)
+
+        if max_unique_for_set > 0:
+            num_unique = random.randint(0, max_unique_for_set)
+            if len(pool) < num_unique:
+                # Replenish pool if needed
+                pool.update({generate_entity() for _ in range(num_unique * 2)} - core)
+            new_elements = set(random.sample(list(pool), num_unique))
+            s.update(new_elements)
+            pool -= new_elements
+        similar_sets.append(s)
+
+    # --- Generate the dissimilar sets ---
+    dissimilar_sets = []
     for _ in range(n_dissimilar):
-        set_len = random.randint(0, max_len)
-        s = set()
-        while len(s) < set_len:
-            if not base_pool:
-                raise ValueError("Base pool is empty, cannot generate more sets.")
-            element = base_pool.pop()
-            if element not in s:
-                s.add(element)
-        dissimilar.append(s)
-
-    sets = similar + dissimilar
+        length = random.randint(0, max_len)
+        length = min(length, len(pool))
+        if length > 0:
+            s = set(random.sample(list(pool), length))
+            pool -= s
+        else:
+            s = set()
+        dissimilar_sets.append(s)
+
+    sets = similar_sets + dissimilar_sets
     random.shuffle(sets)
+
+    # --- Verify the result ---
+    actual_similar_pairs = 0
+    for i in range(n):
+        for j in range(i + 1, n):
+            if jaccard(sets[i], sets[j]) >= min_similarity:
+                actual_similar_pairs += 1
+
+    assert actual_similar_pairs >= target_similar_pairs, (
+        f"Failed to generate the required number of similar pairs. "
+        f"Target: {target_similar_pairs}, Actual: {actual_similar_pairs}"
+    )
+
     return sets

From 9d9335c6fb78c86f28e919b10fdba8d3add7fd2b Mon Sep 17 00:00:00 2001
From: ahgraber
Date: Mon, 21 Jul 2025 16:35:44 -0400
Subject: [PATCH 11/11] fix: CosineSimilarityBuilder.generate_execution_plan to use filtered knowledge graph

---
 .../testset/transforms/relationship_builders/cosine.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/ragas/src/ragas/testset/transforms/relationship_builders/cosine.py b/ragas/src/ragas/testset/transforms/relationship_builders/cosine.py
index 55ca1ccdd..834b179f0 100644
--- a/ragas/src/ragas/testset/transforms/relationship_builders/cosine.py
+++ b/ragas/src/ragas/testset/transforms/relationship_builders/cosine.py
@@ -84,8 +84,10 @@ def generate_execution_plan(self, kg: KnowledgeGraph) -> t.List[t.Coroutine]:
         """
         Generates a coroutine task for finding similar embedding pairs, which can be scheduled/executed by an Executor.
         """
+        filtered_kg = self.filter(kg)
+
         embeddings = []
-        for node in kg.nodes:
+        for node in filtered_kg.nodes:
             embedding = node.get_property(self.property_name)
             if embedding is None:
                 raise ValueError(f"Node {node.id} has no {self.property_name}")
@@ -98,8 +100,8 @@ async def find_and_add_relationships():
             similar_pairs = await self._find_similar_embedding_pairs(
                 np.array(embeddings), self.threshold, self.block_size
             )
             for i, j, similarity_float in similar_pairs:
                 rel = Relationship(
-                    source=kg.nodes[i],
-                    target=kg.nodes[j],
+                    source=filtered_kg.nodes[i],
+                    target=filtered_kg.nodes[j],
                     type=self.new_property_name,
                     properties={self.new_property_name: similarity_float},
                     bidirectional=True,