Skip to content

Commit df52d64

Browse files
committed
worked through duplicates for KNN target node filter
1 parent 4cde7e2 commit df52d64

File tree

3 files changed

+91
-2
lines changed

3 files changed

+91
-2
lines changed

algo/src/main/java/org/neo4j/gds/similarity/filteredknn/TargetNodeFilter.java

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,17 @@
2929
import java.util.stream.Stream;
3030

3131
/**
32-
* We sort results by score, descending.
32+
* This target node filter evaluates and stores incoming elements (neighbours) with their priority (score). We sort
33+
* elements by priority, descending.
3334
*
34-
* For now a simple bounded priority queue that does _not_ handle duplicates.
35+
* For now it is a simple, bounded priority queue backed by a {@link java.util.TreeSet}. We handle duplicates, in the
36+
* sense that _exact_ pairs of element and priority, that already exist in the queue, are not added twice - this happens
37+
* to be the semantics of the {@link java.util.TreeSet} we use. So no duplicates in the output, even though our dear
38+
* {@link org.neo4j.gds.similarity.knn.Knn} algorithm does present us with such cases.
39+
*
40+
* NB: this data structure would _not_ handle "re-prioritisations" like a neighbour with a different score. Luckily we
41+
* have convinced ourselves that {@link org.neo4j.gds.similarity.knn.Knn} never presents us with such cases. So this
42+
* data structure suffices.
3543
*/
3644
public class TargetNodeFilter implements NeighbourConsumer {
3745
private final TreeSet<Pair<Double, Long>> priorityQueue = new TreeSet<>(Comparator.reverseOrder());

algo/src/test/java/org/neo4j/gds/similarity/filteredknn/FilteredKnnTest.java

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434

3535
import java.util.Comparator;
3636
import java.util.List;
37+
import java.util.Map;
3738
import java.util.Set;
3839
import java.util.stream.Collectors;
3940

@@ -229,4 +230,48 @@ void shouldOnlyProduceResultsForFilteredTargetNodes() {
229230
).isEqualTo(Set.of(targetNode1, targetNode2));
230231
}
231232
}
233+
234+
@Nested
235+
class TargetNodeFilteringAndDuplicates {
236+
@GdlGraph
237+
private static final String DB_CYPHER =
238+
"CREATE" +
239+
" (a { knn: 1.2 } )" +
240+
", (b { knn: 1.1 } )" +
241+
", (c { knn: 2.1 } )" +
242+
", (d { knn: 3.1 } )" +
243+
", (e { knn: 4.1 } )";
244+
245+
@Test
246+
void shouldIgnoreDuplicates() {
247+
var targetNode1 = idFunction.of("a");
248+
var targetNode2 = idFunction.of("b");
249+
var targetNode3 = idFunction.of("c");
250+
var targetNode4 = idFunction.of("d");
251+
var targetNode5 = idFunction.of("e");
252+
var config = FilteredKnnBaseConfigImpl.builder()
253+
.nodeProperties(List.of("knn"))
254+
.topK(42)
255+
.targetNodeFilter(List.of(targetNode1, targetNode2, targetNode3, targetNode4, targetNode5))
256+
.build();
257+
var knnContext = KnnContext.empty();
258+
var knn = FilteredKnn.create(graph, config, knnContext);
259+
var result = knn.compute();
260+
261+
/*
262+
* Ok we want to express that, for each source node, the target nodes found have no duplicates.
263+
* First, group the results
264+
*/
265+
Map<Long, List<SimilarityResult>> resultsPerSourceNode = result
266+
.similarityResultStream()
267+
.collect(Collectors.groupingBy(SimilarityResult::sourceNodeId));
268+
269+
// now for each result, see that there are no duplicates
270+
resultsPerSourceNode
271+
.values()
272+
.forEach(similarityResultList -> assertThat(similarityResultList
273+
.stream()
274+
.mapToLong(SimilarityResult::targetNodeId)).doesNotHaveDuplicates());
275+
}
276+
}
232277
}

algo/src/test/java/org/neo4j/gds/similarity/filteredknn/TargetNodeFilterTest.java

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,4 +64,40 @@ void shouldOnlyIncludeTargetNodes() {
6464

6565
assertThat(consumer.asSimilarityStream(117)).isEmpty();
6666
}
67+
68+
@Test
69+
void shouldIgnoreExactDuplicates() {
70+
TargetNodeFilter consumer = new TargetNodeFilter(l -> true, 4);
71+
72+
consumer.offer(23, 3.14);
73+
consumer.offer(42, 1.61);
74+
consumer.offer(87, 2.71);
75+
consumer.offer(42, 1.61);
76+
77+
assertThat(consumer.asSimilarityStream(117)).containsExactly(
78+
new SimilarityResult(117, 23, 3.14),
79+
new SimilarityResult(117, 87, 2.71),
80+
new SimilarityResult(117, 42, 1.61)
81+
);
82+
}
83+
84+
/**
85+
* This is documenting a fact rather than illustrating something desirable.
86+
*/
87+
@Test
88+
void shouldAllowDuplicateElementsWithNewPriorities() {
89+
TargetNodeFilter consumer = new TargetNodeFilter(l -> true, 4);
90+
91+
consumer.offer(23, 3.14);
92+
consumer.offer(42, 1.61);
93+
consumer.offer(87, 2.71);
94+
consumer.offer(42, 1.41);
95+
96+
assertThat(consumer.asSimilarityStream(117)).containsExactly(
97+
new SimilarityResult(117, 23, 3.14),
98+
new SimilarityResult(117, 87, 2.71),
99+
new SimilarityResult(117, 42, 1.61),
100+
new SimilarityResult(117, 42, 1.41)
101+
);
102+
}
67103
}

0 commit comments

Comments
 (0)