worked through duplicates for KNN target node filter

lassewesth · lassewesth · commit df52d64fa7f5 · 2022-05-19T13:26:17.000+02:00
diff --git a/algo/src/main/java/org/neo4j/gds/similarity/filteredknn/TargetNodeFilter.java b/algo/src/main/java/org/neo4j/gds/similarity/filteredknn/TargetNodeFilter.java
@@ -29,9 +29,17 @@
 import java.util.stream.Stream;
 
 /**
- * We sort results by score, descending.
+ * This target node filter evaluates and stores incoming elements (neighbours) with their priority (score). We sort
+ * elements by priority, descending.
  *
- * For now a simple bounded priority queue that does _not_ handle duplicates.
+ * For now it is a simple, bounded priority queue backed by a {@link java.util.TreeSet}. We handle duplicates in the
+ * sense that an _exact_ pair of element and priority that already exists in the queue is not added twice - this happens
+ * to be the semantics of the {@link java.util.TreeSet} we use. So no duplicates in the output, even though our dear
+ * {@link org.neo4j.gds.similarity.knn.Knn} algorithm does present us with such cases.
+ *
+ * NB: this data structure does _not_ handle "re-prioritisations", i.e. a neighbour offered again with a different
+ * score: both entries are kept. Luckily we have convinced ourselves that {@link org.neo4j.gds.similarity.knn.Knn}
+ * never presents us with such cases. So this data structure suffices.
  */
 public class TargetNodeFilter implements NeighbourConsumer {
     private final TreeSet<Pair<Double, Long>> priorityQueue = new TreeSet<>(Comparator.reverseOrder());
diff --git a/algo/src/test/java/org/neo4j/gds/similarity/filteredknn/FilteredKnnTest.java b/algo/src/test/java/org/neo4j/gds/similarity/filteredknn/FilteredKnnTest.java
@@ -34,6 +34,7 @@
 
 import java.util.Comparator;
 import java.util.List;
+import java.util.Map;
 import java.util.Set;
 import java.util.stream.Collectors;
 
@@ -229,4 +230,48 @@ void shouldOnlyProduceResultsForFilteredTargetNodes() {
             ).isEqualTo(Set.of(targetNode1, targetNode2));
         }
     }
+
+    @Nested
+    class TargetNodeFilteringAndDuplicates {
+        @GdlGraph
+        private static final String DB_CYPHER =
+            "CREATE" +
+            "  (a { knn: 1.2 } )" +
+            ", (b { knn: 1.1 } )" +
+            ", (c { knn: 2.1 } )" +
+            ", (d { knn: 3.1 } )" +
+            ", (e { knn: 4.1 } )";
+
+        @Test
+        void shouldIgnoreDuplicates() {
+            var targetNode1 = idFunction.of("a");
+            var targetNode2 = idFunction.of("b");
+            var targetNode3 = idFunction.of("c");
+            var targetNode4 = idFunction.of("d");
+            var targetNode5 = idFunction.of("e");
+            var config = FilteredKnnBaseConfigImpl.builder()
+                .nodeProperties(List.of("knn"))
+                .topK(42)
+                .targetNodeFilter(List.of(targetNode1, targetNode2, targetNode3, targetNode4, targetNode5))
+                .build();
+            var knnContext = KnnContext.empty();
+            var knn = FilteredKnn.create(graph, config, knnContext);
+            var result = knn.compute();
+
+            /*
+             * We want to assert that, for each source node, the target nodes found contain no duplicates.
+             * First, group the results by source node.
+             */
+            Map<Long, List<SimilarityResult>> resultsPerSourceNode = result
+                .similarityResultStream()
+                .collect(Collectors.groupingBy(SimilarityResult::sourceNodeId));
+
+            // now, for each source node's results, check that no target node appears more than once
+            resultsPerSourceNode
+                .values()
+                .forEach(similarityResultList -> assertThat(similarityResultList
+                    .stream()
+                    .mapToLong(SimilarityResult::targetNodeId)).doesNotHaveDuplicates());
+        }
+    }
 }
diff --git a/algo/src/test/java/org/neo4j/gds/similarity/filteredknn/TargetNodeFilterTest.java b/algo/src/test/java/org/neo4j/gds/similarity/filteredknn/TargetNodeFilterTest.java
@@ -64,4 +64,40 @@ void shouldOnlyIncludeTargetNodes() {
 
         assertThat(consumer.asSimilarityStream(117)).isEmpty();
     }
+
+    @Test
+    void shouldIgnoreExactDuplicates() {
+        TargetNodeFilter consumer = new TargetNodeFilter(l -> true, 4);
+
+        consumer.offer(23, 3.14);
+        consumer.offer(42, 1.61);
+        consumer.offer(87, 2.71);
+        consumer.offer(42, 1.61);
+
+        assertThat(consumer.asSimilarityStream(117)).containsExactly(
+            new SimilarityResult(117, 23, 3.14),
+            new SimilarityResult(117, 87, 2.71),
+            new SimilarityResult(117, 42, 1.61)
+        );
+    }
+
+    /**
+     * This documents a fact rather than something desirable: an element re-offered with a new priority is kept twice.
+     */
+    @Test
+    void shouldAllowDuplicateElementsWithNewPriorities() {
+        TargetNodeFilter consumer = new TargetNodeFilter(l -> true, 4);
+
+        consumer.offer(23, 3.14);
+        consumer.offer(42, 1.61);
+        consumer.offer(87, 2.71);
+        consumer.offer(42, 1.41);
+
+        assertThat(consumer.asSimilarityStream(117)).containsExactly(
+            new SimilarityResult(117, 23, 3.14),
+            new SimilarityResult(117, 87, 2.71),
+            new SimilarityResult(117, 42, 1.61),
+            new SimilarityResult(117, 42, 1.41)
+        );
+    }
 }