diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index f9f036eb1040..7ad0c916172a 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -111,6 +111,9 @@ Optimizations
* GITHUB#14609: Optimizes PointRangeQuery by rewriting to MatchAllDocsQuery/FieldExistsQuery/MatchNoDocsQuery if all
docs in index are contained or excluded (Elliott Bradshaw)
+* GITHUB#14679: Optimize exhaustive evaluation of disjunctive queries.
+ (Adrien Grand)
+
Bug Fixes
---------------------
(No changes)
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene103/Lucene103PostingsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/Lucene103PostingsReader.java
index 3ecaddef6174..fac90c25da32 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene103/Lucene103PostingsReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/Lucene103PostingsReader.java
@@ -46,6 +46,7 @@
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.internal.vectorization.PostingDecodingUtil;
import org.apache.lucene.internal.vectorization.VectorizationProvider;
+import org.apache.lucene.search.DocAndFreqBuffer;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.DataInput;
@@ -1034,6 +1035,50 @@ public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOExcept
}
}
+ @Override
+ public void nextPostings(int upTo, DocAndFreqBuffer buffer) throws IOException {
+ assert needsRefilling == false;
+
+ if (needsFreq == false) {
+ super.nextPostings(upTo, buffer);
+ return;
+ }
+
+ buffer.size = 0;
+ if (doc >= upTo) {
+ return;
+ }
+
+ // Only return docs from the current block
+ buffer.growNoCopy(BLOCK_SIZE);
+ upTo = (int) Math.min(upTo, level0LastDocID + 1L);
+
+ // Frequencies are decoded lazily, calling freq() makes sure that the freq block is decoded
+ freq();
+
+ int start = docBufferUpto - 1;
+ buffer.size = 0;
+ switch (encoding) {
+ case PACKED:
+ int end = computeBufferEndBoundary(upTo);
+ buffer.size = end - start;
+ System.arraycopy(docBuffer, start, buffer.docs, 0, buffer.size);
+ break;
+ case UNARY:
+ docBitSet.forEach(
+ doc - docBitSetBase,
+ upTo - docBitSetBase,
+ docBitSetBase,
+ d -> buffer.docs[buffer.size++] = d);
+ break;
+ }
+
+ assert buffer.size > 0;
+ System.arraycopy(freqBuffer, start, buffer.freqs, 0, buffer.size);
+
+ advance(upTo);
+ }
+
private int computeBufferEndBoundary(int upTo) {
if (docBufferSize != 0 && docBuffer[docBufferSize - 1] < upTo) {
// All docs in the buffer are under upTo
diff --git a/lucene/core/src/java/org/apache/lucene/index/CompositeReader.java b/lucene/core/src/java/org/apache/lucene/index/CompositeReader.java
index 4b35ba6cd7c9..ccc454c59944 100644
--- a/lucene/core/src/java/org/apache/lucene/index/CompositeReader.java
+++ b/lucene/core/src/java/org/apache/lucene/index/CompositeReader.java
@@ -44,7 +44,7 @@
* synchronization, you should not synchronize on the IndexReader
instance; use
* your own (non-Lucene) objects instead.
*/
-public abstract non-sealed class CompositeReader extends IndexReader {
+public abstract class CompositeReader extends IndexReader {
private volatile CompositeReaderContext readerContext = null; // lazy init
diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexReader.java b/lucene/core/src/java/org/apache/lucene/index/IndexReader.java
index 8e965ee8099c..e23efc23ece2 100644
--- a/lucene/core/src/java/org/apache/lucene/index/IndexReader.java
+++ b/lucene/core/src/java/org/apache/lucene/index/IndexReader.java
@@ -63,7 +63,7 @@
* synchronization, you should not synchronize on the IndexReader
instance; use
* your own (non-Lucene) objects instead.
*/
-public abstract sealed class IndexReader implements Closeable permits CompositeReader, LeafReader {
+public abstract class IndexReader implements Closeable {
private boolean closed = false;
private boolean closedByChild = false;
diff --git a/lucene/core/src/java/org/apache/lucene/index/LeafReader.java b/lucene/core/src/java/org/apache/lucene/index/LeafReader.java
index 0f39d1ae1e8d..eea6c317b9a7 100644
--- a/lucene/core/src/java/org/apache/lucene/index/LeafReader.java
+++ b/lucene/core/src/java/org/apache/lucene/index/LeafReader.java
@@ -44,7 +44,7 @@
* synchronization, you should not synchronize on the IndexReader
instance; use
* your own (non-Lucene) objects instead.
*/
-public abstract non-sealed class LeafReader extends IndexReader {
+public abstract class LeafReader extends IndexReader {
private final LeafReaderContext readerContext = new LeafReaderContext(this);
diff --git a/lucene/core/src/java/org/apache/lucene/index/PostingsEnum.java b/lucene/core/src/java/org/apache/lucene/index/PostingsEnum.java
index 2cb0092aaa49..8e82e0a6d696 100644
--- a/lucene/core/src/java/org/apache/lucene/index/PostingsEnum.java
+++ b/lucene/core/src/java/org/apache/lucene/index/PostingsEnum.java
@@ -17,6 +17,7 @@
package org.apache.lucene.index;
import java.io.IOException;
+import org.apache.lucene.search.DocAndFreqBuffer;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;
@@ -97,4 +98,48 @@ protected PostingsEnum() {}
* anything (neither members of the returned BytesRef nor bytes in the byte[]).
*/
public abstract BytesRef getPayload() throws IOException;
+
+ /**
+ * Fill a buffer of doc IDs and frequencies with some number of doc IDs and their corresponding
+ * frequencies, starting at the current doc ID, and ending before {@code upTo}. Because it starts
+ * on the current doc ID, it is illegal to call this method if the {@link #docID() current doc ID}
+ * is {@code -1}.
+ *
+ *
+ * <p>An empty buffer after this method returns indicates that there are no postings left between
+ * the current doc ID and {@code upTo}.
+ *
+ * <p>Implementations should ideally fill the buffer with a number of entries comprised between 8
+ * and a couple hundreds, to keep heap requirements contained, while still being large enough to
+ * enable operations on the buffer to auto-vectorize efficiently.
+ *
+ * <p>The default implementation is provided below:
+ *
+ * <pre class="prettyprint">
+ * int batchSize = 16; // arbitrary
+ * buffer.growNoCopy(batchSize);
+ * int size = 0;
+ * for (int doc = docID(); doc &lt; upTo &amp;&amp; size &lt; batchSize; doc = nextDoc()) {
+ *   buffer.docs[size] = doc;
+ *   buffer.freqs[size] = freq();
+ *   ++size;
+ * }
+ * buffer.size = size;
+ * </pre>
+ *
+ * <p><b>NOTE</b>: The provided {@link DocAndFreqBuffer} should not hold references to internal
+ * data structures.
+ *
+ * @lucene.internal
+ */
+ public void nextPostings(int upTo, DocAndFreqBuffer buffer) throws IOException {
+ int batchSize = 16; // arbitrary
+ buffer.growNoCopy(batchSize);
+ int size = 0;
+ for (int doc = docID(); doc < upTo && size < batchSize; doc = nextDoc()) {
+ buffer.docs[size] = doc;
+ buffer.freqs[size] = freq();
+ ++size;
+ }
+ buffer.size = size;
+ }
}
diff --git a/lucene/core/src/java/org/apache/lucene/search/BlockMaxConjunctionBulkScorer.java b/lucene/core/src/java/org/apache/lucene/search/BlockMaxConjunctionBulkScorer.java
index fd89153ffe6d..09910c752ed3 100644
--- a/lucene/core/src/java/org/apache/lucene/search/BlockMaxConjunctionBulkScorer.java
+++ b/lucene/core/src/java/org/apache/lucene/search/BlockMaxConjunctionBulkScorer.java
@@ -22,7 +22,6 @@
import java.util.List;
import org.apache.lucene.search.Weight.DefaultBulkScorer;
import org.apache.lucene.util.Bits;
-import org.apache.lucene.util.MathUtil;
/**
* BulkScorer implementation of {@link BlockMaxConjunctionScorer} that focuses on top-level
@@ -36,13 +35,13 @@
final class BlockMaxConjunctionBulkScorer extends BulkScorer {
private final Scorer[] scorers;
- private final Scorable[] scorables;
private final DocIdSetIterator[] iterators;
- private final DocIdSetIterator lead1, lead2;
- private final Scorable scorer1, scorer2;
+ private final DocIdSetIterator lead1;
private final DocAndScore scorable = new DocAndScore();
private final double[] sumOfOtherClauses;
private final int maxDoc;
+ private final DocAndScoreBuffer docAndScoreBuffer = new DocAndScoreBuffer();
+ private final DocAndScoreAccBuffer docAndScoreAccBuffer = new DocAndScoreAccBuffer();
BlockMaxConjunctionBulkScorer(int maxDoc, List An empty return value indicates that there are no postings left between the current doc ID
+ * and {@code upTo}.
+ *
+ * <p>Implementations should ideally fill the buffer with a number of entries comprised between 8
+ * and a couple hundreds, to keep heap requirements contained, while still being large enough to
+ * enable operations on the buffer to auto-vectorize efficiently.
+ *
+ * <p>The default implementation is provided below:
+ *
+ * <p><b>NOTE</b>: The provided {@link DocAndScoreBuffer} should not hold references to internal
+ * data structures.
+ *
+ * <p><b>NOTE</b>: In case this {@link Scorer} exposes a {@link #twoPhaseIterator()
+ * TwoPhaseIterator}, it should be positioned on a matching document before this method is called.
+ *
+ * @lucene.internal
+ */
+ public void nextDocsAndScores(int upTo, Bits liveDocs, DocAndScoreBuffer buffer)
+ throws IOException {
+ int batchSize = 16; // arbitrary
+ buffer.growNoCopy(batchSize);
+ int size = 0;
+ DocIdSetIterator iterator = iterator();
+ for (int doc = docID(); doc < upTo && size < batchSize; doc = iterator.nextDoc()) {
+ if (liveDocs == null || liveDocs.get(doc)) {
+ buffer.docs[size] = doc;
+ buffer.scores[size] = score();
+ ++size;
+ }
+ }
+ buffer.size = size;
+ }
+
+ /**
+ * Apply this {@link Scorer} as a required clause on the given {@link DocAndScoreAccBuffer}. This
+ * filters out documents from the buffer that do not match this scorer, and adds the scores of
+ * this {@link Scorer} to the scores.
+ */
+ public void applyAsRequiredClause(DocAndScoreAccBuffer buffer) throws IOException {
+ DocIdSetIterator iterator = iterator();
+ int intersectionSize = 0;
+ int curDoc = iterator.docID();
+ for (int i = 0; i < buffer.size; ++i) {
+ int targetDoc = buffer.docs[i];
+ if (curDoc < targetDoc) {
+ curDoc = iterator.advance(targetDoc);
+ }
+ if (curDoc == targetDoc) {
+ buffer.docs[intersectionSize] = targetDoc;
+ buffer.scores[intersectionSize] = buffer.scores[i] + score();
+ intersectionSize++;
+ }
+ }
+ buffer.size = intersectionSize;
+ }
}
diff --git a/lucene/core/src/java/org/apache/lucene/search/ScorerUtil.java b/lucene/core/src/java/org/apache/lucene/search/ScorerUtil.java
index a08b36413a58..881ad4ed6609 100644
--- a/lucene/core/src/java/org/apache/lucene/search/ScorerUtil.java
+++ b/lucene/core/src/java/org/apache/lucene/search/ScorerUtil.java
@@ -22,6 +22,7 @@
import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.MathUtil;
import org.apache.lucene.util.PriorityQueue;
/** Util class for Scorer related methods */
@@ -117,4 +118,22 @@ public int length() {
return in.length();
}
}
+
+ static void filterCompetitiveHits(
+ DocAndScoreAccBuffer buffer,
+ double maxRemainingScore,
+ float minCompetitiveScore,
+ int numScorers) {
+ int newSize = 0;
+ for (int i = 0; i < buffer.size; ++i) {
+ float maxPossibleScore =
+ (float) MathUtil.sumUpperBound(buffer.scores[i] + maxRemainingScore, numScorers);
+ if (maxPossibleScore >= minCompetitiveScore) {
+ buffer.docs[newSize] = buffer.docs[i];
+ buffer.scores[newSize] = buffer.scores[i];
+ newSize++;
+ }
+ }
+ buffer.size = newSize;
+ }
}
diff --git a/lucene/core/src/java/org/apache/lucene/search/TermScorer.java b/lucene/core/src/java/org/apache/lucene/search/TermScorer.java
index 4b53788f233e..47d9eae80b6c 100644
--- a/lucene/core/src/java/org/apache/lucene/search/TermScorer.java
+++ b/lucene/core/src/java/org/apache/lucene/search/TermScorer.java
@@ -17,11 +17,16 @@
package org.apache.lucene.search;
import java.io.IOException;
+import java.util.Arrays;
import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SlowImpactsEnum;
import org.apache.lucene.search.similarities.Similarity.SimScorer;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.LongsRef;
/**
* Expert: A
+ * int batchSize = 16; // arbitrary
+ * buffer.growNoCopy(batchSize);
+ * int size = 0;
+ * DocIdSetIterator iterator = iterator();
+ * for (int doc = docID(); doc < upTo && size < batchSize; doc = iterator.nextDoc()) {
+ * if (liveDocs == null || liveDocs.get(doc)) {
+ * buffer.docs[size] = doc;
+ * buffer.scores[size] = score();
+ * ++size;
+ * }
+ * }
+ * buffer.size = size;
+ *
+ *
+ * Scorer
for documents matching a Term
.
@@ -35,6 +40,9 @@ public final class TermScorer extends Scorer {
private final NumericDocValues norms;
private final ImpactsDISI impactsDisi;
private final MaxScoreCache maxScoreCache;
+ private DocAndFreqBuffer docAndFreqBuffer;
+ private int[] freqs = IntsRef.EMPTY_INTS;
+ private long[] normValues = LongsRef.EMPTY_LONGS;
/** Construct a {@link TermScorer} that will iterate all documents. */
public TermScorer(PostingsEnum postingsEnum, SimScorer scorer, NumericDocValues norms) {
@@ -120,4 +128,81 @@ public void setMinCompetitiveScore(float minScore) {
impactsDisi.setMinCompetitiveScore(minScore);
}
}
+
+ @Override
+ public void nextDocsAndScores(int upTo, Bits liveDocs, DocAndScoreBuffer buffer)
+ throws IOException {
+ if (docAndFreqBuffer == null) {
+ docAndFreqBuffer = new DocAndFreqBuffer();
+ }
+
+ for (; ; ) {
+ postingsEnum.nextPostings(upTo, docAndFreqBuffer);
+ if (liveDocs != null && docAndFreqBuffer.size != 0) {
+ // An empty return value indicates that there are no more docs before upTo. We may be
+ // unlucky, and there are docs left, but all docs from the current batch happen to be marked
+ // as deleted. So we need to iterate until we find a batch that has at least one non-deleted
+ // doc.
+ docAndFreqBuffer.apply(liveDocs);
+ if (docAndFreqBuffer.size == 0) {
+ continue;
+ }
+ }
+ break;
+ }
+
+ int size = docAndFreqBuffer.size;
+ normValues = ArrayUtil.grow(normValues, size);
+ if (norms == null) {
+ Arrays.fill(normValues, 0, size, 1L);
+ } else {
+ for (int i = 0; i < size; ++i) {
+ if (norms.advanceExact(docAndFreqBuffer.docs[i])) {
+ normValues[i] = norms.longValue();
+ } else {
+ normValues[i] = 1L;
+ }
+ }
+ }
+
+ buffer.growNoCopy(size);
+ buffer.size = size;
+ System.arraycopy(docAndFreqBuffer.docs, 0, buffer.docs, 0, size);
+ for (int i = 0; i < size; ++i) {
+ // Unless SimScorer#score is megamorphic, SimScorer#score should inline and (part of) score
+ // computations should auto-vectorize.
+ buffer.scores[i] = scorer.score(docAndFreqBuffer.freqs[i], normValues[i]);
+ }
+ }
+
+ @Override
+ public void applyAsRequiredClause(DocAndScoreAccBuffer buffer) throws IOException {
+ freqs = ArrayUtil.growNoCopy(freqs, buffer.size);
+ normValues = ArrayUtil.growNoCopy(normValues, buffer.size);
+
+ int intersectionSize = 0;
+ int curDoc = iterator.docID();
+ for (int i = 0; i < buffer.size; ++i) {
+ int targetDoc = buffer.docs[i];
+ if (curDoc < targetDoc) {
+ curDoc = iterator.advance(targetDoc);
+ }
+ if (curDoc == targetDoc) {
+ buffer.docs[intersectionSize] = targetDoc;
+ buffer.scores[intersectionSize] = buffer.scores[i];
+ freqs[intersectionSize] = postingsEnum.freq();
+ if (norms == null || norms.advanceExact(targetDoc) == false) {
+ normValues[intersectionSize] = 1L;
+ } else {
+ normValues[intersectionSize] = norms.longValue();
+ }
+ intersectionSize++;
+ }
+ }
+
+ buffer.size = intersectionSize;
+ for (int i = 0; i < intersectionSize; ++i) {
+ buffer.scores[i] += scorer.score(freqs[i], normValues[i]);
+ }
+ }
}
diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/AssertingLeafReader.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/AssertingLeafReader.java
index c56f880c0893..cb9d83e8ac39 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/AssertingLeafReader.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/AssertingLeafReader.java
@@ -50,6 +50,7 @@
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.internal.tests.IndexPackageAccess;
import org.apache.lucene.internal.tests.TestSecrets;
+import org.apache.lucene.search.DocAndFreqBuffer;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
@@ -578,6 +579,20 @@ public BytesRef getPayload() throws IOException {
return payload;
}
+ @Override
+ public void nextPostings(int upTo, DocAndFreqBuffer buffer) throws IOException {
+ assert state != DocsEnumState.START : "nextPostings() called before nextDoc()/advance()";
+ in.nextPostings(upTo, buffer);
+ doc = in.docID();
+ if (doc == DocIdSetIterator.NO_MORE_DOCS) {
+ state = DocsEnumState.FINISHED;
+ positionMax = 0;
+ } else {
+ state = DocsEnumState.ITERATING;
+ positionMax = super.freq();
+ }
+ }
+
void reset() {
state = DocsEnumState.START;
doc = in.docID();
diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/search/AssertingScorer.java b/lucene/test-framework/src/java/org/apache/lucene/tests/search/AssertingScorer.java
index 0e9d8bfa9ba9..e0819a069c86 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/tests/search/AssertingScorer.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/tests/search/AssertingScorer.java
@@ -20,11 +20,14 @@
import java.util.Collection;
import java.util.Collections;
import java.util.Random;
+import org.apache.lucene.search.DocAndScoreAccBuffer;
+import org.apache.lucene.search.DocAndScoreBuffer;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.FilterDocIdSetIterator;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TwoPhaseIterator;
+import org.apache.lucene.util.Bits;
import org.apache.lucene.util.FixedBitSet;
/** Wraps a Scorer with additional checks */
@@ -278,4 +281,32 @@ public String toString() {
}
};
}
+
+ @Override
+ public void nextDocsAndScores(int upTo, Bits liveDocs, DocAndScoreBuffer buffer)
+ throws IOException {
+ assert doc != -1;
+ in.nextDocsAndScores(upTo, liveDocs, buffer);
+ if (doc != in.iterator().docID()) {
+ doc = in.iterator().docID();
+ if (doc == DocIdSetIterator.NO_MORE_DOCS) {
+ state = IteratorState.FINISHED;
+ } else {
+ state = IteratorState.ITERATING;
+ }
+ }
+ }
+
+ @Override
+ public void applyAsRequiredClause(DocAndScoreAccBuffer buffer) throws IOException {
+ in.applyAsRequiredClause(buffer);
+ if (doc != in.iterator().docID()) {
+ doc = in.iterator().docID();
+ if (doc == DocIdSetIterator.NO_MORE_DOCS) {
+ state = IteratorState.FINISHED;
+ } else {
+ state = IteratorState.ITERATING;
+ }
+ }
+ }
}