3 changes: 3 additions & 0 deletions lucene/CHANGES.txt
@@ -111,6 +111,9 @@ Optimizations
* GITHUB#14609: Optimizes PointRangeQuery by rewriting to MatchAllDocsQuery/FieldExistsQuery/MatchNoDocsQuery if all
docs in index are contained or excluded (Elliott Bradshaw)

* GITHUB#14679: Optimize exhaustive evaluation of disjunctive queries.
(Adrien Grand)

Bug Fixes
---------------------
(No changes)
lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java
@@ -46,6 +46,7 @@
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.internal.vectorization.PostingDecodingUtil;
import org.apache.lucene.internal.vectorization.VectorizationProvider;
import org.apache.lucene.search.DocAndFreqBuffer;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.DataInput;
@@ -1034,6 +1035,50 @@ public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOException
}
}

@Override
public void nextPostings(int upTo, DocAndFreqBuffer buffer) throws IOException {
assert needsRefilling == false;

if (needsFreq == false) {
super.nextPostings(upTo, buffer);
return;
}

buffer.size = 0;
if (doc >= upTo) {
return;
}

// Only return docs from the current block
buffer.growNoCopy(BLOCK_SIZE);
upTo = (int) Math.min(upTo, level0LastDocID + 1L);

// Frequencies are decoded lazily; calling freq() makes sure that the freq block is decoded
freq();

int start = docBufferUpto - 1;
buffer.size = 0;
switch (encoding) {
case PACKED:
int end = computeBufferEndBoundary(upTo);
buffer.size = end - start;
System.arraycopy(docBuffer, start, buffer.docs, 0, buffer.size);
break;
case UNARY:
docBitSet.forEach(
doc - docBitSetBase,
upTo - docBitSetBase,
docBitSetBase,
d -> buffer.docs[buffer.size++] = d);
break;
}

assert buffer.size > 0;
System.arraycopy(freqBuffer, start, buffer.freqs, 0, buffer.size);

advance(upTo);
}

private int computeBufferEndBoundary(int upTo) {
if (docBufferSize != 0 && docBuffer[docBufferSize - 1] < upTo) {
// All docs in the buffer are under upTo
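For context, the nextPostings override above returns at most one block of postings per call (the buffer is sized to BLOCK_SIZE and upTo is clamped to the end of the current block), so callers are expected to loop until the buffer comes back empty. Below is a minimal, hypothetical caller that drains a positioned enum through the new API; the helper name and the use of NO_MORE_DOCS as the upper bound are illustrative assumptions, not code from this PR:

import java.io.IOException;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.search.DocAndFreqBuffer;
import org.apache.lucene.search.DocIdSetIterator;

// Sketch only: sum a term's frequencies by draining the enum batch by batch.
final class NextPostingsExample {
  static long sumFreqs(PostingsEnum postings) throws IOException {
    if (postings.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
      return 0; // nextPostings must not be called while docID() is -1
    }
    DocAndFreqBuffer buffer = new DocAndFreqBuffer();
    long sum = 0;
    // An empty buffer signals that no postings remain before upTo.
    for (postings.nextPostings(DocIdSetIterator.NO_MORE_DOCS, buffer);
        buffer.size > 0;
        postings.nextPostings(DocIdSetIterator.NO_MORE_DOCS, buffer)) {
      for (int i = 0; i < buffer.size; ++i) {
        sum += buffer.freqs[i];
      }
    }
    return sum;
  }
}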
lucene/core/src/java/org/apache/lucene/index/CompositeReader.java
@@ -44,7 +44,7 @@
* synchronization, you should <b>not</b> synchronize on the <code>IndexReader</code> instance; use
* your own (non-Lucene) objects instead.
*/
public abstract non-sealed class CompositeReader extends IndexReader {
public abstract class CompositeReader extends IndexReader {

private volatile CompositeReaderContext readerContext = null; // lazy init

lucene/core/src/java/org/apache/lucene/index/IndexReader.java
@@ -63,7 +63,7 @@
* synchronization, you should <b>not</b> synchronize on the <code>IndexReader</code> instance; use
* your own (non-Lucene) objects instead.
*/
public abstract sealed class IndexReader implements Closeable permits CompositeReader, LeafReader {
public abstract class IndexReader implements Closeable {

private boolean closed = false;
private boolean closedByChild = false;
lucene/core/src/java/org/apache/lucene/index/LeafReader.java
@@ -44,7 +44,7 @@
* synchronization, you should <b>not</b> synchronize on the <code>IndexReader</code> instance; use
* your own (non-Lucene) objects instead.
*/
public abstract non-sealed class LeafReader extends IndexReader {
public abstract class LeafReader extends IndexReader {

private final LeafReaderContext readerContext = new LeafReaderContext(this);

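The three changes above drop the sealed/non-sealed modifiers from the IndexReader hierarchy, so the base class may again be extended outside the previously permitted CompositeReader/LeafReader pair. A toy illustration (not Lucene code) of what sealing enforces and what removing it re-enables:

// Toy example, unrelated to Lucene's actual classes (Java 17+ syntax).
abstract sealed class Base permits A, B {} // only A and B may extend Base
final class A extends Base {}
final class B extends Base {}
// final class C extends Base {}          // compile error while Base is sealed

abstract class OpenBase {}                 // after dropping `sealed`...
final class C extends OpenBase {}          // ...any class may extend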
45 changes: 45 additions & 0 deletions lucene/core/src/java/org/apache/lucene/index/PostingsEnum.java
@@ -17,6 +17,7 @@
package org.apache.lucene.index;

import java.io.IOException;
import org.apache.lucene.search.DocAndFreqBuffer;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;

@@ -97,4 +98,48 @@ protected PostingsEnum() {}
* anything (neither members of the returned BytesRef nor bytes in the byte[]).
*/
public abstract BytesRef getPayload() throws IOException;

/**
* Fill the given buffer with some number of doc IDs and their corresponding frequencies,
* starting at the current doc ID, and ending before {@code upTo}. Because it starts
* on the current doc ID, it is illegal to call this method if the {@link #docID() current doc ID}
* is {@code -1}.
*
* <p>An empty buffer after this method returns indicates that there are no postings left between
* the current doc ID and {@code upTo}.
*
* <p>Implementations should ideally fill the buffer with between 8 and a couple hundred
* entries, to keep heap requirements contained while still being large enough for operations
* on the buffer to auto-vectorize efficiently.
*
* <p>The default implementation is provided below:
*
* <pre class="prettyprint">
* int batchSize = 16; // arbitrary
* buffer.growNoCopy(batchSize);
* int size = 0;
* for (int doc = docID(); doc &lt; upTo &amp;&amp; size &lt; batchSize; doc = nextDoc()) {
* buffer.docs[size] = doc;
* buffer.freqs[size] = freq();
* ++size;
* }
* buffer.size = size;
* </pre>
*
* <p><b>NOTE</b>: The provided {@link DocAndFreqBuffer} should not hold references to the
* enum's internal data structures.
*
* @lucene.internal
*/
public void nextPostings(int upTo, DocAndFreqBuffer buffer) throws IOException {
int batchSize = 16; // arbitrary
buffer.growNoCopy(batchSize);
int size = 0;
for (int doc = docID(); doc < upTo && size < batchSize; doc = nextDoc()) {
buffer.docs[size] = doc;
buffer.freqs[size] = freq();
++size;
}
buffer.size = size;
}
}
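The new PostingsEnum API above writes into a DocAndFreqBuffer, whose source this diff does not include. Inferred purely from the usages shown here (docs, freqs, size, growNoCopy), it presumably looks roughly like the following sketch; the real class lives in org.apache.lucene.search and its actual implementation may differ:

package org.apache.lucene.search;

// Hedged sketch of DocAndFreqBuffer, reconstructed from its call sites in this PR.
public final class DocAndFreqBuffer {
  public int[] docs = new int[0];  // doc IDs, parallel to freqs
  public int[] freqs = new int[0]; // term frequency for each doc
  public int size;                 // number of valid entries

  // Resize both arrays to at least minSize, discarding any previous content.
  public void growNoCopy(int minSize) {
    if (docs.length < minSize) {
      docs = new int[minSize];
      freqs = new int[minSize];
    }
  }
}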
lucene/core/src/java/org/apache/lucene/search/BlockMaxConjunctionBulkScorer.java
@@ -22,7 +22,6 @@
import java.util.List;
import org.apache.lucene.search.Weight.DefaultBulkScorer;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.MathUtil;

/**
* BulkScorer implementation of {@link BlockMaxConjunctionScorer} that focuses on top-level
@@ -36,28 +35,23 @@
final class BlockMaxConjunctionBulkScorer extends BulkScorer {

private final Scorer[] scorers;
private final Scorable[] scorables;
private final DocIdSetIterator[] iterators;
private final DocIdSetIterator lead1, lead2;
private final Scorable scorer1, scorer2;
private final DocIdSetIterator lead1;
private final DocAndScore scorable = new DocAndScore();
private final double[] sumOfOtherClauses;
private final int maxDoc;
private final DocAndScoreBuffer docAndScoreBuffer = new DocAndScoreBuffer();
private final DocAndScoreAccBuffer docAndScoreAccBuffer = new DocAndScoreAccBuffer();

BlockMaxConjunctionBulkScorer(int maxDoc, List<Scorer> scorers) throws IOException {
if (scorers.size() <= 1) {
throw new IllegalArgumentException("Expected 2 or more scorers, got " + scorers.size());
}
this.scorers = scorers.toArray(Scorer[]::new);
Arrays.sort(this.scorers, Comparator.comparingLong(scorer -> scorer.iterator().cost()));
this.scorables =
Arrays.stream(this.scorers).map(ScorerUtil::likelyTermScorer).toArray(Scorable[]::new);
this.iterators =
Arrays.stream(this.scorers).map(Scorer::iterator).toArray(DocIdSetIterator[]::new);
lead1 = ScorerUtil.likelyImpactsEnum(iterators[0]);
lead2 = ScorerUtil.likelyImpactsEnum(iterators[1]);
scorer1 = this.scorables[0];
scorer2 = this.scorables[1];
this.sumOfOtherClauses = new double[this.scorers.length];
for (int i = 0; i < sumOfOtherClauses.length; i++) {
sumOfOtherClauses[i] = Double.POSITIVE_INFINITY;
@@ -118,98 +112,45 @@ private void scoreWindow(
return;
}

Scorable scorer1 = this.scorer1;
if (scorers[0].getMaxScore(max - 1) == 0f) {
// Null out scorer1 if it may only produce 0 scores over this window. In practice, this is
// mostly useful because FILTER clauses are pushed as constant-scoring MUST clauses with a
// 0 score to this scorer. Setting it to null instead of using a different impl helps
// reduce polymorphism of calls to Scorable#score and skip the check of whether the leading
// clause produced a high-enough score for the doc to be competitive.
scorer1 = null;
}

final double sumOfOtherMaxScoresAt1 = sumOfOtherClauses[1];

advanceHead:
for (int doc = lead1.docID(); doc < max; ) {
if (acceptDocs != null && acceptDocs.get(doc) == false) {
doc = lead1.nextDoc();
continue;
}
for (scorers[0].nextDocsAndScores(max, acceptDocs, docAndScoreBuffer);
docAndScoreBuffer.size > 0;
scorers[0].nextDocsAndScores(max, acceptDocs, docAndScoreBuffer)) {

// Compute the score as we find more matching clauses, in order to skip advancing other
// clauses if the total score has no chance of being competitive. This works well because
// computing a score is usually cheaper than decoding a full block of postings and
// frequencies.
final boolean hasMinCompetitiveScore = scorable.minCompetitiveScore > 0;
double currentScore;
if (scorer1 != null && hasMinCompetitiveScore) {
currentScore = scorer1.score();

// This is the same logic as in the below for loop, specialized for the 2nd least costly
// clause. This seems to help the JVM.

// First check if we have a chance of having a match based on max scores
if ((float) MathUtil.sumUpperBound(currentScore + sumOfOtherMaxScoresAt1, scorers.length)
< scorable.minCompetitiveScore) {
doc = lead1.nextDoc();
continue advanceHead;
}
} else {
currentScore = 0;
}
docAndScoreAccBuffer.copyFrom(docAndScoreBuffer);

// NOTE: lead2 may be on `doc` already if we `continue`d on the previous loop iteration.
if (lead2.docID() < doc) {
int next = lead2.advance(doc);
if (next != doc) {
doc = lead1.advance(next);
continue advanceHead;
}
}
assert lead2.docID() == doc;
if (hasMinCompetitiveScore) {
currentScore += scorer2.score();
if (scorable.minCompetitiveScore > 0) {
ScorerUtil.filterCompetitiveHits(
docAndScoreAccBuffer,
sumOfOtherMaxScoresAt1,
scorable.minCompetitiveScore,
scorers.length);
}

for (int i = 2; i < iterators.length; ++i) {
// First check if we have a chance of having a match based on max scores
if (hasMinCompetitiveScore
&& (float) MathUtil.sumUpperBound(currentScore + sumOfOtherClauses[i], scorers.length)
< scorable.minCompetitiveScore) {
doc = lead1.nextDoc();
continue advanceHead;
}

// NOTE: these iterators may be on `doc` already if we called `continue advanceHead` on the
// previous loop iteration.
if (iterators[i].docID() < doc) {
int next = iterators[i].advance(doc);
if (next != doc) {
doc = lead1.advance(next);
continue advanceHead;
}
}
assert iterators[i].docID() == doc;
if (hasMinCompetitiveScore) {
currentScore += scorables[i].score();
for (int i = 1; i < scorers.length; ++i) {
if (scorable.minCompetitiveScore > 0) {
ScorerUtil.filterCompetitiveHits(
docAndScoreAccBuffer,
sumOfOtherClauses[i],
scorable.minCompetitiveScore,
scorers.length);
}
scorers[i].applyAsRequiredClause(docAndScoreAccBuffer);
}

if (hasMinCompetitiveScore == false) {
for (Scorable scorer : scorables) {
currentScore += scorer.score();
}
}
scorable.score = (float) currentScore;
collector.collect(doc);
// The collect() call may have updated the minimum competitive score.
if (maxWindowScore < scorable.minCompetitiveScore) {
// no more hits are competitive
return;
for (int i = 0; i < docAndScoreAccBuffer.size; ++i) {
scorable.score = (float) docAndScoreAccBuffer.scores[i];
collector.collect(docAndScoreAccBuffer.docs[i]);
}
}

doc = lead1.nextDoc();
int maxOtherDoc = -1;
for (int i = 0; i < iterators.length; ++i) {
maxOtherDoc = Math.max(iterators[i].docID(), maxOtherDoc);
}
if (lead1.docID() < maxOtherDoc) {
lead1.advance(maxOtherDoc);
}
}

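The rewritten window loop above leans on two helpers whose implementations are not part of this diff: ScorerUtil.filterCompetitiveHits and Scorer#applyAsRequiredClause. Below is a hedged sketch of what they plausibly do, inferred from their call sites and from the old code's MathUtil.sumUpperBound check; the signatures, the field types, and the wrapper class are assumptions, not the PR's actual code:

package org.apache.lucene.search;

import java.io.IOException;
import org.apache.lucene.util.MathUtil;

// Sketches only; DocAndScoreAccBuffer is assumed to expose parallel `docs`/`scores`
// arrays plus a `size`, with double-valued scores (the loop above casts to float).
final class DisjunctionHelperSketches {

  // Keep only docs whose partial score, plus an upper bound on what the remaining
  // clauses can contribute, can still reach the minimum competitive score.
  static void filterCompetitiveHits(
      DocAndScoreAccBuffer buffer,
      double maxRemainingScore,
      float minCompetitiveScore,
      int numClauses) {
    int newSize = 0;
    for (int i = 0; i < buffer.size; ++i) {
      float upperBound =
          (float) MathUtil.sumUpperBound(buffer.scores[i] + maxRemainingScore, numClauses);
      if (upperBound >= minCompetitiveScore) {
        buffer.docs[newSize] = buffer.docs[i];
        buffer.scores[newSize] = buffer.scores[i];
        newSize++;
      }
    }
    buffer.size = newSize;
  }

  // Intersect the buffered candidates with a required clause's iterator, adding the
  // clause's score to every doc that survives the intersection.
  static void applyAsRequiredClause(Scorer scorer, DocAndScoreAccBuffer buffer)
      throws IOException {
    DocIdSetIterator it = scorer.iterator();
    int newSize = 0;
    for (int i = 0; i < buffer.size; ++i) {
      int doc = buffer.docs[i];
      if (it.docID() < doc) {
        it.advance(doc);
      }
      if (it.docID() == doc) {
        buffer.docs[newSize] = doc;
        buffer.scores[newSize] = buffer.scores[i] + scorer.score();
        newSize++;
      }
    }
    buffer.size = newSize;
  }
}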
29 changes: 21 additions & 8 deletions lucene/core/src/java/org/apache/lucene/search/BooleanScorer.java
@@ -81,6 +81,7 @@ public DisiWrapper get(int i) {
final int minShouldMatch;
final long cost;
final boolean needsScores;
private final DocAndScoreBuffer docAndScoreBuffer = new DocAndScoreBuffer();

BooleanScorer(Collection<Scorer> scorers, int minShouldMatch, boolean needsScores) {
if (minShouldMatch < 1 || minShouldMatch > scorers.size()) {
@@ -135,23 +136,35 @@ private void scoreWindowIntoBitSetAndReplay(
assert w.doc < max;

DocIdSetIterator it = w.iterator;
int doc = w.doc;
if (doc < min) {
doc = it.advance(min);
if (w.doc < min) {
it.advance(min);
}
if (buckets == null) {
if (buckets == null) { // means minShouldMatch=1 and scores are not needed
// This doesn't apply live docs, so we'll need to apply them later
it.intoBitSet(max, matching, base);
} else if (needsScores) {
for (w.scorer.nextDocsAndScores(max, acceptDocs, docAndScoreBuffer);
docAndScoreBuffer.size > 0;
w.scorer.nextDocsAndScores(max, acceptDocs, docAndScoreBuffer)) {
for (int index = 0; index < docAndScoreBuffer.size; ++index) {
final int doc = docAndScoreBuffer.docs[index];
final float score = docAndScoreBuffer.scores[index];
final int d = doc & MASK;
matching.set(d);
final Bucket bucket = buckets[d];
bucket.freq++;
bucket.score += score;
}
}
} else {
for (; doc < max; doc = it.nextDoc()) {
// Scores are not needed but we need to keep track of freqs to know which hits match
assert minShouldMatch > 1;
for (int doc = it.docID(); doc < max; doc = it.nextDoc()) {
if (acceptDocs == null || acceptDocs.get(doc)) {
final int d = doc & MASK;
matching.set(d);
final Bucket bucket = buckets[d];
bucket.freq++;
if (needsScores) {
bucket.score += w.scorable.score();
}
}
}
}
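A side note on the `doc & MASK` pattern above: BooleanScorer scores documents in fixed, power-of-two-sized windows, and the mask maps a global doc ID to its slot in the window-local buckets/matching arrays. A toy illustration of the arithmetic, where 2048 is a placeholder window size (the actual constant is not visible in this diff):

// Toy demo of the windowing arithmetic; not Lucene code.
final class WindowArithmeticDemo {
  public static void main(String[] args) {
    final int windowSize = 2048;     // hypothetical power of two
    final int mask = windowSize - 1; // MASK in the diff
    int doc = 5000;
    int windowBase = doc & ~mask;    // first doc ID of doc's window -> 4096
    int slot = doc & mask;           // index into buckets/matching -> 904
    System.out.println(windowBase + " " + slot);
  }
}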