3 changes: 3 additions & 0 deletions lucene/CHANGES.txt
@@ -111,6 +111,9 @@ Optimizations
* GITHUB#14609: Optimizes PointRangeQuery by rewriting to MatchAllDocsQuery/FieldExistsQuery/MatchNoDocsQuery if all
docs in index are contained or excluded (Elliott Bradshaw)

* GITHUB#14679: Optimize exhaustive evaluation of disjunctive queries.
(Adrien Grand)

Bug Fixes
---------------------
(No changes)
lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java
@@ -46,6 +46,7 @@
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.internal.vectorization.PostingDecodingUtil;
import org.apache.lucene.internal.vectorization.VectorizationProvider;
import org.apache.lucene.search.DocAndFreqBuffer;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.DataInput;
@@ -1034,6 +1035,50 @@ public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOException
}
}

@Override
public void nextPostings(int upTo, DocAndFreqBuffer buffer) throws IOException {
assert needsRefilling == false;

if (needsFreq == false) {
super.nextPostings(upTo, buffer);
return;
}

buffer.size = 0;
if (doc >= upTo) {
return;
}

// Only return docs from the current block
buffer.growNoCopy(BLOCK_SIZE);
upTo = (int) Math.min(upTo, level0LastDocID + 1L);

// Frequencies are decoded lazily; calling freq() makes sure that the freq block is decoded
freq();

int start = docBufferUpto - 1;
buffer.size = 0;
switch (encoding) {
case PACKED:
int end = computeBufferEndBoundary(upTo);
buffer.size = end - start;
System.arraycopy(docBuffer, start, buffer.docs, 0, buffer.size);
break;
case UNARY:
docBitSet.forEach(
doc - docBitSetBase,
upTo - docBitSetBase,
docBitSetBase,
d -> buffer.docs[buffer.size++] = d);
break;
}

assert buffer.size > 0;
System.arraycopy(freqBuffer, start, buffer.freqs, 0, buffer.size);

advance(upTo);
}

private int computeBufferEndBoundary(int upTo) {
if (docBufferSize != 0 && docBuffer[docBufferSize - 1] < upTo) {
// All docs in the buffer are under upTo
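For context, the nextPostings override above returns at most one block of postings per call (the buffer is sized to BLOCK_SIZE and upTo is clamped to the end of the current block), so callers are expected to loop until the buffer comes back empty. Below is a minimal, hypothetical caller that drains a positioned enum through the new API; the helper name and the use of NO_MORE_DOCS as the upper bound are illustrative assumptions, not code from this PR:

import java.io.IOException;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.search.DocAndFreqBuffer;
import org.apache.lucene.search.DocIdSetIterator;

// Sketch only: sum a term's frequencies by draining the enum batch by batch.
final class NextPostingsExample {
  static long sumFreqs(PostingsEnum postings) throws IOException {
    if (postings.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
      return 0; // nextPostings must not be called while docID() is -1
    }
    DocAndFreqBuffer buffer = new DocAndFreqBuffer();
    long sum = 0;
    // An empty buffer signals that no postings remain before upTo.
    for (postings.nextPostings(DocIdSetIterator.NO_MORE_DOCS, buffer);
        buffer.size > 0;
        postings.nextPostings(DocIdSetIterator.NO_MORE_DOCS, buffer)) {
      for (int i = 0; i < buffer.size; ++i) {
        sum += buffer.freqs[i];
      }
    }
    return sum;
  }
}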
lucene/core/src/java/org/apache/lucene/index/CompositeReader.java
@@ -44,7 +44,7 @@
* synchronization, you should <b>not</b> synchronize on the <code>IndexReader</code> instance; use
* your own (non-Lucene) objects instead.
*/
public abstract non-sealed class CompositeReader extends IndexReader {
public abstract class CompositeReader extends IndexReader {

private volatile CompositeReaderContext readerContext = null; // lazy init

lucene/core/src/java/org/apache/lucene/index/IndexReader.java
@@ -63,7 +63,7 @@
* synchronization, you should <b>not</b> synchronize on the <code>IndexReader</code> instance; use
* your own (non-Lucene) objects instead.
*/
public abstract sealed class IndexReader implements Closeable permits CompositeReader, LeafReader {
public abstract class IndexReader implements Closeable {

private boolean closed = false;
private boolean closedByChild = false;
lucene/core/src/java/org/apache/lucene/index/LeafReader.java
@@ -44,7 +44,7 @@
* synchronization, you should <b>not</b> synchronize on the <code>IndexReader</code> instance; use
* your own (non-Lucene) objects instead.
*/
public abstract non-sealed class LeafReader extends IndexReader {
public abstract class LeafReader extends IndexReader {

private final LeafReaderContext readerContext = new LeafReaderContext(this);

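The three changes above drop the sealed/non-sealed modifiers from the IndexReader hierarchy, so the base class may again be extended outside the previously permitted CompositeReader/LeafReader pair. A toy illustration (not Lucene code) of what sealing enforces and what removing it re-enables:

// Toy example, unrelated to Lucene's actual classes (Java 17+ syntax).
abstract sealed class Base permits A, B {} // only A and B may extend Base
final class A extends Base {}
final class B extends Base {}
// final class C extends Base {}          // compile error while Base is sealed

abstract class OpenBase {}                 // after dropping `sealed`...
final class C extends OpenBase {}          // ...any class may extend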
45 changes: 45 additions & 0 deletions lucene/core/src/java/org/apache/lucene/index/PostingsEnum.java
@@ -17,6 +17,7 @@
package org.apache.lucene.index;

import java.io.IOException;
import org.apache.lucene.search.DocAndFreqBuffer;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;

@@ -97,4 +98,48 @@ protected PostingsEnum() {}
* anything (neither members of the returned BytesRef nor bytes in the byte[]).
*/
public abstract BytesRef getPayload() throws IOException;

/**
* Fill the given buffer with some number of doc IDs and their corresponding frequencies,
* starting at the current doc ID, and ending before {@code upTo}. Because it starts
* on the current doc ID, it is illegal to call this method if the {@link #docID() current doc ID}
* is {@code -1}.
*
* <p>An empty buffer after this method returns indicates that there are no postings left between
* the current doc ID and {@code upTo}.
*
* <p>Implementations should ideally fill the buffer with between 8 and a couple hundred
* entries, to keep heap requirements contained while still being large enough for operations
* on the buffer to auto-vectorize efficiently.
*
* <p>The default implementation is provided below:
*
* <pre class="prettyprint">
* int batchSize = 16; // arbitrary
* buffer.growNoCopy(batchSize);
* int size = 0;
* for (int doc = docID(); doc &lt; upTo &amp;&amp; size &lt; batchSize; doc = nextDoc()) {
* buffer.docs[size] = doc;
* buffer.freqs[size] = freq();
* ++size;
* }
* buffer.size = size;
* </pre>
*
* <p><b>NOTE</b>: The provided {@link DocAndFreqBuffer} should not hold references to the
* enum's internal data structures.
*
* @lucene.internal
*/
public void nextPostings(int upTo, DocAndFreqBuffer buffer) throws IOException {
int batchSize = 16; // arbitrary
buffer.growNoCopy(batchSize);
int size = 0;
for (int doc = docID(); doc < upTo && size < batchSize; doc = nextDoc()) {
buffer.docs[size] = doc;
buffer.freqs[size] = freq();
++size;
}
buffer.size = size;
}
}
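The new PostingsEnum API above writes into a DocAndFreqBuffer, whose source this diff does not include. Inferred purely from the usages shown here (docs, freqs, size, growNoCopy), it presumably looks roughly like the following sketch; the real class lives in org.apache.lucene.search and its actual implementation may differ:

package org.apache.lucene.search;

// Hedged sketch of DocAndFreqBuffer, reconstructed from its call sites in this PR.
public final class DocAndFreqBuffer {
  public int[] docs = new int[0];  // doc IDs, parallel to freqs
  public int[] freqs = new int[0]; // term frequency for each doc
  public int size;                 // number of valid entries

  // Resize both arrays to at least minSize, discarding any previous content.
  public void growNoCopy(int minSize) {
    if (docs.length < minSize) {
      docs = new int[minSize];
      freqs = new int[minSize];
    }
  }
}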
lucene/core/src/java/org/apache/lucene/search/BlockMaxConjunctionBulkScorer.java
@@ -22,7 +22,6 @@
import java.util.List;
import org.apache.lucene.search.Weight.DefaultBulkScorer;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.MathUtil;

/**
* BulkScorer implementation of {@link BlockMaxConjunctionScorer} that focuses on top-level
@@ -36,28 +35,23 @@
final class BlockMaxConjunctionBulkScorer extends BulkScorer {

private final Scorer[] scorers;
private final Scorable[] scorables;
private final DocIdSetIterator[] iterators;
private final DocIdSetIterator lead1, lead2;
private final Scorable scorer1, scorer2;
private final DocIdSetIterator lead1;
private final DocAndScore scorable = new DocAndScore();
private final double[] sumOfOtherClauses;
private final int maxDoc;
private final DocAndScoreBuffer docAndScoreBuffer = new DocAndScoreBuffer();
private final DocAndScoreAccBuffer docAndScoreAccBuffer = new DocAndScoreAccBuffer();

BlockMaxConjunctionBulkScorer(int maxDoc, List<Scorer> scorers) throws IOException {
if (scorers.size() <= 1) {
throw new IllegalArgumentException("Expected 2 or more scorers, got " + scorers.size());
}
this.scorers = scorers.toArray(Scorer[]::new);
Arrays.sort(this.scorers, Comparator.comparingLong(scorer -> scorer.iterator().cost()));
this.scorables =
Arrays.stream(this.scorers).map(ScorerUtil::likelyTermScorer).toArray(Scorable[]::new);
this.iterators =
Arrays.stream(this.scorers).map(Scorer::iterator).toArray(DocIdSetIterator[]::new);
lead1 = ScorerUtil.likelyImpactsEnum(iterators[0]);
lead2 = ScorerUtil.likelyImpactsEnum(iterators[1]);
scorer1 = this.scorables[0];
scorer2 = this.scorables[1];
this.sumOfOtherClauses = new double[this.scorers.length];
for (int i = 0; i < sumOfOtherClauses.length; i++) {
sumOfOtherClauses[i] = Double.POSITIVE_INFINITY;
@@ -118,98 +112,45 @@ private void scoreWindow(
return;
}

Scorable scorer1 = this.scorer1;
if (scorers[0].getMaxScore(max - 1) == 0f) {
// Null out scorer1 if it may only produce 0 scores over this window. In practice, this is
// mostly useful because FILTER clauses are pushed as constant-scoring MUST clauses with a
// 0 score to this scorer. Setting it to null instead of using a different impl helps
// reduce polymorphism of calls to Scorable#score and skip the check of whether the leading
// clause produced a high-enough score for the doc to be competitive.
scorer1 = null;
}

final double sumOfOtherMaxScoresAt1 = sumOfOtherClauses[1];

advanceHead:
for (int doc = lead1.docID(); doc < max; ) {
if (acceptDocs != null && acceptDocs.get(doc) == false) {
doc = lead1.nextDoc();
continue;
}
for (scorers[0].nextDocsAndScores(max, acceptDocs, docAndScoreBuffer);
docAndScoreBuffer.size > 0;
scorers[0].nextDocsAndScores(max, acceptDocs, docAndScoreBuffer)) {

// Compute the score as we find more matching clauses, in order to skip advancing other
// clauses if the total score has no chance of being competitive. This works well because
// computing a score is usually cheaper than decoding a full block of postings and
// frequencies.
final boolean hasMinCompetitiveScore = scorable.minCompetitiveScore > 0;
double currentScore;
if (scorer1 != null && hasMinCompetitiveScore) {
currentScore = scorer1.score();

// This is the same logic as in the below for loop, specialized for the 2nd least costly
// clause. This seems to help the JVM.

// First check if we have a chance of having a match based on max scores
if ((float) MathUtil.sumUpperBound(currentScore + sumOfOtherMaxScoresAt1, scorers.length)
< scorable.minCompetitiveScore) {
doc = lead1.nextDoc();
continue advanceHead;
}
} else {
currentScore = 0;
}
docAndScoreAccBuffer.copyFrom(docAndScoreBuffer);

// NOTE: lead2 may be on `doc` already if we `continue`d on the previous loop iteration.
if (lead2.docID() < doc) {
int next = lead2.advance(doc);
if (next != doc) {
doc = lead1.advance(next);
continue advanceHead;
}
}
assert lead2.docID() == doc;
if (hasMinCompetitiveScore) {
currentScore += scorer2.score();
if (scorable.minCompetitiveScore > 0) {
ScorerUtil.filterCompetitiveHits(
docAndScoreAccBuffer,
sumOfOtherMaxScoresAt1,
scorable.minCompetitiveScore,
scorers.length);
}

for (int i = 2; i < iterators.length; ++i) {
// First check if we have a chance of having a match based on max scores
if (hasMinCompetitiveScore
&& (float) MathUtil.sumUpperBound(currentScore + sumOfOtherClauses[i], scorers.length)
< scorable.minCompetitiveScore) {
doc = lead1.nextDoc();
continue advanceHead;
}

// NOTE: these iterators may be on `doc` already if we called `continue advanceHead` on the
// previous loop iteration.
if (iterators[i].docID() < doc) {
int next = iterators[i].advance(doc);
if (next != doc) {
doc = lead1.advance(next);
continue advanceHead;
}
}
assert iterators[i].docID() == doc;
if (hasMinCompetitiveScore) {
currentScore += scorables[i].score();
for (int i = 1; i < scorers.length; ++i) {
if (scorable.minCompetitiveScore > 0) {
ScorerUtil.filterCompetitiveHits(
docAndScoreAccBuffer,
sumOfOtherClauses[i],
scorable.minCompetitiveScore,
scorers.length);
}
scorers[i].applyAsRequiredClause(docAndScoreAccBuffer);
}

if (hasMinCompetitiveScore == false) {
for (Scorable scorer : scorables) {
currentScore += scorer.score();
}
}
scorable.score = (float) currentScore;
collector.collect(doc);
// The collect() call may have updated the minimum competitive score.
if (maxWindowScore < scorable.minCompetitiveScore) {
// no more hits are competitive
return;
for (int i = 0; i < docAndScoreAccBuffer.size; ++i) {
scorable.score = (float) docAndScoreAccBuffer.scores[i];
collector.collect(docAndScoreAccBuffer.docs[i]);
}
}

doc = lead1.nextDoc();
int maxOtherDoc = -1;
for (int i = 0; i < iterators.length; ++i) {
maxOtherDoc = Math.max(iterators[i].docID(), maxOtherDoc);
}
if (lead1.docID() < maxOtherDoc) {
lead1.advance(maxOtherDoc);
}
}

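The rewritten window loop above leans on two helpers whose implementations are not part of this diff: ScorerUtil.filterCompetitiveHits and Scorer#applyAsRequiredClause. Below is a hedged sketch of what they plausibly do, inferred from their call sites and from the old code's MathUtil.sumUpperBound check; the signatures, the field types, and the wrapper class are assumptions, not the PR's actual code:

package org.apache.lucene.search;

import java.io.IOException;
import org.apache.lucene.util.MathUtil;

// Sketches only; DocAndScoreAccBuffer is assumed to expose parallel `docs`/`scores`
// arrays plus a `size`, with double-valued scores (the loop above casts to float).
final class DisjunctionHelperSketches {

  // Keep only docs whose partial score, plus an upper bound on what the remaining
  // clauses can contribute, can still reach the minimum competitive score.
  static void filterCompetitiveHits(
      DocAndScoreAccBuffer buffer,
      double maxRemainingScore,
      float minCompetitiveScore,
      int numClauses) {
    int newSize = 0;
    for (int i = 0; i < buffer.size; ++i) {
      float upperBound =
          (float) MathUtil.sumUpperBound(buffer.scores[i] + maxRemainingScore, numClauses);
      if (upperBound >= minCompetitiveScore) {
        buffer.docs[newSize] = buffer.docs[i];
        buffer.scores[newSize] = buffer.scores[i];
        newSize++;
      }
    }
    buffer.size = newSize;
  }

  // Intersect the buffered candidates with a required clause's iterator, adding the
  // clause's score to every doc that survives the intersection.
  static void applyAsRequiredClause(Scorer scorer, DocAndScoreAccBuffer buffer)
      throws IOException {
    DocIdSetIterator it = scorer.iterator();
    int newSize = 0;
    for (int i = 0; i < buffer.size; ++i) {
      int doc = buffer.docs[i];
      if (it.docID() < doc) {
        it.advance(doc);
      }
      if (it.docID() == doc) {
        buffer.docs[newSize] = doc;
        buffer.scores[newSize] = buffer.scores[i] + scorer.score();
        newSize++;
      }
    }
    buffer.size = newSize;
  }
}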
29 changes: 21 additions & 8 deletions lucene/core/src/java/org/apache/lucene/search/BooleanScorer.java
@@ -81,6 +81,7 @@ public DisiWrapper get(int i) {
final int minShouldMatch;
final long cost;
final boolean needsScores;
private final DocAndScoreBuffer docAndScoreBuffer = new DocAndScoreBuffer();

BooleanScorer(Collection<Scorer> scorers, int minShouldMatch, boolean needsScores) {
if (minShouldMatch < 1 || minShouldMatch > scorers.size()) {
@@ -135,23 +136,35 @@ private void scoreWindowIntoBitSetAndReplay(
assert w.doc < max;

DocIdSetIterator it = w.iterator;
int doc = w.doc;
if (doc < min) {
doc = it.advance(min);
if (w.doc < min) {
it.advance(min);
}
if (buckets == null) {
if (buckets == null) { // means minShouldMatch=1 and scores are not needed
// This doesn't apply live docs, so we'll need to apply them later
it.intoBitSet(max, matching, base);
} else if (needsScores) {
for (w.scorer.nextDocsAndScores(max, acceptDocs, docAndScoreBuffer);
docAndScoreBuffer.size > 0;
w.scorer.nextDocsAndScores(max, acceptDocs, docAndScoreBuffer)) {
for (int index = 0; index < docAndScoreBuffer.size; ++index) {
final int doc = docAndScoreBuffer.docs[index];
final float score = docAndScoreBuffer.scores[index];
final int d = doc & MASK;
matching.set(d);
final Bucket bucket = buckets[d];
bucket.freq++;
bucket.score += score;
}
}
} else {
for (; doc < max; doc = it.nextDoc()) {
// Scores are not needed but we need to keep track of freqs to know which hits match
assert minShouldMatch > 1;
for (int doc = it.docID(); doc < max; doc = it.nextDoc()) {
if (acceptDocs == null || acceptDocs.get(doc)) {
final int d = doc & MASK;
matching.set(d);
final Bucket bucket = buckets[d];
bucket.freq++;
if (needsScores) {
bucket.score += w.scorable.score();
}
}
}
}
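A side note on the `doc & MASK` pattern above: BooleanScorer scores documents in fixed, power-of-two-sized windows, and the mask maps a global doc ID to its slot in the window-local buckets/matching arrays. A toy illustration of the arithmetic, where 2048 is a placeholder window size (the actual constant is not visible in this diff):

// Toy demo of the windowing arithmetic; not Lucene code.
final class WindowArithmeticDemo {
  public static void main(String[] args) {
    final int windowSize = 2048;     // hypothetical power of two
    final int mask = windowSize - 1; // MASK in the diff
    int doc = 5000;
    int windowBase = doc & ~mask;    // first doc ID of doc's window -> 4096
    int slot = doc & mask;           // index into buckets/matching -> 904
    System.out.println(windowBase + " " + slot);
  }
}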