diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index f9f036eb1040..7ad0c916172a 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -111,6 +111,9 @@ Optimizations * GITHUB#14609: Optimizes PointRangeQuery by rewriting to MatchAllDocsQuery/FieldExistsQuery/MatchNoDocsQuery if all docs in index are contained or excluded (Elliott Bradshaw) +* GITHUB#14679: Optimize exhaustive evaluation of disjunctive queries. + (Adrien Grand) + Bug Fixes --------------------- (No changes) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene103/Lucene103PostingsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/Lucene103PostingsReader.java index 3ecaddef6174..fac90c25da32 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene103/Lucene103PostingsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/Lucene103PostingsReader.java @@ -46,6 +46,7 @@ import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.internal.vectorization.PostingDecodingUtil; import org.apache.lucene.internal.vectorization.VectorizationProvider; +import org.apache.lucene.search.DocAndFreqBuffer; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.ChecksumIndexInput; import org.apache.lucene.store.DataInput; @@ -1034,6 +1035,50 @@ public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOExcept } } + @Override + public void nextPostings(int upTo, DocAndFreqBuffer buffer) throws IOException { + assert needsRefilling == false; + + if (needsFreq == false) { + super.nextPostings(upTo, buffer); + return; + } + + buffer.size = 0; + if (doc >= upTo) { + return; + } + + // Only return docs from the current block + buffer.growNoCopy(BLOCK_SIZE); + upTo = (int) Math.min(upTo, level0LastDocID + 1L); + + // Frequencies are decoded lazily, calling freq() makes sure that the freq block is decoded + freq(); + + int start = docBufferUpto - 1; + buffer.size = 0; + switch (encoding) { + case 
PACKED: + int end = computeBufferEndBoundary(upTo); + buffer.size = end - start; + System.arraycopy(docBuffer, start, buffer.docs, 0, buffer.size); + break; + case UNARY: + docBitSet.forEach( + doc - docBitSetBase, + upTo - docBitSetBase, + docBitSetBase, + d -> buffer.docs[buffer.size++] = d); + break; + } + + assert buffer.size > 0; + System.arraycopy(freqBuffer, start, buffer.freqs, 0, buffer.size); + + advance(upTo); + } + private int computeBufferEndBoundary(int upTo) { if (docBufferSize != 0 && docBuffer[docBufferSize - 1] < upTo) { // All docs in the buffer are under upTo diff --git a/lucene/core/src/java/org/apache/lucene/index/CompositeReader.java b/lucene/core/src/java/org/apache/lucene/index/CompositeReader.java index 4b35ba6cd7c9..ccc454c59944 100644 --- a/lucene/core/src/java/org/apache/lucene/index/CompositeReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/CompositeReader.java @@ -44,7 +44,7 @@ * synchronization, you should not synchronize on the IndexReader instance; use * your own (non-Lucene) objects instead. */ -public abstract non-sealed class CompositeReader extends IndexReader { +public abstract class CompositeReader extends IndexReader { private volatile CompositeReaderContext readerContext = null; // lazy init diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexReader.java b/lucene/core/src/java/org/apache/lucene/index/IndexReader.java index 8e965ee8099c..e23efc23ece2 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexReader.java @@ -63,7 +63,7 @@ * synchronization, you should not synchronize on the IndexReader instance; use * your own (non-Lucene) objects instead. 
*/ -public abstract sealed class IndexReader implements Closeable permits CompositeReader, LeafReader { +public abstract class IndexReader implements Closeable { private boolean closed = false; private boolean closedByChild = false; diff --git a/lucene/core/src/java/org/apache/lucene/index/LeafReader.java b/lucene/core/src/java/org/apache/lucene/index/LeafReader.java index 0f39d1ae1e8d..eea6c317b9a7 100644 --- a/lucene/core/src/java/org/apache/lucene/index/LeafReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/LeafReader.java @@ -44,7 +44,7 @@ * synchronization, you should not synchronize on the IndexReader instance; use * your own (non-Lucene) objects instead. */ -public abstract non-sealed class LeafReader extends IndexReader { +public abstract class LeafReader extends IndexReader { private final LeafReaderContext readerContext = new LeafReaderContext(this); diff --git a/lucene/core/src/java/org/apache/lucene/index/PostingsEnum.java b/lucene/core/src/java/org/apache/lucene/index/PostingsEnum.java index 2cb0092aaa49..8e82e0a6d696 100644 --- a/lucene/core/src/java/org/apache/lucene/index/PostingsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/index/PostingsEnum.java @@ -17,6 +17,7 @@ package org.apache.lucene.index; import java.io.IOException; +import org.apache.lucene.search.DocAndFreqBuffer; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.util.BytesRef; @@ -97,4 +98,48 @@ protected PostingsEnum() {} * anything (neither members of the returned BytesRef nor bytes in the byte[]). */ public abstract BytesRef getPayload() throws IOException; + + /** + * Fill a buffer of doc IDs and frequencies with some number of doc IDs and their corresponding + * frequencies, starting at the current doc ID, and ending before {@code upTo}. Because it starts + * on the current doc ID, it is illegal to call this method if the {@link #docID() current doc ID} + * is {@code -1}. + * + *

An empty buffer after this method returns indicates that there are no postings left between + * the current doc ID and {@code upTo}. + * + *

Implementations should ideally fill the buffer with a number of entries between 8 + and a couple hundred, to keep heap requirements contained, while still being large enough to + enable operations on the buffer to auto-vectorize efficiently. + + *

The default implementation is provided below: + * + *

+   * int batchSize = 16; // arbitrary
+   * buffer.growNoCopy(batchSize);
+   * int size = 0;
+   * for (int doc = docID(); doc < upTo && size < batchSize; doc = nextDoc()) {
+   *   buffer.docs[size] = doc;
+   *   buffer.freqs[size] = freq();
+   *   ++size;
+   * }
+   * buffer.size = size;
+   * 
+ * + *

NOTE: The provided {@link DocAndFreqBuffer} should not hold references to internal + * data structures. + * + * @lucene.internal + */ + public void nextPostings(int upTo, DocAndFreqBuffer buffer) throws IOException { + int batchSize = 16; // arbitrary + buffer.growNoCopy(batchSize); + int size = 0; + for (int doc = docID(); doc < upTo && size < batchSize; doc = nextDoc()) { + buffer.docs[size] = doc; + buffer.freqs[size] = freq(); + ++size; + } + buffer.size = size; + } } diff --git a/lucene/core/src/java/org/apache/lucene/search/BlockMaxConjunctionBulkScorer.java b/lucene/core/src/java/org/apache/lucene/search/BlockMaxConjunctionBulkScorer.java index fd89153ffe6d..09910c752ed3 100644 --- a/lucene/core/src/java/org/apache/lucene/search/BlockMaxConjunctionBulkScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/BlockMaxConjunctionBulkScorer.java @@ -22,7 +22,6 @@ import java.util.List; import org.apache.lucene.search.Weight.DefaultBulkScorer; import org.apache.lucene.util.Bits; -import org.apache.lucene.util.MathUtil; /** * BulkScorer implementation of {@link BlockMaxConjunctionScorer} that focuses on top-level @@ -36,13 +35,13 @@ final class BlockMaxConjunctionBulkScorer extends BulkScorer { private final Scorer[] scorers; - private final Scorable[] scorables; private final DocIdSetIterator[] iterators; - private final DocIdSetIterator lead1, lead2; - private final Scorable scorer1, scorer2; + private final DocIdSetIterator lead1; private final DocAndScore scorable = new DocAndScore(); private final double[] sumOfOtherClauses; private final int maxDoc; + private final DocAndScoreBuffer docAndScoreBuffer = new DocAndScoreBuffer(); + private final DocAndScoreAccBuffer docAndScoreAccBuffer = new DocAndScoreAccBuffer(); BlockMaxConjunctionBulkScorer(int maxDoc, List scorers) throws IOException { if (scorers.size() <= 1) { @@ -50,14 +49,9 @@ final class BlockMaxConjunctionBulkScorer extends BulkScorer { } this.scorers = scorers.toArray(Scorer[]::new); 
Arrays.sort(this.scorers, Comparator.comparingLong(scorer -> scorer.iterator().cost())); - this.scorables = - Arrays.stream(this.scorers).map(ScorerUtil::likelyTermScorer).toArray(Scorable[]::new); this.iterators = Arrays.stream(this.scorers).map(Scorer::iterator).toArray(DocIdSetIterator[]::new); lead1 = ScorerUtil.likelyImpactsEnum(iterators[0]); - lead2 = ScorerUtil.likelyImpactsEnum(iterators[1]); - scorer1 = this.scorables[0]; - scorer2 = this.scorables[1]; this.sumOfOtherClauses = new double[this.scorers.length]; for (int i = 0; i < sumOfOtherClauses.length; i++) { sumOfOtherClauses[i] = Double.POSITIVE_INFINITY; @@ -118,98 +112,45 @@ private void scoreWindow( return; } - Scorable scorer1 = this.scorer1; - if (scorers[0].getMaxScore(max - 1) == 0f) { - // Null out scorer1 if it may only produce 0 scores over this window. In practice, this is - // mostly useful because FILTER clauses are pushed as constant-scoring MUST clauses with a - // 0 score to this scorer. Setting it to null instead of using a different impl helps - // reduce polymorphism of calls to Scorable#score and skip the check of whether the leading - // clause produced a high-enough score for the doc to be competitive. - scorer1 = null; - } - final double sumOfOtherMaxScoresAt1 = sumOfOtherClauses[1]; - advanceHead: - for (int doc = lead1.docID(); doc < max; ) { - if (acceptDocs != null && acceptDocs.get(doc) == false) { - doc = lead1.nextDoc(); - continue; - } + for (scorers[0].nextDocsAndScores(max, acceptDocs, docAndScoreBuffer); + docAndScoreBuffer.size > 0; + scorers[0].nextDocsAndScores(max, acceptDocs, docAndScoreBuffer)) { - // Compute the score as we find more matching clauses, in order to skip advancing other - // clauses if the total score has no chance of being competitive. This works well because - // computing a score is usually cheaper than decoding a full block of postings and - // frequencies. 
- final boolean hasMinCompetitiveScore = scorable.minCompetitiveScore > 0; - double currentScore; - if (scorer1 != null && hasMinCompetitiveScore) { - currentScore = scorer1.score(); - - // This is the same logic as in the below for loop, specialized for the 2nd least costly - // clause. This seems to help the JVM. - - // First check if we have a chance of having a match based on max scores - if ((float) MathUtil.sumUpperBound(currentScore + sumOfOtherMaxScoresAt1, scorers.length) - < scorable.minCompetitiveScore) { - doc = lead1.nextDoc(); - continue advanceHead; - } - } else { - currentScore = 0; - } + docAndScoreAccBuffer.copyFrom(docAndScoreBuffer); - // NOTE: lead2 may be on `doc` already if we `continue`d on the previous loop iteration. - if (lead2.docID() < doc) { - int next = lead2.advance(doc); - if (next != doc) { - doc = lead1.advance(next); - continue advanceHead; - } - } - assert lead2.docID() == doc; - if (hasMinCompetitiveScore) { - currentScore += scorer2.score(); + if (scorable.minCompetitiveScore > 0) { + ScorerUtil.filterCompetitiveHits( + docAndScoreAccBuffer, + sumOfOtherMaxScoresAt1, + scorable.minCompetitiveScore, + scorers.length); } - for (int i = 2; i < iterators.length; ++i) { - // First check if we have a chance of having a match based on max scores - if (hasMinCompetitiveScore - && (float) MathUtil.sumUpperBound(currentScore + sumOfOtherClauses[i], scorers.length) - < scorable.minCompetitiveScore) { - doc = lead1.nextDoc(); - continue advanceHead; - } - - // NOTE: these iterators may be on `doc` already if we called `continue advanceHead` on the - // previous loop iteration. 
- if (iterators[i].docID() < doc) { - int next = iterators[i].advance(doc); - if (next != doc) { - doc = lead1.advance(next); - continue advanceHead; - } - } - assert iterators[i].docID() == doc; - if (hasMinCompetitiveScore) { - currentScore += scorables[i].score(); + for (int i = 1; i < scorers.length; ++i) { + if (scorable.minCompetitiveScore > 0) { + ScorerUtil.filterCompetitiveHits( + docAndScoreAccBuffer, + sumOfOtherClauses[i], + scorable.minCompetitiveScore, + scorers.length); } + scorers[i].applyAsRequiredClause(docAndScoreAccBuffer); } - if (hasMinCompetitiveScore == false) { - for (Scorable scorer : scorables) { - currentScore += scorer.score(); - } - } - scorable.score = (float) currentScore; - collector.collect(doc); - // The collect() call may have updated the minimum competitive score. - if (maxWindowScore < scorable.minCompetitiveScore) { - // no more hits are competitive - return; + for (int i = 0; i < docAndScoreAccBuffer.size; ++i) { + scorable.score = (float) docAndScoreAccBuffer.scores[i]; + collector.collect(docAndScoreAccBuffer.docs[i]); } + } - doc = lead1.nextDoc(); + int maxOtherDoc = -1; + for (int i = 0; i < iterators.length; ++i) { + maxOtherDoc = Math.max(iterators[i].docID(), maxOtherDoc); + } + if (lead1.docID() < maxOtherDoc) { + lead1.advance(maxOtherDoc); } } diff --git a/lucene/core/src/java/org/apache/lucene/search/BooleanScorer.java b/lucene/core/src/java/org/apache/lucene/search/BooleanScorer.java index 0be42a2a3e7d..1f4dd4f80ce1 100644 --- a/lucene/core/src/java/org/apache/lucene/search/BooleanScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/BooleanScorer.java @@ -81,6 +81,7 @@ public DisiWrapper get(int i) { final int minShouldMatch; final long cost; final boolean needsScores; + private final DocAndScoreBuffer docAndScoreBuffer = new DocAndScoreBuffer(); BooleanScorer(Collection scorers, int minShouldMatch, boolean needsScores) { if (minShouldMatch < 1 || minShouldMatch > scorers.size()) { @@ -135,23 +136,35 
@@ private void scoreWindowIntoBitSetAndReplay( assert w.doc < max; DocIdSetIterator it = w.iterator; - int doc = w.doc; - if (doc < min) { - doc = it.advance(min); + if (w.doc < min) { + it.advance(min); } - if (buckets == null) { + if (buckets == null) { // means minShouldMatch=1 and scores are not needed // This doesn't apply live docs, so we'll need to apply them later it.intoBitSet(max, matching, base); + } else if (needsScores) { + for (w.scorer.nextDocsAndScores(max, acceptDocs, docAndScoreBuffer); + docAndScoreBuffer.size > 0; + w.scorer.nextDocsAndScores(max, acceptDocs, docAndScoreBuffer)) { + for (int index = 0; index < docAndScoreBuffer.size; ++index) { + final int doc = docAndScoreBuffer.docs[index]; + final float score = docAndScoreBuffer.scores[index]; + final int d = doc & MASK; + matching.set(d); + final Bucket bucket = buckets[d]; + bucket.freq++; + bucket.score += score; + } + } } else { - for (; doc < max; doc = it.nextDoc()) { + // Scores are not needed but we need to keep track of freqs to know which hits match + assert minShouldMatch > 1; + for (int doc = it.docID(); doc < max; doc = it.nextDoc()) { if (acceptDocs == null || acceptDocs.get(doc)) { final int d = doc & MASK; matching.set(d); final Bucket bucket = buckets[d]; bucket.freq++; - if (needsScores) { - bucket.score += w.scorable.score(); - } } } } diff --git a/lucene/core/src/java/org/apache/lucene/search/ConstantScoreScorer.java b/lucene/core/src/java/org/apache/lucene/search/ConstantScoreScorer.java index 7407edaaf548..d81332bcaeb4 100644 --- a/lucene/core/src/java/org/apache/lucene/search/ConstantScoreScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/ConstantScoreScorer.java @@ -135,6 +135,30 @@ public TwoPhaseIterator twoPhaseIterator() { return twoPhaseIterator; } + @Override + public void applyAsRequiredClause(DocAndScoreAccBuffer buffer) throws IOException { + int intersectionSize = 0; + int curDoc = disi.docID(); + for (int i = 0; i < buffer.size; ++i) { + int 
targetDoc = buffer.docs[i]; + if (curDoc < targetDoc) { + curDoc = disi.advance(targetDoc); + } + if (curDoc == targetDoc) { + buffer.docs[intersectionSize] = targetDoc; + buffer.scores[intersectionSize] = buffer.scores[i]; + intersectionSize++; + } + } + + buffer.size = intersectionSize; + if (score != 0) { + for (int i = 0; i < intersectionSize; ++i) { + buffer.scores[i] += score; + } + } + } + @Override public int docID() { return disi.docID(); diff --git a/lucene/core/src/java/org/apache/lucene/search/DocAndFreqBuffer.java b/lucene/core/src/java/org/apache/lucene/search/DocAndFreqBuffer.java new file mode 100644 index 000000000000..3e45d3cf429f --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/DocAndFreqBuffer.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.IntsRef; + +/** + * Wrapper around parallel arrays storing doc IDs and their corresponding frequencies. 
+ * + * @lucene.internal + */ +public final class DocAndFreqBuffer { + + /** Doc IDs */ + public int[] docs = IntsRef.EMPTY_INTS; + + /** Frequencies */ + public int[] freqs = IntsRef.EMPTY_INTS; + + /** Number of valid entries in the doc ID and frequency arrays. */ + public int size; + + /** Sole constructor. */ + public DocAndFreqBuffer() {} + + /** Grow both arrays to ensure that they can store at least the given number of entries. */ + public void growNoCopy(int minSize) { + if (docs.length < minSize) { + docs = ArrayUtil.growNoCopy(docs, minSize); + freqs = new int[docs.length]; + } + } + + /** Remove entries from this buffer if their bit is unset in the given {@link Bits}. */ + public void apply(Bits liveDocs) { + int newSize = 0; + for (int i = 0; i < size; ++i) { + if (liveDocs.get(docs[i])) { + docs[newSize] = docs[i]; + freqs[newSize] = freqs[i]; + newSize++; + } + } + this.size = newSize; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/search/DocAndScoreAccBuffer.java b/lucene/core/src/java/org/apache/lucene/search/DocAndScoreAccBuffer.java new file mode 100644 index 000000000000..a9e0a56c2cf1 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/DocAndScoreAccBuffer.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.IntsRef; + +/** + * Wrapper around parallel arrays storing doc IDs and their corresponding score accumulators. + * + * @lucene.internal + */ +public final class DocAndScoreAccBuffer { + + private static final double[] EMPTY_DOUBLES = new double[0]; + + /** Doc IDs */ + public int[] docs = IntsRef.EMPTY_INTS; + + /** Scores */ + public double[] scores = EMPTY_DOUBLES; + + /** Number of valid entries in the doc ID and score arrays. */ + public int size; + + /** Sole constructor. */ + public DocAndScoreAccBuffer() {} + + /** Grow both arrays to ensure that they can store at least the given number of entries. */ + public void growNoCopy(int minSize) { + if (docs.length < minSize) { + docs = ArrayUtil.growNoCopy(docs, minSize); + scores = new double[docs.length]; + } + } + + /** Copy content from the given {@link DocAndScoreBuffer}, expanding float scores to doubles. */ + public void copyFrom(DocAndScoreBuffer buffer) { + growNoCopy(buffer.size); + System.arraycopy(buffer.docs, 0, docs, 0, buffer.size); + for (int i = 0; i < buffer.size; ++i) { + scores[i] = buffer.scores[i]; + } + this.size = buffer.size; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/search/DocAndScoreBuffer.java b/lucene/core/src/java/org/apache/lucene/search/DocAndScoreBuffer.java new file mode 100644 index 000000000000..5daef38c0ff6 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/DocAndScoreBuffer.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.IntsRef; + +/** + * Wrapper around parallel arrays storing doc IDs and their corresponding scores. + * + * @lucene.internal + */ +public final class DocAndScoreBuffer { + + private static final float[] EMPTY_FLOATS = new float[0]; + + /** Doc IDs */ + public int[] docs = IntsRef.EMPTY_INTS; + + /** Scores */ + public float[] scores = EMPTY_FLOATS; + + /** Number of valid entries in the doc ID and score arrays. */ + public int size; + + /** Sole constructor. */ + public DocAndScoreBuffer() {} + + /** Grow both arrays to ensure that they can store at least the given number of entries. 
*/ + public void growNoCopy(int minSize) { + if (docs.length < minSize) { + docs = ArrayUtil.growNoCopy(docs, minSize); + scores = new float[docs.length]; + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/search/MaxScoreBulkScorer.java b/lucene/core/src/java/org/apache/lucene/search/MaxScoreBulkScorer.java index 40e44b7c4286..d1d2edac6849 100644 --- a/lucene/core/src/java/org/apache/lucene/search/MaxScoreBulkScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/MaxScoreBulkScorer.java @@ -51,6 +51,9 @@ final class MaxScoreBulkScorer extends BulkScorer { private final long[] windowMatches = new long[FixedBitSet.bits2words(INNER_WINDOW_SIZE)]; private final double[] windowScores = new double[INNER_WINDOW_SIZE]; + private final DocAndScoreBuffer docAndScoreBuffer = new DocAndScoreBuffer(); + private final DocAndScoreAccBuffer docAndScoreAccBuffer = new DocAndScoreAccBuffer(); + MaxScoreBulkScorer(int maxDoc, List scorers, Scorer filter) throws IOException { this.maxDoc = maxDoc; this.filter = filter == null ? null : new DisiWrapper(filter, false); @@ -218,12 +221,18 @@ private void scoreInnerWindowSingleEssentialClause( // single essential clause in this window, we can iterate it directly and skip the bitset. 
// this is a common case for 2-clauses queries - for (int doc = top.doc; doc < upTo; doc = top.iterator.nextDoc()) { - if (acceptDocs != null && acceptDocs.get(doc) == false) { - continue; + for (top.scorer.nextDocsAndScores(upTo, acceptDocs, docAndScoreBuffer); + docAndScoreBuffer.size > 0; + top.scorer.nextDocsAndScores(upTo, acceptDocs, docAndScoreBuffer)) { + for (int i = 0; i < docAndScoreBuffer.size; ++i) { + scoreNonEssentialClauses( + collector, + docAndScoreBuffer.docs[i], + docAndScoreBuffer.scores[i], + firstEssentialScorer); } - scoreNonEssentialClauses(collector, doc, top.scorable.score(), firstEssentialScorer); } + top.doc = top.iterator.docID(); essentialQueue.updateTop(); } @@ -235,63 +244,34 @@ private void scoreInnerWindowAsConjunction(LeafCollector collector, Bits acceptD DisiWrapper lead1 = allScorers[allScorers.length - 1]; assert essentialQueue.size() == 1; assert lead1 == essentialQueue.top(); - DisiWrapper lead2 = allScorers[allScorers.length - 2]; - if (lead1.doc < lead2.doc) { - lead1.doc = lead1.iterator.advance(Math.min(lead2.doc, max)); - } - // maximum score contribution of all scorers but the lead - double maxScoreSumAtLead2 = maxScoreSums[allScorers.length - 2]; - outer: - while (lead1.doc < max) { + for (lead1.scorer.nextDocsAndScores(max, acceptDocs, docAndScoreBuffer); + docAndScoreBuffer.size > 0; + lead1.scorer.nextDocsAndScores(max, acceptDocs, docAndScoreBuffer)) { - if (acceptDocs != null && acceptDocs.get(lead1.doc) == false) { - lead1.doc = lead1.iterator.nextDoc(); - continue; - } + docAndScoreAccBuffer.copyFrom(docAndScoreBuffer); - double score = lead1.scorable.score(); + for (int i = allScorers.length - 2; i >= firstRequiredScorer; --i) { - // We specialize handling the second best scorer, which seems to help a bit with performance. - // But this is the exact same logic as in the below for loop. 
- if ((float) MathUtil.sumUpperBound(score + maxScoreSumAtLead2, allScorers.length) - < minCompetitiveScore) { - // a competitive match is not possible according to max scores, skip to the next candidate - lead1.doc = lead1.iterator.nextDoc(); - continue; - } + if (minCompetitiveScore > 0) { + ScorerUtil.filterCompetitiveHits( + docAndScoreAccBuffer, maxScoreSums[i], minCompetitiveScore, allScorers.length); + } - if (lead2.doc < lead1.doc) { - lead2.doc = lead2.iterator.advance(lead1.doc); + allScorers[i].scorer.applyAsRequiredClause(docAndScoreAccBuffer); } - if (lead2.doc != lead1.doc) { - lead1.doc = lead1.iterator.advance(Math.min(lead2.doc, max)); - continue; - } - - score += lead2.scorable.score(); - for (int i = allScorers.length - 3; i >= firstRequiredScorer; --i) { - if ((float) MathUtil.sumUpperBound(score + maxScoreSums[i], allScorers.length) - < minCompetitiveScore) { - // a competitive match is not possible according to max scores, skip to the next candidate - lead1.doc = lead1.iterator.nextDoc(); - continue outer; - } - - DisiWrapper w = allScorers[i]; - if (w.doc < lead1.doc) { - w.doc = w.iterator.advance(lead1.doc); - } - if (w.doc != lead1.doc) { - lead1.doc = lead1.iterator.advance(Math.min(w.doc, max)); - continue outer; - } - score += w.scorable.score(); + for (int i = 0; i < docAndScoreAccBuffer.size; ++i) { + scoreNonEssentialClauses( + collector, + docAndScoreAccBuffer.docs[i], + docAndScoreAccBuffer.scores[i], + firstRequiredScorer); } + } - scoreNonEssentialClauses(collector, lead1.doc, score, firstRequiredScorer); - lead1.doc = lead1.iterator.nextDoc(); + for (int i = allScorers.length - 1; i >= firstRequiredScorer; --i) { + allScorers[i].doc = allScorers[i].iterator.docID(); } } @@ -304,13 +284,18 @@ private void scoreInnerWindowMultipleEssentialClauses( // Collect matches of essential clauses into a bitset do { - for (int doc = top.doc; doc < innerWindowMax; doc = top.iterator.nextDoc()) { - if (acceptDocs == null || 
acceptDocs.get(doc)) { + for (top.scorer.nextDocsAndScores(innerWindowMax, acceptDocs, docAndScoreBuffer); + docAndScoreBuffer.size > 0; + top.scorer.nextDocsAndScores(innerWindowMax, acceptDocs, docAndScoreBuffer)) { + for (int index = 0; index < docAndScoreBuffer.size; ++index) { + final int doc = docAndScoreBuffer.docs[index]; + final float score = docAndScoreBuffer.scores[index]; final int i = doc - innerWindowMin; windowMatches[i >>> 6] |= 1L << i; - windowScores[i] += top.scorable.score(); + windowScores[i] += score; } } + top.doc = top.iterator.docID(); top = essentialQueue.updateTop(); } while (top.doc < innerWindowMax); diff --git a/lucene/core/src/java/org/apache/lucene/search/Scorer.java b/lucene/core/src/java/org/apache/lucene/search/Scorer.java index 7da2420207ec..d63c5ef79df8 100644 --- a/lucene/core/src/java/org/apache/lucene/search/Scorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/Scorer.java @@ -17,6 +17,7 @@ package org.apache.lucene.search; import java.io.IOException; +import org.apache.lucene.util.Bits; /** * Expert: Common scoring functionality for different types of queries. @@ -76,4 +77,80 @@ public int advanceShallow(int target) throws IOException { * {@link #advanceShallow(int) shallow-advanced} to included and {@code upTo} included. */ public abstract float getMaxScore(int upTo) throws IOException; + + /** + * Return a new batch of doc IDs and scores, starting at the current doc ID, and ending before + * {@code upTo}. Because it starts on the current doc ID, it is illegal to call this method if the + * {@link #docID() current doc ID} is {@code -1}. + * + *

An empty return value indicates that there are no postings left between the current doc ID + * and {@code upTo}. + * + *

Implementations should ideally fill the buffer with a number of entries between 8 + and a couple hundred, to keep heap requirements contained, while still being large enough to + enable operations on the buffer to auto-vectorize efficiently. + + *

The default implementation is provided below: + * + *

+   * int batchSize = 16; // arbitrary
+   * buffer.growNoCopy(batchSize);
+   * int size = 0;
+   * DocIdSetIterator iterator = iterator();
+   * for (int doc = docID(); doc < upTo && size < batchSize; doc = iterator.nextDoc()) {
+   *   if (liveDocs == null || liveDocs.get(doc)) {
+   *     buffer.docs[size] = doc;
+   *     buffer.scores[size] = score();
+   *     ++size;
+   *   }
+   * }
+   * buffer.size = size;
+   * 
+ * + *

NOTE: The provided {@link DocAndScoreBuffer} should not hold references to internal + * data structures. + * + *

NOTE: In case this {@link Scorer} exposes a {@link #twoPhaseIterator() + * TwoPhaseIterator}, it should be positioned on a matching document before this method is called. + * + * @lucene.internal + */ + public void nextDocsAndScores(int upTo, Bits liveDocs, DocAndScoreBuffer buffer) + throws IOException { + int batchSize = 16; // arbitrary + buffer.growNoCopy(batchSize); + int size = 0; + DocIdSetIterator iterator = iterator(); + for (int doc = docID(); doc < upTo && size < batchSize; doc = iterator.nextDoc()) { + if (liveDocs == null || liveDocs.get(doc)) { + buffer.docs[size] = doc; + buffer.scores[size] = score(); + ++size; + } + } + buffer.size = size; + } + + /** + * Apply this {@link Scorer} as a required clause on the given {@link DocAndScoreAccBuffer}. This + * filters out documents from the buffer that do not match this scorer, and adds the scores of + * this {@link Scorer} to the scores. + */ + public void applyAsRequiredClause(DocAndScoreAccBuffer buffer) throws IOException { + DocIdSetIterator iterator = iterator(); + int intersectionSize = 0; + int curDoc = iterator.docID(); + for (int i = 0; i < buffer.size; ++i) { + int targetDoc = buffer.docs[i]; + if (curDoc < targetDoc) { + curDoc = iterator.advance(targetDoc); + } + if (curDoc == targetDoc) { + buffer.docs[intersectionSize] = targetDoc; + buffer.scores[intersectionSize] = buffer.scores[i] + score(); + intersectionSize++; + } + } + buffer.size = intersectionSize; + } } diff --git a/lucene/core/src/java/org/apache/lucene/search/ScorerUtil.java b/lucene/core/src/java/org/apache/lucene/search/ScorerUtil.java index a08b36413a58..881ad4ed6609 100644 --- a/lucene/core/src/java/org/apache/lucene/search/ScorerUtil.java +++ b/lucene/core/src/java/org/apache/lucene/search/ScorerUtil.java @@ -22,6 +22,7 @@ import org.apache.lucene.index.ImpactsEnum; import org.apache.lucene.util.Bits; import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.MathUtil; import 
org.apache.lucene.util.PriorityQueue; /** Util class for Scorer related methods */ @@ -117,4 +118,22 @@ public int length() { return in.length(); } } + + static void filterCompetitiveHits( + DocAndScoreAccBuffer buffer, + double maxRemainingScore, + float minCompetitiveScore, + int numScorers) { + int newSize = 0; + for (int i = 0; i < buffer.size; ++i) { + float maxPossibleScore = + (float) MathUtil.sumUpperBound(buffer.scores[i] + maxRemainingScore, numScorers); + if (maxPossibleScore >= minCompetitiveScore) { + buffer.docs[newSize] = buffer.docs[i]; + buffer.scores[newSize] = buffer.scores[i]; + newSize++; + } + } + buffer.size = newSize; + } } diff --git a/lucene/core/src/java/org/apache/lucene/search/TermScorer.java b/lucene/core/src/java/org/apache/lucene/search/TermScorer.java index 4b53788f233e..47d9eae80b6c 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TermScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/TermScorer.java @@ -17,11 +17,16 @@ package org.apache.lucene.search; import java.io.IOException; +import java.util.Arrays; import org.apache.lucene.index.ImpactsEnum; import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.SlowImpactsEnum; import org.apache.lucene.search.similarities.Similarity.SimScorer; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.LongsRef; /** * Expert: A Scorer for documents matching a Term. @@ -35,6 +40,9 @@ public final class TermScorer extends Scorer { private final NumericDocValues norms; private final ImpactsDISI impactsDisi; private final MaxScoreCache maxScoreCache; + private DocAndFreqBuffer docAndFreqBuffer; + private int[] freqs = IntsRef.EMPTY_INTS; + private long[] normValues = LongsRef.EMPTY_LONGS; /** Construct a {@link TermScorer} that will iterate all documents. 
*/ public TermScorer(PostingsEnum postingsEnum, SimScorer scorer, NumericDocValues norms) { @@ -120,4 +128,81 @@ public void setMinCompetitiveScore(float minScore) { impactsDisi.setMinCompetitiveScore(minScore); } } + + @Override + public void nextDocsAndScores(int upTo, Bits liveDocs, DocAndScoreBuffer buffer) + throws IOException { + if (docAndFreqBuffer == null) { + docAndFreqBuffer = new DocAndFreqBuffer(); + } + + for (; ; ) { + postingsEnum.nextPostings(upTo, docAndFreqBuffer); + if (liveDocs != null && docAndFreqBuffer.size != 0) { + // An empty return value indicates that there are no more docs before upTo. We may be + // unlucky, and there are docs left, but all docs from the current batch happen to be marked + // as deleted. So we need to iterate until we find a batch that has at least one non-deleted + // doc. + docAndFreqBuffer.apply(liveDocs); + if (docAndFreqBuffer.size == 0) { + continue; + } + } + break; + } + + int size = docAndFreqBuffer.size; + normValues = ArrayUtil.grow(normValues, size); + if (norms == null) { + Arrays.fill(normValues, 0, size, 1L); + } else { + for (int i = 0; i < size; ++i) { + if (norms.advanceExact(docAndFreqBuffer.docs[i])) { + normValues[i] = norms.longValue(); + } else { + normValues[i] = 1L; + } + } + } + + buffer.growNoCopy(size); + buffer.size = size; + System.arraycopy(docAndFreqBuffer.docs, 0, buffer.docs, 0, size); + for (int i = 0; i < size; ++i) { + // Unless SimScorer#score is megamorphic, SimScorer#score should inline and (part of) score + // computations should auto-vectorize. 
+ buffer.scores[i] = scorer.score(docAndFreqBuffer.freqs[i], normValues[i]); + } + } + + @Override + public void applyAsRequiredClause(DocAndScoreAccBuffer buffer) throws IOException { + freqs = ArrayUtil.growNoCopy(freqs, buffer.size); + normValues = ArrayUtil.growNoCopy(normValues, buffer.size); + + int intersectionSize = 0; + int curDoc = iterator.docID(); + for (int i = 0; i < buffer.size; ++i) { + int targetDoc = buffer.docs[i]; + if (curDoc < targetDoc) { + curDoc = iterator.advance(targetDoc); + } + if (curDoc == targetDoc) { + buffer.docs[intersectionSize] = targetDoc; + buffer.scores[intersectionSize] = buffer.scores[i]; + freqs[intersectionSize] = postingsEnum.freq(); + if (norms == null || norms.advanceExact(targetDoc) == false) { + normValues[intersectionSize] = 1L; + } else { + normValues[intersectionSize] = norms.longValue(); + } + intersectionSize++; + } + } + + buffer.size = intersectionSize; + for (int i = 0; i < intersectionSize; ++i) { + buffer.scores[i] += scorer.score(freqs[i], normValues[i]); + } + } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/AssertingLeafReader.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/AssertingLeafReader.java index c56f880c0893..cb9d83e8ac39 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/AssertingLeafReader.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/AssertingLeafReader.java @@ -50,6 +50,7 @@ import org.apache.lucene.index.TermsEnum; import org.apache.lucene.internal.tests.IndexPackageAccess; import org.apache.lucene.internal.tests.TestSecrets; +import org.apache.lucene.search.DocAndFreqBuffer; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; @@ -578,6 +579,20 @@ public BytesRef getPayload() throws IOException { return payload; } + @Override + public void nextPostings(int upTo, DocAndFreqBuffer buffer) throws IOException { + assert state 
!= DocsEnumState.START : "nextPostings() called before nextDoc()/advance()"; + in.nextPostings(upTo, buffer); + doc = in.docID(); + if (doc == DocIdSetIterator.NO_MORE_DOCS) { + state = DocsEnumState.FINISHED; + positionMax = 0; + } else { + state = DocsEnumState.ITERATING; + positionMax = super.freq(); + } + } + void reset() { state = DocsEnumState.START; doc = in.docID(); diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/search/AssertingScorer.java b/lucene/test-framework/src/java/org/apache/lucene/tests/search/AssertingScorer.java index 0e9d8bfa9ba9..e0819a069c86 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/search/AssertingScorer.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/search/AssertingScorer.java @@ -20,11 +20,14 @@ import java.util.Collection; import java.util.Collections; import java.util.Random; +import org.apache.lucene.search.DocAndScoreAccBuffer; +import org.apache.lucene.search.DocAndScoreBuffer; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.FilterDocIdSetIterator; import org.apache.lucene.search.ScoreMode; import org.apache.lucene.search.Scorer; import org.apache.lucene.search.TwoPhaseIterator; +import org.apache.lucene.util.Bits; import org.apache.lucene.util.FixedBitSet; /** Wraps a Scorer with additional checks */ @@ -278,4 +281,32 @@ public String toString() { } }; } + + @Override + public void nextDocsAndScores(int upTo, Bits liveDocs, DocAndScoreBuffer buffer) + throws IOException { + assert doc != -1; + in.nextDocsAndScores(upTo, liveDocs, buffer); + if (doc != in.iterator().docID()) { + doc = in.iterator().docID(); + if (doc == DocIdSetIterator.NO_MORE_DOCS) { + state = IteratorState.FINISHED; + } else { + state = IteratorState.ITERATING; + } + } + } + + @Override + public void applyAsRequiredClause(DocAndScoreAccBuffer buffer) throws IOException { + in.applyAsRequiredClause(buffer); + if (doc != in.iterator().docID()) { + doc = 
in.iterator().docID(); + if (doc == DocIdSetIterator.NO_MORE_DOCS) { + state = IteratorState.FINISHED; + } else { + state = IteratorState.ITERATING; + } + } + } }