From 90835c52a417ecdb0feb21024d509dcfe2af9f4d Mon Sep 17 00:00:00 2001
From: Ramakrishna chilaka
Date: Tue, 26 Aug 2025 17:18:38 +0530
Subject: [PATCH] PostingDecodingUtil: interchange loops to enable better
 memory access and SIMD vectorisation

---
Review notes (placed below the "---" marker, so they are not part of the commit message):
- The class is named PostingDecodingUtil (see the file path and the constructor in the
  last hunk); the subject and the CHANGES entry previously said "PostingsDecodingUtil".
- CHANGES.txt: the entry is added under "New Features" although the change is an
  optimization; consider moving it to the section that lists optimizations.
- The line breaks of the context lines in the CHANGES.txt hunk were inferred from the
  hunk header counts; verify them against the target file before applying.
- The subject, the CHANGES entry and three code comments were reworded by hand, so the
  post-image blob ids on the "index" lines no longer describe the added content.

 lucene/CHANGES.txt                            |  2 ++
 .../jmh/PostingIndexInputBenchmark.java       | 16 ++++++++++++++++
 .../vectorization/PostingDecodingUtil.java    | 19 +++++++++++++------
 3 files changed, 31 insertions(+), 6 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index e90ca7c9edb0..050621b72c6f 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -150,6 +150,8 @@ New Features
 
 * GITHUB#14729: Support for Re-Ranking Queries using Late Interaction Model Multi-Vectors. (Vigya Sharma, Jim Ferenczi)
 
+* GITHUB#15110: PostingDecodingUtil: interchange loops to enable better memory access and SIMD vectorisation. (Ramakrishna Chilaka)
+
 Improvements
 ---------------------
 * GITHUB#14458: Add an IndexDeletion policy that retains the last N commits. (Owais Kazi)
diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/PostingIndexInputBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/PostingIndexInputBenchmark.java
index 0881d0f6bf95..c72809da8475 100644
--- a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/PostingIndexInputBenchmark.java
+++ b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/PostingIndexInputBenchmark.java
@@ -105,4 +105,20 @@ public void decodeAndPrefixSum(Blackhole bh) throws IOException {
     postingIn.decodeAndPrefixSum(bpv, 100, values);
     bh.consume(values);
   }
+
+  @Benchmark
+  @Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
+  public void decodeVector(Blackhole bh) throws IOException {
+    in.seek(3); // random unaligned offset
+    postingIn.decode(bpv, values);
+    bh.consume(values);
+  }
+
+  @Benchmark
+  @Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
+  public void decodeAndPrefixSumVector(Blackhole bh) throws IOException {
+    in.seek(3); // random unaligned offset
+    postingIn.decodeAndPrefixSum(bpv, 100, values);
+    bh.consume(values);
+  }
 }
diff --git a/lucene/core/src/java/org/apache/lucene/internal/vectorization/PostingDecodingUtil.java b/lucene/core/src/java/org/apache/lucene/internal/vectorization/PostingDecodingUtil.java
index e45ce55bbc59..b66600eee353 100644
--- a/lucene/core/src/java/org/apache/lucene/internal/vectorization/PostingDecodingUtil.java
+++ b/lucene/core/src/java/org/apache/lucene/internal/vectorization/PostingDecodingUtil.java
@@ -44,14 +44,21 @@ protected PostingDecodingUtil(IndexInput in) {
   public void splitInts(
       int count, int[] b, int bShift, int dec, int bMask, int[] c, int cIndex, int cMask)
       throws IOException {
-    // Default implementation, which takes advantage of the C2 compiler's loop unrolling and
-    // auto-vectorization.
     in.readInts(c, cIndex, count);
-    int maxIter = (bShift - 1) / dec;
-    for (int i = 0; i < count; ++i) {
-      for (int j = 0; j <= maxIter; ++j) {
-        b[count * j + i] = (c[cIndex + i] >>> (bShift - j * dec)) & bMask;
+    final int maxIter = (bShift - 1) / dec;
+
+    // One shift level per outer iteration, so the inner loop walks b and c contiguously.
+    for (int j = 0; j <= maxIter; ++j) {
+      final int shift = bShift - j * dec;
+      final int bOffset = count * j;
+      // NOTE(review): matches the old i-outer loop only if these b writes never touch c's range.
+      for (int i = 0; i < count; ++i) {
+        b[bOffset + i] = (c[cIndex + i] >>> shift) & bMask;
       }
+    }
+
+    // Mask c only after every shift level has been extracted from its unmasked values.
+    for (int i = 0; i < count; ++i) {
       c[cIndex + i] &= cMask;
     }
   }