From 874ab87b6b9347b61d109f8584582cc243ee0dba Mon Sep 17 00:00:00 2001
From: Adrien Grand
Date: Tue, 21 Jan 2025 10:03:02 +0100
Subject: [PATCH] Add small bias towards bit set encoding.

Currently, blocks of postings get encoded as a bit set instead of packed
deltas (FOR) whenever the bit set is more storage-efficient. However, the
bit set approach is considerably more CPU-efficient at search time, so this
PR introduces a small bias towards the bit set encoding by using it as soon
as it is more storage-efficient than FOR with the next number of bits per
value. The impact on storage efficiency on the Wikipedia dataset is
negligible (+0.15% on `.doc` files, and `.doc` files don't dominate storage
requirements anyway, positions do) while some queries get a good speedup.
---
 .../lucene101/Lucene101PostingsWriter.java    | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsWriter.java
index 1cabefe681ef..3d19a69b82d8 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsWriter.java
@@ -424,15 +424,17 @@ private void flushDocBlock(boolean finishTerm) throws IOException {
     long numSkipBytes = level0Output.size();
     // Now we need to decide whether to encode block deltas as packed integers (FOR) or unary
     // codes (bit set). FOR makes #nextDoc() a bit faster while the bit set approach makes
-    // #advance() sometimes faster and #intoBitSet() much faster. Since the trade-off is not
-    // obvious, we make the decision purely based on storage efficiency, using the approach that
-    // requires fewer bits to encode the block.
+    // #advance() usually faster and #intoBitSet() much faster. In the end, we make the decision
+    // based on storage requirements, picking the bit set approach whenever it's more
+    // storage-efficient than FOR with the next number of bits per value (which slightly biases
+    // the decision towards the bit set approach).
     int bitsPerValue = forDeltaUtil.bitsRequired(docDeltaBuffer);
     int sum = Math.toIntExact(Arrays.stream(docDeltaBuffer).sum());
     int numBitSetLongs = FixedBitSet.bits2words(sum);
+    int numBitsNextBitsPerValue = Math.min(Integer.SIZE, bitsPerValue + 1) * BLOCK_SIZE;
     if (sum == BLOCK_SIZE) {
       level0Output.writeByte((byte) 0);
-    } else if (version < VERSION_DENSE_BLOCKS_AS_BITSETS || bitsPerValue * BLOCK_SIZE < sum) {
+    } else if (version < VERSION_DENSE_BLOCKS_AS_BITSETS || numBitsNextBitsPerValue <= sum) {
       level0Output.writeByte((byte) bitsPerValue);
       forDeltaUtil.encodeDeltas(bitsPerValue, docDeltaBuffer, level0Output);
     } else {
@@ -444,10 +446,9 @@ private void flushDocBlock(boolean finishTerm) throws IOException {
         s += i;
         spareBitSet.set(s);
       }
-      // Since we use the bit set encoding when it's more storage efficient than storing deltas,
-      // we know that each doc ID uses less than 32 bits, the maximum number of bits required to
-      // store a delta between consecutive doc IDs. So in the end, the bit set cannot have more
-      // than BLOCK_SIZE * Integer.SIZE / Long.SIZE = 64 longs, which fits on a byte.
+      // We never use the bit set encoding when it requires more than Integer.SIZE=32 bits per
+      // value. So the bit set cannot have more than BLOCK_SIZE * Integer.SIZE / Long.SIZE = 64
+      // longs, which fits on a byte.
      assert numBitSetLongs <= BLOCK_SIZE / 2;
      level0Output.writeByte((byte) -numBitSetLongs);
      for (int i = 0; i < numBitSetLongs; ++i) {
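For illustration, here is a minimal standalone sketch of the decision rule this patch introduces, not the actual Lucene code: the class and the `useBitSet` helper are made up for this example, and `BLOCK_SIZE = 128` is assumed to match the postings block size used by the Lucene101 postings format.

```java
// Standalone sketch of the encoding decision described above (hypothetical
// names, not part of the patch). A block of 128 doc-ID deltas can be stored
// either as packed deltas (FOR) or as a bit set spanning `sum` bits, where
// `sum` is the sum of the deltas in the block.
public class BitSetBiasSketch {

  static final int BLOCK_SIZE = 128;

  /**
   * Decides whether a block should be encoded as a bit set rather than FOR.
   *
   * @param bitsPerValue bits needed to pack the largest delta with FOR
   * @param sum sum of the deltas, i.e. the number of bits the bit set spans
   */
  static boolean useBitSet(int bitsPerValue, int sum) {
    // Storage cost of FOR with one extra bit per value, capped at 32 bits,
    // the maximum delta width between consecutive doc IDs.
    int numBitsNextBitsPerValue = Math.min(Integer.SIZE, bitsPerValue + 1) * BLOCK_SIZE;
    // The patch prefers the bit set unless FOR with bitsPerValue + 1 would
    // still be at least as compact, which slightly biases towards the bit set.
    return sum < numBitsNextBitsPerValue;
  }

  public static void main(String[] args) {
    // Deltas fit in 5 bits, so FOR costs 5 * 128 = 640 bits. The old rule
    // picked the bit set only when it was no larger than that (sum <= 640);
    // the new rule picks it whenever sum < 6 * 128 = 768.
    System.out.println(useBitSet(5, 700)); // true: 700 < 768, bit set wins now
    System.out.println(useBitSet(5, 800)); // false: 800 >= 768, FOR still wins
    // Since sum < 32 * 128 whenever the bit set is chosen, the bit set never
    // needs more than 32 * 128 / 64 = 64 longs, which is what the assert in
    // the second hunk checks.
  }
}
```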