From 874ab87b6b9347b61d109f8584582cc243ee0dba Mon Sep 17 00:00:00 2001
From: Adrien Grand
Date: Tue, 21 Jan 2025 10:03:02 +0100
Subject: [PATCH] Add small bias towards bit set encoding.

Currently, blocks of postings get encoded as a bit set instead of packed
deltas (FOR) whenever the bit set is more storage-efficient. However, the
bit set approach is considerably more CPU-efficient at search time, so this
PR introduces a small bias towards the bit set encoding by using it as soon
as it is more storage-efficient than FOR with the next number of bits per
value. The impact on storage efficiency on the Wikipedia dataset is
negligible (+0.15% on `.doc` files, and `.doc` files don't dominate storage
requirements anyway, positions do) while some queries get a good speedup.
---
 .../lucene101/Lucene101PostingsWriter.java    | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsWriter.java
index 1cabefe681ef..3d19a69b82d8 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsWriter.java
@@ -424,15 +424,17 @@ private void flushDocBlock(boolean finishTerm) throws IOException {
     long numSkipBytes = level0Output.size();
     // Now we need to decide whether to encode block deltas as packed integers (FOR) or unary
     // codes (bit set). FOR makes #nextDoc() a bit faster while the bit set approach makes
-    // #advance() sometimes faster and #intoBitSet() much faster. Since the trade-off is not
-    // obvious, we make the decision purely based on storage efficiency, using the approach that
-    // requires fewer bits to encode the block.
+    // #advance() usually faster and #intoBitSet() much faster. In the end, we make the decision
+    // based on storage requirements, picking the bit set approach whenever it's more
+    // storage-efficient than FOR with the next number of bits per value (which slightly biases
+    // the decision towards the bit set approach).
     int bitsPerValue = forDeltaUtil.bitsRequired(docDeltaBuffer);
     int sum = Math.toIntExact(Arrays.stream(docDeltaBuffer).sum());
     int numBitSetLongs = FixedBitSet.bits2words(sum);
+    int numBitsNextBitsPerValue = Math.min(Integer.SIZE, bitsPerValue + 1) * BLOCK_SIZE;
     if (sum == BLOCK_SIZE) {
       level0Output.writeByte((byte) 0);
-    } else if (version < VERSION_DENSE_BLOCKS_AS_BITSETS || bitsPerValue * BLOCK_SIZE < sum) {
+    } else if (version < VERSION_DENSE_BLOCKS_AS_BITSETS || numBitsNextBitsPerValue <= sum) {
       level0Output.writeByte((byte) bitsPerValue);
       forDeltaUtil.encodeDeltas(bitsPerValue, docDeltaBuffer, level0Output);
     } else {
@@ -444,10 +446,9 @@ private void flushDocBlock(boolean finishTerm) throws IOException {
         s += i;
         spareBitSet.set(s);
       }
-      // Since we use the bit set encoding when it's more storage efficient than storing deltas,
-      // we know that each doc ID uses less than 32 bits, the maximum number of bits required to
-      // store a delta between consecutive doc IDs. So in the end, the bit set cannot have more
-      // than BLOCK_SIZE * Integer.SIZE / Long.SIZE = 64 longs, which fits on a byte.
+      // We never use the bit set encoding when it requires more than Integer.SIZE=32 bits per
+      // value. So the bit set cannot have more than BLOCK_SIZE * Integer.SIZE / Long.SIZE = 64
+      // longs, which fits on a byte.
      assert numBitSetLongs <= BLOCK_SIZE / 2;
      level0Output.writeByte((byte) -numBitSetLongs);
      for (int i = 0; i < numBitSetLongs; ++i) {
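For illustration, here is a minimal standalone sketch of the decision rule this patch introduces, not the actual Lucene code: the class and the `useBitSet` helper are made up for this example, and `BLOCK_SIZE = 128` is assumed to match the postings block size used by the Lucene101 postings format.

```java
// Standalone sketch of the encoding decision described above (hypothetical
// names, not part of the patch). A block of 128 doc-ID deltas can be stored
// either as packed deltas (FOR) or as a bit set spanning `sum` bits, where
// `sum` is the sum of the deltas in the block.
public class BitSetBiasSketch {

  static final int BLOCK_SIZE = 128;

  /**
   * Decides whether a block should be encoded as a bit set rather than FOR.
   *
   * @param bitsPerValue bits needed to pack the largest delta with FOR
   * @param sum sum of the deltas, i.e. the number of bits the bit set spans
   */
  static boolean useBitSet(int bitsPerValue, int sum) {
    // Storage cost of FOR with one extra bit per value, capped at 32 bits,
    // the maximum delta width between consecutive doc IDs.
    int numBitsNextBitsPerValue = Math.min(Integer.SIZE, bitsPerValue + 1) * BLOCK_SIZE;
    // The patch prefers the bit set unless FOR with bitsPerValue + 1 would
    // still be at least as compact, which slightly biases towards the bit set.
    return sum < numBitsNextBitsPerValue;
  }

  public static void main(String[] args) {
    // Deltas fit in 5 bits, so FOR costs 5 * 128 = 640 bits. The old rule
    // picked the bit set only when it was no larger than that (sum <= 640);
    // the new rule picks it whenever sum < 6 * 128 = 768.
    System.out.println(useBitSet(5, 700)); // true: 700 < 768, bit set wins now
    System.out.println(useBitSet(5, 800)); // false: 800 >= 768, FOR still wins
    // Since sum < 32 * 128 whenever the bit set is chosen, the bit set never
    // needs more than 32 * 128 / 64 = 64 longs, which is what the assert in
    // the second hunk checks.
  }
}
```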