@@ -697,7 +697,8 @@ private IndexWriter createFastIndexWriter(Directory dir, int maxBufferedDocs) th
     IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
     conf.setMaxBufferedDocs(maxBufferedDocs);
     conf.setRAMBufferSizeMB(-1);
-    conf.setMergePolicy(newLogMergePolicy(random().nextBoolean()));
+    conf.setMergePolicy(newLogMergePolicy());
+    conf.getCodec().compoundFormat().setShouldUseCompoundFile(random().nextBoolean());
     return new IndexWriter(dir, conf);
   }

@@ -727,7 +728,8 @@ private void doTestSortedNumericBlocksOfVariousBitsPerValue(LongSupplier counts)
     conf.setMaxBufferedDocs(atLeast(Lucene80DocValuesFormat.NUMERIC_BLOCK_SIZE));
     conf.setRAMBufferSizeMB(-1);
     // so Lucene docids are predictable / stay in order
-    conf.setMergePolicy(newLogMergePolicy(random().nextBoolean()));
+    conf.setMergePolicy(newLogMergePolicy());
+    conf.getCodec().compoundFormat().setShouldUseCompoundFile(random().nextBoolean());
     IndexWriter writer = new IndexWriter(dir, conf);

     final int numDocs = atLeast(Lucene80DocValuesFormat.NUMERIC_BLOCK_SIZE * 3);
@@ -797,7 +799,8 @@ private void doTestSparseNumericBlocksOfVariousBitsPerValue(double density) thro
     IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
     conf.setMaxBufferedDocs(atLeast(Lucene80DocValuesFormat.NUMERIC_BLOCK_SIZE));
     conf.setRAMBufferSizeMB(-1);
-    conf.setMergePolicy(newLogMergePolicy(random().nextBoolean()));
+    conf.setMergePolicy(newLogMergePolicy());
+    conf.getCodec().compoundFormat().setShouldUseCompoundFile(random().nextBoolean());
     IndexWriter writer = new IndexWriter(dir, conf);
     Document doc = new Document();
     Field storedField = newStringField("stored", "", Field.Store.YES);
@@ -53,7 +53,6 @@
 import org.apache.lucene.index.IndexableField;
 import org.apache.lucene.index.KnnVectorValues;
 import org.apache.lucene.index.LeafReaderContext;
-import org.apache.lucene.index.LogByteSizeMergePolicy;
 import org.apache.lucene.index.MultiBits;
 import org.apache.lucene.index.MultiDocValues;
 import org.apache.lucene.index.MultiTerms;
@@ -125,15 +124,14 @@ protected void createIndex(Directory directory) throws IOException {
   }

   static void createIndex(Directory dir, boolean doCFS, boolean fullyMerged) throws IOException {
-    LogByteSizeMergePolicy mp = new LogByteSizeMergePolicy();
-    mp.setNoCFSRatio(doCFS ? 1.0 : 0.0);
-    mp.setMaxCFSSegmentSizeMB(Double.POSITIVE_INFINITY);
     // TODO: remove randomness
     IndexWriterConfig conf =
         new IndexWriterConfig(new MockAnalyzer(random()))
             .setMaxBufferedDocs(10)
             .setCodec(TestUtil.getDefaultCodec())
             .setMergePolicy(NoMergePolicy.INSTANCE);
+    conf.getCodec().compoundFormat().setShouldUseCompoundFile(doCFS);
+    conf.getCodec().compoundFormat().setMaxCFSSegmentSizeMB(Double.POSITIVE_INFINITY);
     IndexWriter writer = new IndexWriter(dir, conf);

     for (int i = 0; i < DOCS_COUNT; i++) {
@@ -147,14 +145,13 @@ static void createIndex(Directory dir, boolean doCFS, boolean fullyMerged) throw

     if (!fullyMerged) {
       // open fresh writer so we get no prx file in the added segment
-      mp = new LogByteSizeMergePolicy();
-      mp.setNoCFSRatio(doCFS ? 1.0 : 0.0);
       // TODO: remove randomness
       conf =
           new IndexWriterConfig(new MockAnalyzer(random()))
               .setMaxBufferedDocs(10)
               .setCodec(TestUtil.getDefaultCodec())
               .setMergePolicy(NoMergePolicy.INSTANCE);
+      conf.getCodec().compoundFormat().setShouldUseCompoundFile(doCFS);
       writer = new IndexWriter(dir, conf);
       addNoProxDoc(writer);
       writer.close();
@@ -153,15 +153,15 @@ public void testSortedIndex() throws Exception {
   @Override
   protected void createIndex(Directory directory) throws IOException {
     LogByteSizeMergePolicy mp = new LogByteSizeMergePolicy();
-    mp.setNoCFSRatio(1.0);
-    mp.setMaxCFSSegmentSizeMB(Double.POSITIVE_INFINITY);
     MockAnalyzer analyzer = new MockAnalyzer(random());

     // Don't filter out tokens that are too short because we use those tokens in assertions (#14344)
     analyzer.setMaxTokenLength(RandomizedTest.randomIntBetween(5, IndexWriter.MAX_TERM_LENGTH));

     // TODO: remove randomness
     IndexWriterConfig conf = new IndexWriterConfig(analyzer);
+    conf.getCodec().compoundFormat().setShouldUseCompoundFile(true);
+    conf.getCodec().compoundFormat().setMaxCFSSegmentSizeMB(Double.POSITIVE_INFINITY);
     conf.setMergePolicy(mp);
     conf.setUseCompoundFile(false);
     conf.setCodec(TestUtil.getDefaultCodec());
@@ -57,8 +57,6 @@ public static Iterable<Object[]> testVersionsFactory() {
   @Override
   protected void createIndex(Directory directory) throws IOException {
     LogByteSizeMergePolicy mp = new LogByteSizeMergePolicy();
-    mp.setNoCFSRatio(1.0);
-    mp.setMaxCFSSegmentSizeMB(Double.POSITIVE_INFINITY);
     MockAnalyzer analyzer = new MockAnalyzer(random());
     analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH));
@@ -67,6 +65,8 @@ protected void createIndex(Directory directory) throws IOException {
             .setMergePolicy(mp)
             .setCodec(TestUtil.getDefaultCodec())
             .setUseCompoundFile(false);
+    conf.getCodec().compoundFormat().setShouldUseCompoundFile(true);
+    conf.getCodec().compoundFormat().setMaxCFSSegmentSizeMB(Double.POSITIVE_INFINITY);
     IndexWriter writer = new IndexWriter(directory, conf);
     LineFileDocs docs = new LineFileDocs(new Random(0));
     for (int i = 0; i < 50; i++) {
@@ -180,7 +180,7 @@ public PostingsFormat postingsFormat() {
       throw new RuntimeException(
           "unable to instantiate class '" + mergePolicy + "' as merge policy", e);
     }
-    iwConf.getMergePolicy().setNoCFSRatio(isCompound ? 1.0 : 0.0);
+    iwConf.getCodec().compoundFormat().setShouldUseCompoundFile(isCompound);
     if (iwConf.getMergePolicy() instanceof LogMergePolicy) {
       LogMergePolicy logMergePolicy = (LogMergePolicy) iwConf.getMergePolicy();
       logMergePolicy.setMergeFactor(
@@ -642,7 +642,7 @@ public void testIndexWriterSettings() throws Exception {
     assertEquals(
         IndexWriterConfig.DISABLE_AUTO_FLUSH, (int) writer.getConfig().getRAMBufferSizeMB());
     assertEquals(3, ((LogMergePolicy) writer.getConfig().getMergePolicy()).getMergeFactor());
-    assertEquals(0.0d, writer.getConfig().getMergePolicy().getNoCFSRatio(), 0.0);
+    assertFalse(writer.getConfig().getCodec().compoundFormat().getShouldUseCompoundFile());
     writer.close();
     Directory dir = benchmark.getRunData().getDirectory();
     IndexReader reader = DirectoryReader.open(dir);
148 changes: 148 additions & 0 deletions lucene/core/src/java/org/apache/lucene/codecs/CompoundFormat.java
@@ -17,6 +17,8 @@
 package org.apache.lucene.codecs;

 import java.io.IOException;
+import org.apache.lucene.index.LogDocMergePolicy;
+import org.apache.lucene.index.MergePolicy;
 import org.apache.lucene.index.SegmentInfo;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
@@ -34,6 +36,152 @@ protected CompoundFormat() {}
  // TODO: this is very minimal. If we need more methods,
  // we can add 'producer' classes.

  /** Default document count threshold for using compound files with LogDocMergePolicy */
  static final int DEFAULT_CFS_THRESHOLD_DOC_SIZE = 65536; // docs

  /** Default byte size threshold for using compound files with other merge policies (64MB) */
  static final long DEFAULT_CFS_THRESHOLD_BYTE_SIZE = 64L * 1024 * 1024; // 64MB

  /** Default maximum segment size allowed for compound files (no limit) */
  static final long DEFAULT_MAX_CFS_SEGMENT_SIZE = Long.MAX_VALUE;

  /** Document count threshold for LogDocMergePolicy */
  private int cfsThresholdDocSize = DEFAULT_CFS_THRESHOLD_DOC_SIZE;

  /** Byte size threshold for other merge policies */
  private long cfsThresholdByteSize = DEFAULT_CFS_THRESHOLD_BYTE_SIZE;

  /** Whether compound files should be used at all */
  private boolean shouldUseCompoundFile = true;

  /** Maximum segment size that can be stored as a compound file */
  private long maxCFSSegmentSize = DEFAULT_MAX_CFS_SEGMENT_SIZE;

  /**
   * Sets the document count threshold for using compound files with LogDocMergePolicy. Segments
   * with a document count less than or equal to this threshold will use compound files.
   *
   * @param threshold the document count threshold
   */
  public void setCfsThresholdDocSize(int threshold) {
    this.cfsThresholdDocSize = threshold;
  }

  /**
   * Sets the byte size threshold for using compound files with merge policies other than
   * LogDocMergePolicy. Segments with a size less than or equal to this threshold will use compound
   * files.
   *
   * @param thresholdBytes the byte size threshold in bytes
   */
  public void setCfsThresholdByteSize(long thresholdBytes) {
    this.cfsThresholdByteSize = thresholdBytes;
  }

  /**
   * Returns the current document count threshold for compound files.
   *
   * @return the document count threshold
   */
  public int getCfsThresholdDocSize() {
    return this.cfsThresholdDocSize;
  }

  /**
   * Returns the current byte size threshold for compound files.
   *
   * @return the byte size threshold in bytes
   */
  public long getCfsThresholdByteSize() {
    return this.cfsThresholdByteSize;
  }

  /**
   * Enables or disables the use of compound files entirely. When disabled, no segments will use
   * compound files regardless of other settings.
   *
   * @param useCompoundFile true to enable compound files, false to disable
   */
  public void setShouldUseCompoundFile(boolean useCompoundFile) {
    this.shouldUseCompoundFile = useCompoundFile;
  }

  /**
   * Returns whether compound files are enabled.
   *
   * @return true if compound files are enabled, false otherwise
   */
  public boolean getShouldUseCompoundFile() {
    return this.shouldUseCompoundFile;
  }

  /**
   * Returns the largest size allowed for a compound file segment, in megabytes. Segments larger
   * than this size will not use compound files even if otherwise eligible.
   *
   * @return the maximum compound file segment size in MB
   */
  public double getMaxCFSSegmentSizeMB() {
    return maxCFSSegmentSize / 1024. / 1024.;
  }

  /**
   * Sets the maximum size limit for compound file segments, in megabytes. If a merged segment will
   * be larger than this value, it is left as a non-compound file even if compound files are
   * enabled. Set this to Double.POSITIVE_INFINITY (default) to always use CFS when the other
   * conditions are met.
   *
   * @param v the maximum segment size in MB (must be >= 0)
   * @throws IllegalArgumentException if v is negative
   */
  public void setMaxCFSSegmentSizeMB(double v) {
    if (v < 0.0) {
      throw new IllegalArgumentException("maxCFSSegmentSizeMB must be >=0 (got " + v + ")");
    }
    v *= 1024 * 1024; // convert MB to bytes
    this.maxCFSSegmentSize = v > Long.MAX_VALUE ? Long.MAX_VALUE : (long) v;
  }

  /**
   * Determines whether a segment should use the compound file format based on its size and merge
   * policy.
   *
   * <p>The decision logic is as follows:
   *
   * <ol>
   *   <li>If compound files are disabled globally, return false
   *   <li>If the segment size exceeds the maximum CFS segment size, return false
   *   <li>For LogDocMergePolicy: use CFS if document count ≤ document threshold
   *   <li>For other merge policies: use CFS if byte size ≤ byte threshold
   * </ol>
   *
   * @param mergedInfoSize the size of the segment (document count for LogDocMergePolicy, bytes for
   *     others)
   * @param mergePolicy the merge policy being used
   * @return true if the segment should use the compound file format, false otherwise
   * @throws IOException if an I/O error occurs
   */
  public boolean useCompoundFile(long mergedInfoSize, MergePolicy mergePolicy) throws IOException {
    // Check if compound files are globally disabled
    if (this.shouldUseCompoundFile == false) {
      return false;
    }

    // Check if the segment exceeds the maximum allowed size for CFS
    if (mergedInfoSize > maxCFSSegmentSize) {
      return false;
    }

    // Apply the appropriate threshold based on the merge policy type
    if (mergePolicy instanceof LogDocMergePolicy) {
Review comment (Contributor), on the LogDocMergePolicy check above:

It would be great if we could avoid customizing this for specific policies; otherwise it might be tricky to maintain in the future if, e.g., there is another policy that is based on doc count rather than bytes.

Maybe we can add an enum and a method to MergePolicy which returns its unit (bytes/docs), and use it here to decide which threshold to use?

Or do we want to always choose compound format based on size in bytes, even for LogDocMergePolicy? In that case we might be able to use merge.getMergeInfo().sizeInBytes() when we call this method and avoid relying on MergePolicy#size altogether.

Reply (Contributor Author):

+1 to the idea of enums. I will wait to see if anyone else has other suggestions here, but having an enum makes the most sense to me. (A sketch of the enum idea appears after this file's diff.)

      // For LogDocMergePolicy, mergedInfoSize represents a document count
      return mergedInfoSize <= this.cfsThresholdDocSize;
    } else {
      // For other policies, mergedInfoSize represents a byte size
      return mergedInfoSize <= this.cfsThresholdByteSize;
    }
  }

  /** Returns a Directory view (read-only) for the compound files in this segment */
  public abstract CompoundDirectory getCompoundReader(Directory dir, SegmentInfo si)
      throws IOException;
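For orientation, here is a minimal sketch of how an application might drive these new knobs. It assumes the setters this PR adds to CompoundFormat, plus standard Lucene classes (StandardAnalyzer, ByteBuffersDirectory); the threshold values are arbitrary:

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;

public class CompoundFormatConfigSketch {
  public static void main(String[] args) throws Exception {
    Directory dir = new ByteBuffersDirectory();
    IndexWriterConfig conf = new IndexWriterConfig(new StandardAnalyzer());

    // Merge-time CFS decisions now live on the codec's CompoundFormat
    // (previously MergePolicy.setNoCFSRatio / setMaxCFSSegmentSizeMB):
    conf.getCodec().compoundFormat().setShouldUseCompoundFile(true);
    conf.getCodec().compoundFormat().setMaxCFSSegmentSizeMB(512.0);
    conf.getCodec().compoundFormat().setCfsThresholdByteSize(64L * 1024 * 1024);

    // Flush-time CFS for newly created segments is still a writer-level setting:
    conf.setUseCompoundFile(true);

    try (IndexWriter writer = new IndexWriter(dir, conf)) {
      // ... index documents ...
    }
  }
}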
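And since the review thread above floats a unit enum, here is a rough standalone illustration of that idea. None of this is in the PR; SizeUnit, sizeUnit(), and the class names are invented for the sketch:

// Hypothetical sketch of the review suggestion; not part of this PR.
enum SizeUnit {
  BYTES,
  DOCS
}

abstract class MergePolicySketch {
  // Each policy declares the unit of the sizes it reports; a doc-count-based
  // policy such as LogDocMergePolicy would override this to return DOCS.
  SizeUnit sizeUnit() {
    return SizeUnit.BYTES;
  }
}

class CompoundFormatSketch {
  private boolean shouldUseCompoundFile = true;
  private long maxCFSSegmentSize = Long.MAX_VALUE;
  private int cfsThresholdDocSize = 65536;
  private long cfsThresholdByteSize = 64L * 1024 * 1024;

  // The unit-driven threshold choice replaces the instanceof check.
  boolean useCompoundFile(long mergedInfoSize, MergePolicySketch mergePolicy) {
    if (shouldUseCompoundFile == false || mergedInfoSize > maxCFSSegmentSize) {
      return false;
    }
    return switch (mergePolicy.sizeUnit()) {
      case DOCS -> mergedInfoSize <= cfsThresholdDocSize;
      case BYTES -> mergedInfoSize <= cfsThresholdByteSize;
    };
  }
}

Whether the thresholds should stay unit-specific at all is the open question in the thread; a bytes-only rule would remove the need for the enum as well.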
@@ -75,38 +75,11 @@ public MergeSpecification findFullFlushMerges(
     return in.findFullFlushMerges(mergeTrigger, segmentInfos, mergeContext);
   }

-  @Override
-  public boolean useCompoundFile(
-      SegmentInfos infos, SegmentCommitInfo mergedInfo, MergeContext mergeContext)
-      throws IOException {
-    return in.useCompoundFile(infos, mergedInfo, mergeContext);
-  }
-
   @Override
   protected long size(SegmentCommitInfo info, MergeContext context) throws IOException {
     return in.size(info, context);
   }

-  @Override
-  public double getNoCFSRatio() {
-    return in.getNoCFSRatio();
-  }
-
-  @Override
-  public final void setNoCFSRatio(double noCFSRatio) {
-    in.setNoCFSRatio(noCFSRatio);
-  }
-
-  @Override
-  public final void setMaxCFSSegmentSizeMB(double v) {
-    in.setMaxCFSSegmentSizeMB(v);
-  }
-
-  @Override
-  public final double getMaxCFSSegmentSizeMB() {
-    return in.getMaxCFSSegmentSizeMB();
-  }
-
   @Override
   public String toString() {
     return getClass().getSimpleName() + "(" + in + ")";
16 changes: 14 additions & 2 deletions lucene/core/src/java/org/apache/lucene/index/IndexWriter.java
@@ -3480,7 +3480,13 @@ public void addIndexesReaderMerge(MergePolicy.OneMerge merge) throws IOException
     boolean useCompoundFile;
     synchronized (this) {
       merge.checkAborted();
-      useCompoundFile = mergePolicy.useCompoundFile(segmentInfos, merge.getMergeInfo(), this);
+      useCompoundFile =
+          merge
+              .getMergeInfo()
+              .info
+              .getCodec()
+              .compoundFormat()
+              .useCompoundFile(mergePolicy.size(merge.getMergeInfo(), this), mergePolicy);
     }

     // Now create the compound file if needed
@@ -5336,7 +5342,13 @@ public int length() {
     // this segment:
     boolean useCompoundFile;
     synchronized (this) { // Guard segmentInfos
-      useCompoundFile = mergePolicy.useCompoundFile(segmentInfos, merge.info, this);
+      useCompoundFile =
+          merge
+              .getMergeInfo()
+              .info
+              .getCodec()
+              .compoundFormat()
+              .useCompoundFile(mergePolicy.size(merge.info, this), mergePolicy);
     }

     if (useCompoundFile) {
@@ -368,9 +368,7 @@ public InfoStream getInfoStream() {
    *
    * <p>Use <code>false</code> for batch indexing with very large ram buffer settings.
    *
-   * <p><b>Note: To control compound file usage during segment merges see {@link
-   * MergePolicy#setNoCFSRatio(double)} and {@link MergePolicy#setMaxCFSSegmentSizeMB(double)}. This
-   * setting only applies to newly created segments.</b>
+   * <p><b>Note: To control compound file usage during segment merges, see {@link org.apache.lucene.codecs.CompoundFormat#setShouldUseCompoundFile(boolean)} and {@link org.apache.lucene.codecs.CompoundFormat#setMaxCFSSegmentSizeMB(double)}. This setting only applies to newly created segments.</b>
    */
   public LiveIndexWriterConfig setUseCompoundFile(boolean useCompoundFile) {
     this.useCompoundFile = useCompoundFile;