Skip to content

Commit

Permalink
Support DataInput as source for StoredField (#14213)
Browse files Browse the repository at this point in the history
 Allow indexing a stored-only StoredField directly from a DataInput.
  • Loading branch information
Tim-Brooks authored and iverase committed Feb 19, 2025
1 parent fadee30 commit 2dedd1d
Show file tree
Hide file tree
Showing 19 changed files with 250 additions and 38 deletions.
2 changes: 2 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ Improvements
* GITHUB#14239: Hunspell's option to tolerate affix rule count mismatches was
improved to tolerate more instances of this problem. (Robert Muir)

* GITHUB#14213: Allow indexing a stored-only StoredField directly from a DataInput. (Tim Brooks)

Optimizations
---------------------

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.StoredFieldDataInput;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.ByteArrayDataInput;
Expand Down Expand Up @@ -290,7 +291,7 @@ private static void readField(DataInput in, StoredFieldVisitor visitor, FieldInf
switch (bits & TYPE_MASK) {
case BYTE_ARR:
int length = in.readVInt();
visitor.binaryField(info, in, length);
visitor.binaryField(info, new StoredFieldDataInput(in, length));
break;
case STRING:
visitor.stringField(info, in.readString());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.StoredFieldDataInput;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.BytesRef;

Expand Down Expand Up @@ -73,10 +73,11 @@ public void finishDocument() throws IOException {}
/** Writes a stored double value. */
public abstract void writeField(FieldInfo info, double value) throws IOException;

/** Writes a stored binary value from a {@link DataInput} and a {@code length}. */
public void writeField(FieldInfo info, DataInput value, int length) throws IOException {
/** Writes a stored binary value from a {@link StoredFieldDataInput}. */
public void writeField(FieldInfo info, StoredFieldDataInput value) throws IOException {
int length = value.length();
final byte[] bytes = new byte[length];
value.readBytes(bytes, 0, length);
value.getDataInput().readBytes(bytes, 0, length);
writeField(info, new BytesRef(bytes, 0, length));
}

Expand Down Expand Up @@ -191,8 +192,8 @@ public MergeVisitor(MergeState mergeState, int readerIndex) {
}

@Override
public void binaryField(FieldInfo fieldInfo, DataInput value, int length) throws IOException {
writeField(remap(fieldInfo), value, length);
public void binaryField(FieldInfo fieldInfo, StoredFieldDataInput value) throws IOException {
writeField(remap(fieldInfo), value);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.StoredFieldDataInput;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.ByteArrayDataInput;
Expand Down Expand Up @@ -261,7 +262,7 @@ private static void readField(DataInput in, StoredFieldVisitor visitor, FieldInf
switch (bits & TYPE_MASK) {
case BYTE_ARR:
int length = in.readVInt();
visitor.binaryField(info, in, length);
visitor.binaryField(info, new StoredFieldDataInput(in, length));
break;
case STRING:
visitor.stringField(info, in.readString());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,9 @@
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.StoredFieldDataInput;
import org.apache.lucene.store.ByteBuffersDataInput;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
Expand Down Expand Up @@ -311,12 +311,13 @@ public void writeField(FieldInfo info, BytesRef value) throws IOException {
}

@Override
public void writeField(FieldInfo info, DataInput value, int length) throws IOException {
public void writeField(FieldInfo info, StoredFieldDataInput value) throws IOException {
int length = value.getLength();
++numStoredFieldsInDoc;
final long infoAndBits = (((long) info.number) << TYPE_BITS) | BYTE_ARR;
bufferedDocs.writeVLong(infoAndBits);
bufferedDocs.writeVInt(length);
bufferedDocs.copyBytes(value, length);
bufferedDocs.copyBytes(value.getDataInput(), length);
}

@Override
Expand Down
3 changes: 3 additions & 0 deletions lucene/core/src/java/org/apache/lucene/document/Field.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.IndexableFieldType;
import org.apache.lucene.index.StoredFieldDataInput;
import org.apache.lucene.util.BytesRef;

/**
Expand Down Expand Up @@ -619,6 +620,8 @@ public StoredValue storedValue() {
return new StoredValue((double) fieldsData);
} else if (fieldsData instanceof BytesRef) {
return new StoredValue((BytesRef) fieldsData);
} else if (fieldsData instanceof StoredFieldDataInput) {
return new StoredValue((StoredFieldDataInput) fieldsData);
} else if (fieldsData instanceof String) {
return new StoredValue((String) fieldsData);
} else {
Expand Down
16 changes: 16 additions & 0 deletions lucene/core/src/java/org/apache/lucene/document/StoredField.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package org.apache.lucene.document;

import org.apache.lucene.index.IndexReader; // javadocs
import org.apache.lucene.index.StoredFieldDataInput;
import org.apache.lucene.search.IndexSearcher; // javadocs
import org.apache.lucene.util.BytesRef;

Expand Down Expand Up @@ -105,6 +106,21 @@ public StoredField(String name, BytesRef value) {
super(name, value, TYPE);
}

/**
* Create a stored-only field whose binary value is supplied by the given data input.
*
* @param name field name
* @param value data input, together with its length, that supplies the binary content
* @throws IllegalArgumentException if the field name or value is null.
*/
public StoredField(String name, StoredFieldDataInput value) {
super(name, TYPE);
if (value == null) {
throw new IllegalArgumentException("store field data input must not be null");
}
// The wrapper is stored as-is; no bytes are read from the data input in this constructor.
fieldsData = value;
}

/**
* Create a stored-only field with the given string value.
*
Expand Down
28 changes: 27 additions & 1 deletion lucene/core/src/java/org/apache/lucene/document/StoredValue.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

import java.util.Objects;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.StoredFieldDataInput;
import org.apache.lucene.util.BytesRef;

/**
Expand All @@ -39,6 +40,8 @@ public enum Type {
DOUBLE,
/** Type of binary values. */
BINARY,
/** Type of data input values. */
DATA_INPUT,
/** Type of string values. */
STRING;
}
Expand All @@ -48,6 +51,7 @@ public enum Type {
private long longValue;
private float floatValue;
private double doubleValue;
private StoredFieldDataInput dataInput;
private BytesRef binaryValue;
private String stringValue;

Expand Down Expand Up @@ -81,7 +85,13 @@ public StoredValue(BytesRef value) {
binaryValue = Objects.requireNonNull(value);
}

/** Ctor for binary values. */
/**
 * Ctor for data input values.
 *
 * @param value the data input to hold; must not be null
 * @throws NullPointerException if {@code value} is null
 */
public StoredValue(StoredFieldDataInput value) {
  this.dataInput = Objects.requireNonNull(value);
  this.type = Type.DATA_INPUT;
}

/** Ctor for string values. */
public StoredValue(String value) {
type = Type.STRING;
stringValue = Objects.requireNonNull(value);
Expand Down Expand Up @@ -132,6 +142,14 @@ public void setBinaryValue(BytesRef value) {
binaryValue = Objects.requireNonNull(value);
}

/**
 * Replaces the data input held by this stored value.
 *
 * @param value the new data input; must not be null
 * @throws IllegalArgumentException if this value's type is not {@code DATA_INPUT}
 * @throws NullPointerException if {@code value} is null
 */
public void setDataInputValue(StoredFieldDataInput value) {
  final boolean holdsDataInput = type == Type.DATA_INPUT;
  if (!holdsDataInput) {
    throw new IllegalArgumentException("Cannot set a data input value on a " + type + " value");
  }
  // The type check comes first, so a type mismatch is reported even when value is null.
  Objects.requireNonNull(value);
  this.dataInput = value;
}

/** Set a string value. */
public void setStringValue(String value) {
if (type != Type.STRING) {
Expand Down Expand Up @@ -180,6 +198,14 @@ public BytesRef getBinaryValue() {
return binaryValue;
}

/**
 * Returns the data input held by this stored value.
 *
 * @return the held data input
 * @throws IllegalArgumentException if this value's type is not {@code DATA_INPUT}
 */
public StoredFieldDataInput getDataInputValue() {
  if (type == Type.DATA_INPUT) {
    return this.dataInput;
  }
  throw new IllegalArgumentException("Cannot get a data input value on a " + type + " value");
}

/** Retrieve a string value. */
public String getStringValue() {
if (type != Type.STRING) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -138,8 +138,8 @@ private static class CopyVisitor extends StoredFieldVisitor {
}

@Override
public void binaryField(FieldInfo fieldInfo, DataInput value, int length) throws IOException {
writer.writeField(fieldInfo, value, length);
public void binaryField(FieldInfo fieldInfo, StoredFieldDataInput value) throws IOException {
writer.writeField(fieldInfo, value);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.index;

import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.DataInput;

/**
 * A fixed size DataInput which includes the length of the input. For use as a StoredField.
 *
 * <p>Both components are validated on construction so that an unusable value is rejected where it
 * is created rather than later, when the bytes are read from the input.
 *
 * @param in the data input; must not be null
 * @param length the number of bytes to read from the data input; must not be negative
 * @lucene.experimental
 */
public record StoredFieldDataInput(DataInput in, int length) {

  /**
   * Validates the record components.
   *
   * @throws IllegalArgumentException if {@code in} is null or {@code length} is negative
   */
  public StoredFieldDataInput {
    if (in == null) {
      throw new IllegalArgumentException("data input must not be null");
    }
    if (length < 0) {
      throw new IllegalArgumentException("length must not be negative; got " + length);
    }
  }

  /**
   * Creates a StoredFieldDataInput from a ByteArrayDataInput, using the value reported by {@link
   * ByteArrayDataInput#length()} as the length.
   *
   * <p>NOTE(review): this presumes {@code length()} is the number of bytes still to be read;
   * confirm for inputs that are not positioned at the start of their array.
   */
  public StoredFieldDataInput(ByteArrayDataInput byteArrayDataInput) {
    this(byteArrayDataInput, byteArrayDataInput.length());
  }

  /** Returns the data input; equivalent to the record accessor {@link #in()}. */
  public DataInput getDataInput() {
    return in;
  }

  /** Returns the length of the data input; equivalent to the record accessor {@link #length()}. */
  public int getLength() {
    return length;
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
import java.io.IOException;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.DocumentStoredFieldVisitor;
import org.apache.lucene.store.DataInput;

/**
* Expert: provides a low-level means of accessing the stored field values in an index. See {@link
Expand All @@ -41,15 +40,17 @@ public abstract class StoredFieldVisitor {
protected StoredFieldVisitor() {}

/**
* Expert: Process a binary field directly from the {@link DataInput}. Implementors of this method
* must read {@code length} bytes from the given {@link DataInput}. The default implementation
* reads all byes in a newly created byte array and calls {@link #binaryField(FieldInfo, byte[])}.
* Expert: Process a binary field directly from the {@link StoredFieldDataInput}. Implementors of
* this method must read {@code StoredFieldDataInput#length} bytes from the given {@link
* StoredFieldDataInput}. The default implementation reads all bytes in a newly created byte array
* and calls {@link #binaryField(FieldInfo, byte[])}.
*
* @param value newly allocated byte array with the binary contents.
* @param value the stored field data input.
*/
public void binaryField(FieldInfo fieldInfo, DataInput value, int length) throws IOException {
public void binaryField(FieldInfo fieldInfo, StoredFieldDataInput value) throws IOException {
int length = value.length();
final byte[] data = new byte[length];
value.readBytes(data, 0, length);
value.getDataInput().readBytes(data, 0, value.getLength());
binaryField(fieldInfo, data);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,9 @@ void writeField(FieldInfo info, StoredValue value) throws IOException {
case BINARY:
writer.writeField(info, value.getBinaryValue());
break;
case DATA_INPUT:
writer.writeField(info, value.getDataInputValue());
break;
case STRING:
writer.writeField(info, value.getStringValue());
break;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@

import java.nio.charset.StandardCharsets;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.StoredFieldDataInput;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.apache.lucene.tests.util.LuceneTestCase;
Expand Down Expand Up @@ -71,4 +73,48 @@ public void testBinaryFieldInIndex() throws Exception {
reader.close();
dir.close();
}

/**
 * Indexes a document holding a stored-only binary field built from a {@link StoredFieldDataInput}
 * next to a stored string field, then reads the document back and checks both values.
 */
public void testBinaryFieldFromDataInputInIndex() throws Exception {
  FieldType ft = new FieldType();
  ft.setStored(true);
  byte[] byteArray = binaryValStored.getBytes(StandardCharsets.UTF_8);
  StoredFieldDataInput storedFieldDataInput =
      new StoredFieldDataInput(new ByteArrayDataInput(byteArray));
  StoredField binaryFldStored = new StoredField("binaryStored", storedFieldDataInput);
  Field stringFldStored = new Field("stringStored", binaryValStored, ft);

  Document doc = new Document();
  doc.add(binaryFldStored);
  doc.add(stringFldStored);

  /* test for field count */
  assertEquals(2, doc.getFields().size());

  /* add the doc to a test index; try-with-resources closes everything even if an assertion fails */
  try (Directory dir = newDirectory();
      RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) {
    writer.addDocument(doc);

    /* open a reader and fetch the document */
    try (IndexReader reader = writer.getReader()) {
      Document docFromReader = reader.storedFields().document(0);
      assertNotNull(docFromReader);

      /* fetch the binary stored field and compare its content with the original one */
      BytesRef bytes = docFromReader.getBinaryValue("binaryStored");
      assertNotNull(bytes);
      String binaryFldStoredTest =
          new String(bytes.bytes, bytes.offset, bytes.length, StandardCharsets.UTF_8);
      assertEquals(binaryValStored, binaryFldStoredTest);

      /* fetch the string field and compare its content with the original one */
      assertEquals(binaryValStored, docFromReader.get("stringStored"));
    }
  }
}
}
Loading

0 comments on commit 2dedd1d

Please sign in to comment.