From 2dedd1da89acec3d7440348a0d4fd8c1d75225d7 Mon Sep 17 00:00:00 2001 From: Tim Brooks Date: Tue, 18 Feb 2025 23:44:41 -0700 Subject: [PATCH] Support DataInput as source for StoredField (#14213) Allowing indexing stored-only StoredField directly from DataInput. --- lucene/CHANGES.txt | 2 + ...Lucene50CompressingStoredFieldsReader.java | 3 +- .../lucene/codecs/StoredFieldsWriter.java | 13 +++--- ...Lucene90CompressingStoredFieldsReader.java | 3 +- ...Lucene90CompressingStoredFieldsWriter.java | 7 +-- .../org/apache/lucene/document/Field.java | 3 ++ .../apache/lucene/document/StoredField.java | 16 +++++++ .../apache/lucene/document/StoredValue.java | 28 ++++++++++- .../index/SortingStoredFieldsConsumer.java | 4 +- .../lucene/index/StoredFieldDataInput.java | 45 ++++++++++++++++++ .../lucene/index/StoredFieldVisitor.java | 15 +++--- .../lucene/index/StoredFieldsConsumer.java | 3 ++ .../lucene/document/TestBinaryDocument.java | 46 +++++++++++++++++++ .../lucene/index/TestDirectoryReader.java | 43 +++++++++++++---- .../index/TestIndexWriterExceptions.java | 29 ++++++++++++ .../AssertingStoredFieldsFormat.java | 6 +-- .../cranky/CrankyStoredFieldsFormat.java | 6 +-- .../index/BaseIndexFileFormatTestCase.java | 3 ++ .../index/BaseStoredFieldsFormatTestCase.java | 13 +++++- 19 files changed, 250 insertions(+), 38 deletions(-) create mode 100644 lucene/core/src/java/org/apache/lucene/index/StoredFieldDataInput.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 13936c66cd9c..bea7a5bc4f1d 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -54,6 +54,8 @@ Improvements * GITHUB#14239: Hunspell's option to tolerate affix rule count mismatches was improved to tolerate more instances of this problem. (Robert Muir) +* GITHUB#14213: Allowing indexing stored-only StoredField directly from DataInput. 
(Tim Brooks) + Optimizations --------------------- diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/compressing/Lucene50CompressingStoredFieldsReader.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/compressing/Lucene50CompressingStoredFieldsReader.java index 8c6ea3264abf..edab3b0d8c22 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/compressing/Lucene50CompressingStoredFieldsReader.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/compressing/Lucene50CompressingStoredFieldsReader.java @@ -30,6 +30,7 @@ import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.index.StoredFieldDataInput; import org.apache.lucene.index.StoredFieldVisitor; import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.store.ByteArrayDataInput; @@ -290,7 +291,7 @@ private static void readField(DataInput in, StoredFieldVisitor visitor, FieldInf switch (bits & TYPE_MASK) { case BYTE_ARR: int length = in.readVInt(); - visitor.binaryField(info, in, length); + visitor.binaryField(info, new StoredFieldDataInput(in, length)); break; case STRING: visitor.stringField(info, in.readString()); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/StoredFieldsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/StoredFieldsWriter.java index 6824b227f68d..a9f9ce464a30 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/StoredFieldsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/StoredFieldsWriter.java @@ -28,8 +28,8 @@ import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.MergeState; +import org.apache.lucene.index.StoredFieldDataInput; import org.apache.lucene.index.StoredFieldVisitor; -import org.apache.lucene.store.DataInput; import 
org.apache.lucene.util.Accountable; import org.apache.lucene.util.BytesRef; @@ -73,10 +73,11 @@ public void finishDocument() throws IOException {} /** Writes a stored double value. */ public abstract void writeField(FieldInfo info, double value) throws IOException; - /** Writes a stored binary value from a {@link DataInput} and a {@code length}. */ - public void writeField(FieldInfo info, DataInput value, int length) throws IOException { + /** Writes a stored binary value from a {@link StoredFieldDataInput}. */ + public void writeField(FieldInfo info, StoredFieldDataInput value) throws IOException { + int length = value.length(); final byte[] bytes = new byte[length]; - value.readBytes(bytes, 0, length); + value.getDataInput().readBytes(bytes, 0, length); writeField(info, new BytesRef(bytes, 0, length)); } @@ -191,8 +192,8 @@ public MergeVisitor(MergeState mergeState, int readerIndex) { } @Override - public void binaryField(FieldInfo fieldInfo, DataInput value, int length) throws IOException { - writeField(remap(fieldInfo), value, length); + public void binaryField(FieldInfo fieldInfo, StoredFieldDataInput value) throws IOException { + writeField(remap(fieldInfo), value); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingStoredFieldsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingStoredFieldsReader.java index 5025e97f8b31..315adff1473d 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingStoredFieldsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingStoredFieldsReader.java @@ -51,6 +51,7 @@ import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.index.StoredFieldDataInput; import org.apache.lucene.index.StoredFieldVisitor; import 
org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.store.ByteArrayDataInput; @@ -261,7 +262,7 @@ private static void readField(DataInput in, StoredFieldVisitor visitor, FieldInf switch (bits & TYPE_MASK) { case BYTE_ARR: int length = in.readVInt(); - visitor.binaryField(info, in, length); + visitor.binaryField(info, new StoredFieldDataInput(in, length)); break; case STRING: visitor.stringField(info, in.readString()); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingStoredFieldsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingStoredFieldsWriter.java index a7afdbf30abb..68805763e789 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingStoredFieldsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingStoredFieldsWriter.java @@ -34,9 +34,9 @@ import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.MergeState; import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.index.StoredFieldDataInput; import org.apache.lucene.store.ByteBuffersDataInput; import org.apache.lucene.store.ByteBuffersDataOutput; -import org.apache.lucene.store.DataInput; import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; @@ -311,12 +311,13 @@ public void writeField(FieldInfo info, BytesRef value) throws IOException { } @Override - public void writeField(FieldInfo info, DataInput value, int length) throws IOException { + public void writeField(FieldInfo info, StoredFieldDataInput value) throws IOException { + int length = value.getLength(); ++numStoredFieldsInDoc; final long infoAndBits = (((long) info.number) << TYPE_BITS) | BYTE_ARR; bufferedDocs.writeVLong(infoAndBits); bufferedDocs.writeVInt(length); - bufferedDocs.copyBytes(value, length); + 
bufferedDocs.copyBytes(value.getDataInput(), length); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/document/Field.java b/lucene/core/src/java/org/apache/lucene/document/Field.java index 9945fdeae174..9d89abd1ce41 100644 --- a/lucene/core/src/java/org/apache/lucene/document/Field.java +++ b/lucene/core/src/java/org/apache/lucene/document/Field.java @@ -27,6 +27,7 @@ import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.IndexableFieldType; +import org.apache.lucene.index.StoredFieldDataInput; import org.apache.lucene.util.BytesRef; /** @@ -619,6 +620,8 @@ public StoredValue storedValue() { return new StoredValue((double) fieldsData); } else if (fieldsData instanceof BytesRef) { return new StoredValue((BytesRef) fieldsData); + } else if (fieldsData instanceof StoredFieldDataInput) { + return new StoredValue((StoredFieldDataInput) fieldsData); } else if (fieldsData instanceof String) { return new StoredValue((String) fieldsData); } else { diff --git a/lucene/core/src/java/org/apache/lucene/document/StoredField.java b/lucene/core/src/java/org/apache/lucene/document/StoredField.java index c93b9b5d7170..e1190df89804 100644 --- a/lucene/core/src/java/org/apache/lucene/document/StoredField.java +++ b/lucene/core/src/java/org/apache/lucene/document/StoredField.java @@ -17,6 +17,7 @@ package org.apache.lucene.document; import org.apache.lucene.index.IndexReader; // javadocs +import org.apache.lucene.index.StoredFieldDataInput; import org.apache.lucene.search.IndexSearcher; // javadocs import org.apache.lucene.util.BytesRef; @@ -105,6 +106,21 @@ public StoredField(String name, BytesRef value) { super(name, value, TYPE); } + /** + * Create a stored-only field with the given data input value. + * + * @param name field name + * @param value data input, together with its length, supplying the binary content + * @throws IllegalArgumentException if the field name or value is null. 
+ */ + public StoredField(String name, StoredFieldDataInput value) { + super(name, TYPE); + if (value == null) { + throw new IllegalArgumentException("store field data input must not be null"); + } + fieldsData = value; + } + /** * Create a stored-only field with the given string value. * diff --git a/lucene/core/src/java/org/apache/lucene/document/StoredValue.java b/lucene/core/src/java/org/apache/lucene/document/StoredValue.java index 61401c7b2fcd..413664de6201 100644 --- a/lucene/core/src/java/org/apache/lucene/document/StoredValue.java +++ b/lucene/core/src/java/org/apache/lucene/document/StoredValue.java @@ -18,6 +18,7 @@ import java.util.Objects; import org.apache.lucene.index.IndexableField; +import org.apache.lucene.index.StoredFieldDataInput; import org.apache.lucene.util.BytesRef; /** @@ -39,6 +40,8 @@ public enum Type { DOUBLE, /** Type of binary values. */ BINARY, + /** Type of data input values. */ + DATA_INPUT, /** Type of string values. */ STRING; } @@ -48,6 +51,7 @@ public enum Type { private long longValue; private float floatValue; private double doubleValue; + private StoredFieldDataInput dataInput; private BytesRef binaryValue; private String stringValue; @@ -81,7 +85,13 @@ public StoredValue(BytesRef value) { binaryValue = Objects.requireNonNull(value); } - /** Ctor for binary values. */ + /** Ctor for data input values. */ + public StoredValue(StoredFieldDataInput value) { + type = Type.DATA_INPUT; + dataInput = Objects.requireNonNull(value); + } + + /** Ctor for string values. */ public StoredValue(String value) { type = Type.STRING; stringValue = Objects.requireNonNull(value); @@ -132,6 +142,14 @@ public void setBinaryValue(BytesRef value) { binaryValue = Objects.requireNonNull(value); } + /** Set a data input value. 
*/ + public void setDataInputValue(StoredFieldDataInput value) { + if (type != Type.DATA_INPUT) { + throw new IllegalArgumentException("Cannot set a data input value on a " + type + " value"); + } + dataInput = Objects.requireNonNull(value); + } + /** Set a string value. */ public void setStringValue(String value) { if (type != Type.STRING) { @@ -180,6 +198,14 @@ public BytesRef getBinaryValue() { return binaryValue; } + /** Retrieve a data input value. */ + public StoredFieldDataInput getDataInputValue() { + if (type != Type.DATA_INPUT) { + throw new IllegalArgumentException("Cannot get a data input value on a " + type + " value"); + } + return dataInput; + } + /** Retrieve a string value. */ public String getStringValue() { if (type != Type.STRING) { diff --git a/lucene/core/src/java/org/apache/lucene/index/SortingStoredFieldsConsumer.java b/lucene/core/src/java/org/apache/lucene/index/SortingStoredFieldsConsumer.java index 8044a4cc3875..693806da4fb3 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SortingStoredFieldsConsumer.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortingStoredFieldsConsumer.java @@ -138,8 +138,8 @@ private static class CopyVisitor extends StoredFieldVisitor { } @Override - public void binaryField(FieldInfo fieldInfo, DataInput value, int length) throws IOException { - writer.writeField(fieldInfo, value, length); + public void binaryField(FieldInfo fieldInfo, StoredFieldDataInput value) throws IOException { + writer.writeField(fieldInfo, value); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/index/StoredFieldDataInput.java b/lucene/core/src/java/org/apache/lucene/index/StoredFieldDataInput.java new file mode 100644 index 000000000000..f55a3b9ef04f --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/index/StoredFieldDataInput.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.index; + +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.DataInput; + +/** + * A fixed size DataInput which includes the length of the input. For use as a StoredField. + * + * @param in the data input + * @param length the length of the data input + * @lucene.experimental + */ +public record StoredFieldDataInput(DataInput in, int length) { + + /** Creates a StoredFieldDataInput from a ByteArrayDataInput */ + public StoredFieldDataInput(ByteArrayDataInput byteArrayDataInput) { + this(byteArrayDataInput, byteArrayDataInput.length()); + } + + /** Returns the data input */ + public DataInput getDataInput() { + return in; + } + + /** Returns the length of the data input */ + public int getLength() { + return length; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/index/StoredFieldVisitor.java b/lucene/core/src/java/org/apache/lucene/index/StoredFieldVisitor.java index 2457f392d112..fa22b435230d 100644 --- a/lucene/core/src/java/org/apache/lucene/index/StoredFieldVisitor.java +++ b/lucene/core/src/java/org/apache/lucene/index/StoredFieldVisitor.java @@ -19,7 +19,6 @@ import java.io.IOException; import org.apache.lucene.document.Document; import org.apache.lucene.document.DocumentStoredFieldVisitor; -import 
org.apache.lucene.store.DataInput; /** * Expert: provides a low-level means of accessing the stored field values in an index. See {@link @@ -41,15 +40,17 @@ public abstract class StoredFieldVisitor { protected StoredFieldVisitor() {} /** - * Expert: Process a binary field directly from the {@link DataInput}. Implementors of this method - * must read {@code length} bytes from the given {@link DataInput}. The default implementation - * reads all byes in a newly created byte array and calls {@link #binaryField(FieldInfo, byte[])}. + * Expert: Process a binary field directly from the {@link StoredFieldDataInput}. Implementors of + * this method must read {@code StoredFieldDataInput#length} bytes from the given {@link + * StoredFieldDataInput}. The default implementation reads all bytes in a newly created byte array + * and calls {@link #binaryField(FieldInfo, byte[])}. * - * @param value newly allocated byte array with the binary contents. + * @param value the stored field data input. */ - public void binaryField(FieldInfo fieldInfo, DataInput value, int length) throws IOException { + public void binaryField(FieldInfo fieldInfo, StoredFieldDataInput value) throws IOException { + int length = value.length(); final byte[] data = new byte[length]; - value.readBytes(data, 0, length); + value.getDataInput().readBytes(data, 0, value.getLength()); binaryField(fieldInfo, data); } diff --git a/lucene/core/src/java/org/apache/lucene/index/StoredFieldsConsumer.java b/lucene/core/src/java/org/apache/lucene/index/StoredFieldsConsumer.java index 78f2f726a2f1..24c5359d29a7 100644 --- a/lucene/core/src/java/org/apache/lucene/index/StoredFieldsConsumer.java +++ b/lucene/core/src/java/org/apache/lucene/index/StoredFieldsConsumer.java @@ -79,6 +79,9 @@ void writeField(FieldInfo info, StoredValue value) throws IOException { case BINARY: writer.writeField(info, value.getBinaryValue()); break; + case DATA_INPUT: + writer.writeField(info, value.getDataInputValue()); + break; case STRING: 
writer.writeField(info, value.getStringValue()); break; diff --git a/lucene/core/src/test/org/apache/lucene/document/TestBinaryDocument.java b/lucene/core/src/test/org/apache/lucene/document/TestBinaryDocument.java index f161f3375d5e..ebaf9c512bd5 100644 --- a/lucene/core/src/test/org/apache/lucene/document/TestBinaryDocument.java +++ b/lucene/core/src/test/org/apache/lucene/document/TestBinaryDocument.java @@ -18,6 +18,8 @@ import java.nio.charset.StandardCharsets; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.StoredFieldDataInput; +import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.index.RandomIndexWriter; import org.apache.lucene.tests.util.LuceneTestCase; @@ -71,4 +73,48 @@ public void testBinaryFieldInIndex() throws Exception { reader.close(); dir.close(); } + + public void testBinaryFieldFromDataInputInIndex() throws Exception { + FieldType ft = new FieldType(); + ft.setStored(true); + byte[] byteArray = binaryValStored.getBytes(StandardCharsets.UTF_8); + StoredFieldDataInput storedFieldDataInput = + new StoredFieldDataInput(new ByteArrayDataInput(byteArray)); + StoredField binaryFldStored = new StoredField("binaryStored", storedFieldDataInput); + Field stringFldStored = new Field("stringStored", binaryValStored, ft); + + Document doc = new Document(); + + doc.add(binaryFldStored); + + doc.add(stringFldStored); + + /* test for field count */ + assertEquals(2, doc.getFields().size()); + + /* add the doc to a ram index */ + Directory dir = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir); + writer.addDocument(doc); + + /* open a reader and fetch the document */ + IndexReader reader = writer.getReader(); + Document docFromReader = reader.storedFields().document(0); + assertTrue(docFromReader != null); + + /* fetch the binary stored field and compare its content with the original one */ + BytesRef bytes = 
docFromReader.getBinaryValue("binaryStored"); + assertNotNull(bytes); + String binaryFldStoredTest = + new String(bytes.bytes, bytes.offset, bytes.length, StandardCharsets.UTF_8); + assertTrue(binaryFldStoredTest.equals(binaryValStored)); + + /* fetch the string field and compare its content with the original one */ + String stringFldStoredTest = docFromReader.get("stringStored"); + assertTrue(stringFldStoredTest.equals(binaryValStored)); + + writer.close(); + reader.close(); + dir.close(); + } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestDirectoryReader.java b/lucene/core/src/test/org/apache/lucene/index/TestDirectoryReader.java index 1ea5da23d4b8..878f00ac5512 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestDirectoryReader.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestDirectoryReader.java @@ -38,6 +38,7 @@ import org.apache.lucene.document.TextField; import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.analysis.MockAnalyzer; import org.apache.lucene.tests.index.DocHelper; @@ -366,7 +367,8 @@ void assertTermDocsCount(String msg, IndexReader reader, Term term, int expected public void testBinaryFields() throws IOException { Directory dir = newDirectory(); - byte[] bin = new byte[] {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + byte[] bin1 = new byte[] {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + byte[] bin2 = new byte[] {10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; IndexWriter writer = new IndexWriter( @@ -387,7 +389,8 @@ public void testBinaryFields() throws IOException { .setOpenMode(OpenMode.APPEND) .setMergePolicy(newLogMergePolicy())); Document doc = new Document(); - doc.add(new StoredField("bin1", bin)); + doc.add(new StoredField("bin1", bin1)); + doc.add(new StoredField("bin2", new StoredFieldDataInput(new ByteArrayDataInput(bin2)))); doc.add(new TextField("junk", 
"junk text", Field.Store.NO)); writer.addDocument(doc); writer.close(); @@ -398,11 +401,22 @@ public void testBinaryFields() throws IOException { assertEquals(1, fields.length); IndexableField b1 = fields[0]; assertTrue(b1.binaryValue() != null); - BytesRef bytesRef = b1.binaryValue(); - assertEquals(bin.length, bytesRef.length); - for (int i = 0; i < bin.length; i++) { - assertEquals(bin[i], bytesRef.bytes[i + bytesRef.offset]); + BytesRef bytesRef1 = b1.binaryValue(); + assertEquals(bin1.length, bytesRef1.length); + for (int i = 0; i < bin1.length; i++) { + assertEquals(bin1[i], bytesRef1.bytes[i + bytesRef1.offset]); } + fields = doc2.getFields("bin2"); + assertNotNull(fields); + assertEquals(1, fields.length); + IndexableField b2 = fields[0]; + assertTrue(b2.binaryValue() != null); + BytesRef bytesRef2 = b2.binaryValue(); + assertEquals(bin2.length, bytesRef2.length); + for (int i = 0; i < bin2.length; i++) { + assertEquals(bin2[i], bytesRef2.bytes[i + bytesRef2.offset]); + } + reader.close(); // force merge @@ -421,10 +435,19 @@ public void testBinaryFields() throws IOException { assertEquals(1, fields.length); b1 = fields[0]; assertTrue(b1.binaryValue() != null); - bytesRef = b1.binaryValue(); - assertEquals(bin.length, bytesRef.length); - for (int i = 0; i < bin.length; i++) { - assertEquals(bin[i], bytesRef.bytes[i + bytesRef.offset]); + bytesRef1 = b1.binaryValue(); + assertEquals(bin1.length, bytesRef1.length); + for (int i = 0; i < bin1.length; i++) { + assertEquals(bin1[i], bytesRef1.bytes[i + bytesRef1.offset]); + } + fields = doc2.getFields("bin2"); + assertNotNull(fields); + assertEquals(1, fields.length); + b2 = fields[0]; + bytesRef2 = b2.binaryValue(); + assertEquals(bin2.length, bytesRef2.length); + for (int i = 0; i < bin2.length; i++) { + assertEquals(bin2[i], bytesRef2.bytes[i + bytesRef2.offset]); } reader.close(); dir.close(); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterExceptions.java 
b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterExceptions.java index 2f92606ba7d0..7bf5bed4e8e6 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterExceptions.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterExceptions.java @@ -1772,6 +1772,35 @@ public void testNullStoredBytesRefFieldReuse() throws Exception { dir.close(); } + /** test a null data input value doesn't abort the entire segment */ + public void testNullStoredDataInputField() throws Exception { + Directory dir = newDirectory(); + Analyzer analyzer = new MockAnalyzer(random()); + IndexWriter iw = new IndexWriter(dir, new IndexWriterConfig(analyzer)); + // add good document + Document doc = new Document(); + iw.addDocument(doc); + + expectThrows( + IllegalArgumentException.class, + () -> { + // set to null value + StoredFieldDataInput v = null; + Field theField = new StoredField("foo", v); + doc.add(theField); + iw.addDocument(doc); + fail("didn't get expected exception"); + }); + + assertNull(iw.getTragicException()); + iw.close(); + // make sure we see our good doc + DirectoryReader r = DirectoryReader.open(dir); + assertEquals(1, r.numDocs()); + r.close(); + dir.close(); + } + public void testCrazyPositionIncrementGap() throws Exception { Directory dir = newDirectory(); Analyzer analyzer = diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/asserting/AssertingStoredFieldsFormat.java b/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/asserting/AssertingStoredFieldsFormat.java index d8d37718c119..2ea955c3c01a 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/asserting/AssertingStoredFieldsFormat.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/asserting/AssertingStoredFieldsFormat.java @@ -24,8 +24,8 @@ import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.SegmentInfo; +import 
org.apache.lucene.index.StoredFieldDataInput; import org.apache.lucene.index.StoredFieldVisitor; -import org.apache.lucene.store.DataInput; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.apache.lucene.tests.util.TestUtil; @@ -161,9 +161,9 @@ public void writeField(FieldInfo info, BytesRef value) throws IOException { } @Override - public void writeField(FieldInfo info, DataInput value, int length) throws IOException { + public void writeField(FieldInfo info, StoredFieldDataInput value) throws IOException { assert docStatus == Status.STARTED; - in.writeField(info, value, length); + in.writeField(info, value); } @Override diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/cranky/CrankyStoredFieldsFormat.java b/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/cranky/CrankyStoredFieldsFormat.java index 25d1695f5347..6e8acca3ff94 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/cranky/CrankyStoredFieldsFormat.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/cranky/CrankyStoredFieldsFormat.java @@ -26,7 +26,7 @@ import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.MergeState; import org.apache.lucene.index.SegmentInfo; -import org.apache.lucene.store.DataInput; +import org.apache.lucene.index.StoredFieldDataInput; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.apache.lucene.util.Accountable; @@ -149,11 +149,11 @@ public void writeField(FieldInfo info, BytesRef value) throws IOException { } @Override - public void writeField(FieldInfo info, DataInput value, int length) throws IOException { + public void writeField(FieldInfo info, StoredFieldDataInput value) throws IOException { if (random.nextInt(10000) == 0) { throw new IOException("Fake IOException from StoredFieldsWriter.writeField()"); } - delegate.writeField(info, value, length); + delegate.writeField(info, value); } 
@Override diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseIndexFileFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseIndexFileFormatTestCase.java index c2aa7ff0e4de..6daccb35578b 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseIndexFileFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseIndexFileFormatTestCase.java @@ -595,6 +595,9 @@ public void close() {} case BINARY: consumer.writeField(field, value.getBinaryValue()); break; + case DATA_INPUT: + consumer.writeField(field, value.getDataInputValue()); + break; case STRING: consumer.writeField(field, value.getStringValue()); break; diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseStoredFieldsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseStoredFieldsFormatTestCase.java index 84fdcde81732..f879cff8d59c 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseStoredFieldsFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseStoredFieldsFormatTestCase.java @@ -57,6 +57,7 @@ import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.NoMergePolicy; import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.index.StoredFieldDataInput; import org.apache.lucene.index.StoredFieldVisitor; import org.apache.lucene.index.StoredFields; import org.apache.lucene.index.Term; @@ -66,6 +67,7 @@ import org.apache.lucene.search.SortField; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; +import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.Directory; import org.apache.lucene.store.MMapDirectory; import org.apache.lucene.tests.analysis.MockAnalyzer; @@ -682,7 +684,13 @@ public void testMergeFilterReader() throws IOException { doc.add(new StoredField("d", 
random().nextDouble())); doc.add(new StoredField("f", random().nextFloat())); doc.add(new StoredField("s", RandomPicks.randomFrom(random(), stringValues))); - doc.add(new StoredField("b", new BytesRef(RandomPicks.randomFrom(random(), stringValues)))); + BytesRef value = new BytesRef(RandomPicks.randomFrom(random(), stringValues)); + doc.add(new StoredField("b", value)); + doc.add( + new StoredField( + "b2", + new StoredFieldDataInput( + new ByteArrayDataInput(value.bytes, value.offset, value.length)))); docs[i] = doc; w.addDocument(doc); } @@ -713,6 +721,9 @@ public void testMergeFilterReader() throws IOException { assertEquals(expected.getField("d").numericValue(), doc.getField("d").numericValue()); assertEquals(expected.getField("f").numericValue(), doc.getField("f").numericValue()); assertEquals(expected.getField("b").binaryValue(), doc.getField("b").binaryValue()); + // The value is the same for fields "b" and "b2". Read the expected value from "b" as "b2" was + // consumed during indexing + assertEquals(expected.getField("b").binaryValue(), doc.getField("b2").binaryValue()); } reader.close();