Skip to content

Commit

Permalink
Merge branch 'main' into dynamic-range-unit-test-coverage-topn-0
Browse files Browse the repository at this point in the history
  • Loading branch information
gsmiller authored Feb 19, 2025
2 parents 2cee7fd + 4289827 commit b53e33d
Show file tree
Hide file tree
Showing 37 changed files with 3,592 additions and 203 deletions.
15 changes: 13 additions & 2 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -49,16 +49,20 @@ API Changes
* GITHUB#14134: Added Bits#applyMask API to help apply live docs as a mask on a
bit set of matches. (Adrien Grand)

* GITHUB#14209: Deprecate Operations.union(Automaton,Automaton) and
* GITHUB#14209: Deprecate Operations.union(Automaton,Automaton) and
concatenate(Automaton,Automaton) in favor of the methods taking List. (Robert Muir)

* GITHUB#14236: CombinedFieldQuery moved from lucene-sandbox to lucene-core.
(Adrien Grand)

New Features
---------------------

* GITHUB#14084, GITHUB#13635, GITHUB#13634, GITHUB#14170: Adds new `SeededKnnVectorQuery` query.
These queries allow for the vector search entry points to be initialized via a `seed` query. This follows
the research provided via https://arxiv.org/abs/2307.16779. (Sean MacAvaney, Ben Trent).

* GITHUB#13974: Introducing DocValuesMultiRangeQuery.SortedSetStabbingBuilder into sandbox. (Mikhail Khludnev)

Improvements
---------------------
Expand All @@ -85,11 +89,18 @@ Improvements
the wrapped analyzer's strategy to decide if components can be reused or need
to be updated. (Mayya Sharipova)

* GITHUB#14192: Added support for RegExp to handle case insensitive matching
across the full set of Unicode characters. (John Wagster)

* GITHUB#14239: Hunspell's option to tolerate affix rule count mismatches was
improved to tolerate more instances of this problem. (Robert Muir)


* GITHUB#14238: Improve test coverage of Dynamic Range Faceting. (John Houser)

* GITHUB#14213: Allow indexing a stored-only StoredField directly from a DataInput. (Tim Brooks)


Optimizations
---------------------

Expand Down Expand Up @@ -138,7 +149,7 @@ Other

* GITHUB#14091: Cover all DataType. (Lu Xugang)

* GITHUB#14130: Upgrade OpenNLP from 2.3.2 to 2.5.3, which transitively upgrades Slf4j
* GITHUB#14130: Upgrade OpenNLP from 2.3.2 to 2.5.3, which transitively upgrades Slf4j
from 1.7.36 to 2.0.16. (Michael Froh)

* GITHUB#14223: Fixed a flaky test TestKnnFloatVectorQuery.testFindFewer. (Navneet Verma)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.StoredFieldDataInput;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.ByteArrayDataInput;
Expand Down Expand Up @@ -290,7 +291,7 @@ private static void readField(DataInput in, StoredFieldVisitor visitor, FieldInf
switch (bits & TYPE_MASK) {
case BYTE_ARR:
int length = in.readVInt();
visitor.binaryField(info, in, length);
visitor.binaryField(info, new StoredFieldDataInput(in, length));
break;
case STRING:
visitor.stringField(info, in.readString());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.StoredFieldDataInput;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.BytesRef;

Expand Down Expand Up @@ -73,10 +73,11 @@ public void finishDocument() throws IOException {}
/** Writes a stored double value. */
public abstract void writeField(FieldInfo info, double value) throws IOException;

/** Writes a stored binary value from a {@link DataInput} and a {@code length}. */
public void writeField(FieldInfo info, DataInput value, int length) throws IOException {
/** Writes a stored binary value from a {@link StoredFieldDataInput}. */
public void writeField(FieldInfo info, StoredFieldDataInput value) throws IOException {
int length = value.length();
final byte[] bytes = new byte[length];
value.readBytes(bytes, 0, length);
value.getDataInput().readBytes(bytes, 0, length);
writeField(info, new BytesRef(bytes, 0, length));
}

Expand Down Expand Up @@ -191,8 +192,8 @@ public MergeVisitor(MergeState mergeState, int readerIndex) {
}

@Override
public void binaryField(FieldInfo fieldInfo, DataInput value, int length) throws IOException {
writeField(remap(fieldInfo), value, length);
public void binaryField(FieldInfo fieldInfo, StoredFieldDataInput value) throws IOException {
writeField(remap(fieldInfo), value);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.StoredFieldDataInput;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.ByteArrayDataInput;
Expand Down Expand Up @@ -261,7 +262,7 @@ private static void readField(DataInput in, StoredFieldVisitor visitor, FieldInf
switch (bits & TYPE_MASK) {
case BYTE_ARR:
int length = in.readVInt();
visitor.binaryField(info, in, length);
visitor.binaryField(info, new StoredFieldDataInput(in, length));
break;
case STRING:
visitor.stringField(info, in.readString());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,9 @@
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.StoredFieldDataInput;
import org.apache.lucene.store.ByteBuffersDataInput;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
Expand Down Expand Up @@ -311,12 +311,13 @@ public void writeField(FieldInfo info, BytesRef value) throws IOException {
}

@Override
public void writeField(FieldInfo info, DataInput value, int length) throws IOException {
public void writeField(FieldInfo info, StoredFieldDataInput value) throws IOException {
int length = value.getLength();
++numStoredFieldsInDoc;
final long infoAndBits = (((long) info.number) << TYPE_BITS) | BYTE_ARR;
bufferedDocs.writeVLong(infoAndBits);
bufferedDocs.writeVInt(length);
bufferedDocs.copyBytes(value, length);
bufferedDocs.copyBytes(value.getDataInput(), length);
}

@Override
Expand Down
3 changes: 3 additions & 0 deletions lucene/core/src/java/org/apache/lucene/document/Field.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.IndexableFieldType;
import org.apache.lucene.index.StoredFieldDataInput;
import org.apache.lucene.util.BytesRef;

/**
Expand Down Expand Up @@ -619,6 +620,8 @@ public StoredValue storedValue() {
return new StoredValue((double) fieldsData);
} else if (fieldsData instanceof BytesRef) {
return new StoredValue((BytesRef) fieldsData);
} else if (fieldsData instanceof StoredFieldDataInput) {
return new StoredValue((StoredFieldDataInput) fieldsData);
} else if (fieldsData instanceof String) {
return new StoredValue((String) fieldsData);
} else {
Expand Down
16 changes: 16 additions & 0 deletions lucene/core/src/java/org/apache/lucene/document/StoredField.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package org.apache.lucene.document;

import org.apache.lucene.index.IndexReader; // javadocs
import org.apache.lucene.index.StoredFieldDataInput;
import org.apache.lucene.search.IndexSearcher; // javadocs
import org.apache.lucene.util.BytesRef;

Expand Down Expand Up @@ -105,6 +106,21 @@ public StoredField(String name, BytesRef value) {
super(name, value, TYPE);
}

/**
* Create a stored-only field with the given data input value.
*
* <p>The given {@link StoredFieldDataInput} is kept by reference; this constructor does not read
* from it.
*
* @param name field name
* @param value StoredFieldDataInput holding the data input and the length of the binary content
* @throws IllegalArgumentException if the field name or value is null.
*/
public StoredField(String name, StoredFieldDataInput value) {
super(name, TYPE);
if (value == null) {
throw new IllegalArgumentException("store field data input must not be null");
}
fieldsData = value;
}

/**
* Create a stored-only field with the given string value.
*
Expand Down
28 changes: 27 additions & 1 deletion lucene/core/src/java/org/apache/lucene/document/StoredValue.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

import java.util.Objects;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.StoredFieldDataInput;
import org.apache.lucene.util.BytesRef;

/**
Expand All @@ -39,6 +40,8 @@ public enum Type {
DOUBLE,
/** Type of binary values. */
BINARY,
/** Type of data input values. */
DATA_INPUT,
/** Type of string values. */
STRING;
}
Expand All @@ -48,6 +51,7 @@ public enum Type {
private long longValue;
private float floatValue;
private double doubleValue;
private StoredFieldDataInput dataInput;
private BytesRef binaryValue;
private String stringValue;

Expand Down Expand Up @@ -81,7 +85,13 @@ public StoredValue(BytesRef value) {
binaryValue = Objects.requireNonNull(value);
}

/** Ctor for binary values. */
/** Creates a stored value of type {@link Type#DATA_INPUT}; the given value must not be null. */
public StoredValue(StoredFieldDataInput value) {
  dataInput = Objects.requireNonNull(value);
  type = Type.DATA_INPUT;
}

/** Ctor for string values. */
public StoredValue(String value) {
type = Type.STRING;
stringValue = Objects.requireNonNull(value);
Expand Down Expand Up @@ -132,6 +142,14 @@ public void setBinaryValue(BytesRef value) {
binaryValue = Objects.requireNonNull(value);
}

/**
 * Replaces the data input value held by this instance.
 *
 * @throws IllegalArgumentException if this value's type is not {@link Type#DATA_INPUT}
 * @throws NullPointerException if {@code value} is null
 */
public void setDataInputValue(StoredFieldDataInput value) {
  final boolean holdsDataInput = type == Type.DATA_INPUT;
  if (!holdsDataInput) {
    throw new IllegalArgumentException("Cannot set a data input value on a " + type + " value");
  }
  final StoredFieldDataInput checked = Objects.requireNonNull(value);
  dataInput = checked;
}

/** Set a string value. */
public void setStringValue(String value) {
if (type != Type.STRING) {
Expand Down Expand Up @@ -180,6 +198,14 @@ public BytesRef getBinaryValue() {
return binaryValue;
}

/**
 * Returns the data input value held by this instance.
 *
 * @throws IllegalArgumentException if this value's type is not {@link Type#DATA_INPUT}
 */
public StoredFieldDataInput getDataInputValue() {
  if (type == Type.DATA_INPUT) {
    return dataInput;
  }
  throw new IllegalArgumentException("Cannot get a data input value on a " + type + " value");
}

/** Retrieve a string value. */
public String getStringValue() {
if (type != Type.STRING) {
Expand Down
61 changes: 56 additions & 5 deletions lucene/core/src/java/org/apache/lucene/document/package-info.java
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,61 @@
* make the process of taking a file and converting it into a Lucene {@link
* org.apache.lucene.document.Document}.
*
* <p>The {@link org.apache.lucene.document.DateTools} is a utility class to make dates and times
* searchable. {@link org.apache.lucene.document.IntPoint}, {@link
* org.apache.lucene.document.LongPoint}, {@link org.apache.lucene.document.FloatPoint} and {@link
* org.apache.lucene.document.DoublePoint} enable indexing of numeric values (and also dates) for
* fast range queries using {@link org.apache.lucene.search.PointRangeQuery}
* <h2>How to index ...</h2>
*
* <h3>Strings</h3>
*
* <p>{@link org.apache.lucene.document.TextField} allows indexing tokens from a String so that one
* can perform full-text search on it. The way that the input is tokenized depends on the {@link
* org.apache.lucene.analysis.Analyzer} that is configured on the {@link
* org.apache.lucene.index.IndexWriterConfig}. TextField can also be optionally stored.
*
* <p>{@link org.apache.lucene.document.KeywordField} indexes whole values as a single term so that
* one can perform exact search on it. It also records doc values to enable sorting or faceting on
* this field. Finally, it also supports optionally storing the value.
*
* <p>If faceting or sorting are not required, {@link org.apache.lucene.document.StringField} is a
* variant of {@link org.apache.lucene.document.KeywordField} that does not index doc values.
*
* <h3>Numbers</h3>
*
* <p>If a numeric field represents an identifier rather than a quantity and is more commonly
* searched on single values than on ranges of values, it is generally recommended to index its
* string representation via {@link org.apache.lucene.document.KeywordField} (or {@link
* org.apache.lucene.document.StringField} if doc values are not necessary).
*
* <p>{@link org.apache.lucene.document.LongField}, {@link org.apache.lucene.document.IntField},
* {@link org.apache.lucene.document.DoubleField} and {@link org.apache.lucene.document.FloatField}
* index values in a points index for efficient range queries, and also create doc-values for these
* fields for efficient sorting and faceting.
*
* <p>If the field is aimed at being used to tune the score, {@link
* org.apache.lucene.document.FeatureField} helps internally store numeric data as term frequencies
* in a way that makes it efficient to influence scoring at search time.
*
* <h3>Other types of structured data</h3>
*
* <p>It is recommended to index dates as a {@link org.apache.lucene.document.LongField} that stores
* the number of milliseconds since Epoch.
*
* <p>IP fields can be indexed via {@link org.apache.lucene.document.InetAddressPoint} in addition
* to a {@link org.apache.lucene.document.SortedDocValuesField} (if the field is single-valued) or
* {@link org.apache.lucene.document.SortedSetDocValuesField} that stores the result of {@link
* org.apache.lucene.document.InetAddressPoint#encode}.
*
* <h3>Dense numeric vectors</h3>
*
* <p>Dense numeric vectors can be indexed with {@link
* org.apache.lucene.document.KnnFloatVectorField} if their dimensions are floating-point numbers
* or {@link org.apache.lucene.document.KnnByteVectorField} if their dimensions are bytes. This
* allows searching for nearest neighbors at search time.
*
* <h3>Sparse numeric vectors</h3>
*
* <p>To perform nearest-neighbor search on sparse vectors rather than dense vectors, each dimension
* of the sparse vector should be indexed as a {@link org.apache.lucene.document.FeatureField}.
* Queries can then be constructed as a {@link org.apache.lucene.search.BooleanQuery} with {@link
* org.apache.lucene.document.FeatureField#newLinearQuery(String, String, float) linear queries} as
* {@link org.apache.lucene.search.BooleanClause.Occur#SHOULD} clauses.
*/
package org.apache.lucene.document;
Original file line number Diff line number Diff line change
Expand Up @@ -138,8 +138,8 @@ private static class CopyVisitor extends StoredFieldVisitor {
}

@Override
public void binaryField(FieldInfo fieldInfo, DataInput value, int length) throws IOException {
writer.writeField(fieldInfo, value, length);
public void binaryField(FieldInfo fieldInfo, StoredFieldDataInput value) throws IOException {
writer.writeField(fieldInfo, value);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.index;

import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.DataInput;

/**
 * A {@link DataInput} paired with the length of the input, so that it can be handled as a fixed
 * size value. For use as a StoredField.
 *
 * @param in the data input
 * @param length the length of the data input
 * @lucene.experimental
 */
public record StoredFieldDataInput(DataInput in, int length) {

  /**
   * Creates a StoredFieldDataInput from a ByteArrayDataInput, taking the length from {@code
   * byteArrayDataInput.length()}.
   */
  public StoredFieldDataInput(ByteArrayDataInput byteArrayDataInput) {
    this(byteArrayDataInput, byteArrayDataInput.length());
  }

  /** Returns the data input; same value as the record accessor {@code in()}. */
  public DataInput getDataInput() {
    return in();
  }

  /** Returns the length of the data input; same value as the record accessor {@code length()}. */
  public int getLength() {
    return length();
  }
}
Loading

0 comments on commit b53e33d

Please sign in to comment.