Skip to content

Commit b53e33d

Browse files
authored
Merge branch 'main' into dynamic-range-unit-test-coverage-topn-0
2 parents 2cee7fd + 4289827 commit b53e33d

37 files changed

+3592
-203
lines changed

lucene/CHANGES.txt

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,16 +49,20 @@ API Changes
4949
* GITHUB#14134: Added Bits#applyMask API to help apply live docs as a mask on a
5050
bit set of matches. (Adrien Grand)
5151

52-
* GITHUB#14209: Deprecate Operations.union(Automaton,Automaton) and
52+
* GITHUB#14209: Deprecate Operations.union(Automaton,Automaton) and
5353
concatenate(Automaton,Automaton) in favor of the methods taking List. (Robert Muir)
5454

55+
* GITHUB#14236: CombinedFieldQuery moved from lucene-sandbox to lucene-core.
56+
(Adrien Grand)
57+
5558
New Features
5659
---------------------
5760

5861
* GITHUB#14084, GITHUB#13635, GITHUB#13634, GITHUB#14170: Adds new `SeededKnnVectorQuery` query.
5962
These queries allow for the vector search entry points to be initialized via a `seed` query. This follows
6063
the research provided via https://arxiv.org/abs/2307.16779. (Sean MacAvaney, Ben Trent).
6164

65+
* GITHUB#13974: Introducing DocValuesMultiRangeQuery.SortedSetStabbingBuilder into sandbox. (Mikhail Khludnev)
6266

6367
Improvements
6468
---------------------
@@ -85,11 +89,18 @@ Improvements
8589
the wrapped analyzer's strategy to decide if components can be reused or need
8690
to be updated. (Mayya Sharipova)
8791

92+
* GITHUB#14192: Added support for RegExp to handle case insensitive matching
93+
across the full set of Unicode characters. (John Wagster)
94+
8895
* GITHUB#14239: Hunspell's option to tolerate affix rule count mismatches was
8996
improved to tolerate more instances of this problem. (Robert Muir)
9097

98+
9199
* GITHUB#14238: Improve test coverage of Dynamic Range Faceting. (John Houser)
92100

101+
* GITHUB#14213: Allowing indexing stored-only StoredField directly from DataInput. (Tim Brooks)
102+
103+
93104
Optimizations
94105
---------------------
95106

@@ -138,7 +149,7 @@ Other
138149

139150
* GITHUB#14091: Cover all DataType. (Lu Xugang)
140151

141-
* GITHUB#14130: Upgrade OpenNLP from 2.3.2 to 2.5.3, which transitively upgrades Slf4j
152+
* GITHUB#14130: Upgrade OpenNLP from 2.3.2 to 2.5.3, which transitively upgrades Slf4j
142153
from 1.7.36 to 2.0.16. (Michael Froh)
143154

144155
* GITHUB#14223: Fixed a flaky test TestKnnFloatVectorQuery.testFindFewer (Navneet Verma)

lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene50/compressing/Lucene50CompressingStoredFieldsReader.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
import org.apache.lucene.index.FieldInfos;
3131
import org.apache.lucene.index.IndexFileNames;
3232
import org.apache.lucene.index.SegmentInfo;
33+
import org.apache.lucene.index.StoredFieldDataInput;
3334
import org.apache.lucene.index.StoredFieldVisitor;
3435
import org.apache.lucene.store.AlreadyClosedException;
3536
import org.apache.lucene.store.ByteArrayDataInput;
@@ -290,7 +291,7 @@ private static void readField(DataInput in, StoredFieldVisitor visitor, FieldInf
290291
switch (bits & TYPE_MASK) {
291292
case BYTE_ARR:
292293
int length = in.readVInt();
293-
visitor.binaryField(info, in, length);
294+
visitor.binaryField(info, new StoredFieldDataInput(in, length));
294295
break;
295296
case STRING:
296297
visitor.stringField(info, in.readString());

lucene/core/src/java/org/apache/lucene/codecs/StoredFieldsWriter.java

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,8 @@
2828
import org.apache.lucene.index.FieldInfo;
2929
import org.apache.lucene.index.FieldInfos;
3030
import org.apache.lucene.index.MergeState;
31+
import org.apache.lucene.index.StoredFieldDataInput;
3132
import org.apache.lucene.index.StoredFieldVisitor;
32-
import org.apache.lucene.store.DataInput;
3333
import org.apache.lucene.util.Accountable;
3434
import org.apache.lucene.util.BytesRef;
3535

@@ -73,10 +73,11 @@ public void finishDocument() throws IOException {}
7373
/** Writes a stored double value. */
7474
public abstract void writeField(FieldInfo info, double value) throws IOException;
7575

76-
/** Writes a stored binary value from a {@link DataInput} and a {@code length}. */
77-
public void writeField(FieldInfo info, DataInput value, int length) throws IOException {
76+
/** Writes a stored binary value from a {@link StoredFieldDataInput}. */
77+
public void writeField(FieldInfo info, StoredFieldDataInput value) throws IOException {
78+
int length = value.getLength();
7879
final byte[] bytes = new byte[length];
79-
value.readBytes(bytes, 0, length);
80+
value.getDataInput().readBytes(bytes, 0, length);
8081
writeField(info, new BytesRef(bytes, 0, length));
8182
}
8283

@@ -191,8 +192,8 @@ public MergeVisitor(MergeState mergeState, int readerIndex) {
191192
}
192193

193194
@Override
194-
public void binaryField(FieldInfo fieldInfo, DataInput value, int length) throws IOException {
195-
writeField(remap(fieldInfo), value, length);
195+
public void binaryField(FieldInfo fieldInfo, StoredFieldDataInput value) throws IOException {
196+
writeField(remap(fieldInfo), value);
196197
}
197198

198199
@Override

lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingStoredFieldsReader.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@
5151
import org.apache.lucene.index.FieldInfos;
5252
import org.apache.lucene.index.IndexFileNames;
5353
import org.apache.lucene.index.SegmentInfo;
54+
import org.apache.lucene.index.StoredFieldDataInput;
5455
import org.apache.lucene.index.StoredFieldVisitor;
5556
import org.apache.lucene.store.AlreadyClosedException;
5657
import org.apache.lucene.store.ByteArrayDataInput;
@@ -261,7 +262,7 @@ private static void readField(DataInput in, StoredFieldVisitor visitor, FieldInf
261262
switch (bits & TYPE_MASK) {
262263
case BYTE_ARR:
263264
int length = in.readVInt();
264-
visitor.binaryField(info, in, length);
265+
visitor.binaryField(info, new StoredFieldDataInput(in, length));
265266
break;
266267
case STRING:
267268
visitor.stringField(info, in.readString());

lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingStoredFieldsWriter.java

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,9 @@
3434
import org.apache.lucene.index.IndexFileNames;
3535
import org.apache.lucene.index.MergeState;
3636
import org.apache.lucene.index.SegmentInfo;
37+
import org.apache.lucene.index.StoredFieldDataInput;
3738
import org.apache.lucene.store.ByteBuffersDataInput;
3839
import org.apache.lucene.store.ByteBuffersDataOutput;
39-
import org.apache.lucene.store.DataInput;
4040
import org.apache.lucene.store.DataOutput;
4141
import org.apache.lucene.store.Directory;
4242
import org.apache.lucene.store.IOContext;
@@ -311,12 +311,13 @@ public void writeField(FieldInfo info, BytesRef value) throws IOException {
311311
}
312312

313313
@Override
314-
public void writeField(FieldInfo info, DataInput value, int length) throws IOException {
314+
public void writeField(FieldInfo info, StoredFieldDataInput value) throws IOException {
315+
int length = value.getLength();
315316
++numStoredFieldsInDoc;
316317
final long infoAndBits = (((long) info.number) << TYPE_BITS) | BYTE_ARR;
317318
bufferedDocs.writeVLong(infoAndBits);
318319
bufferedDocs.writeVInt(length);
319-
bufferedDocs.copyBytes(value, length);
320+
bufferedDocs.copyBytes(value.getDataInput(), length);
320321
}
321322

322323
@Override

lucene/core/src/java/org/apache/lucene/document/Field.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
import org.apache.lucene.index.IndexOptions;
2828
import org.apache.lucene.index.IndexableField;
2929
import org.apache.lucene.index.IndexableFieldType;
30+
import org.apache.lucene.index.StoredFieldDataInput;
3031
import org.apache.lucene.util.BytesRef;
3132

3233
/**
@@ -619,6 +620,8 @@ public StoredValue storedValue() {
619620
return new StoredValue((double) fieldsData);
620621
} else if (fieldsData instanceof BytesRef) {
621622
return new StoredValue((BytesRef) fieldsData);
623+
} else if (fieldsData instanceof StoredFieldDataInput) {
624+
return new StoredValue((StoredFieldDataInput) fieldsData);
622625
} else if (fieldsData instanceof String) {
623626
return new StoredValue((String) fieldsData);
624627
} else {

lucene/core/src/java/org/apache/lucene/document/StoredField.java

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
package org.apache.lucene.document;
1818

1919
import org.apache.lucene.index.IndexReader; // javadocs
20+
import org.apache.lucene.index.StoredFieldDataInput;
2021
import org.apache.lucene.search.IndexSearcher; // javadocs
2122
import org.apache.lucene.util.BytesRef;
2223

@@ -105,6 +106,21 @@ public StoredField(String name, BytesRef value) {
105106
super(name, value, TYPE);
106107
}
107108

109+
/**
110+
* Create a stored-only field with the given data input value.
111+
*
112+
* @param name field name
113+
* @param value StoredFieldDataInput supplying the binary content and its length (not copied)
114+
* @throws IllegalArgumentException if the field name or value is null.
115+
*/
116+
public StoredField(String name, StoredFieldDataInput value) {
117+
super(name, TYPE);
118+
if (value == null) {
119+
throw new IllegalArgumentException("store field data input must not be null");
120+
}
121+
fieldsData = value;
122+
}
123+
108124
/**
109125
* Create a stored-only field with the given string value.
110126
*

lucene/core/src/java/org/apache/lucene/document/StoredValue.java

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
import java.util.Objects;
2020
import org.apache.lucene.index.IndexableField;
21+
import org.apache.lucene.index.StoredFieldDataInput;
2122
import org.apache.lucene.util.BytesRef;
2223

2324
/**
@@ -39,6 +40,8 @@ public enum Type {
3940
DOUBLE,
4041
/** Type of binary values. */
4142
BINARY,
43+
/** Type of data input values. */
44+
DATA_INPUT,
4245
/** Type of string values. */
4346
STRING;
4447
}
@@ -48,6 +51,7 @@ public enum Type {
4851
private long longValue;
4952
private float floatValue;
5053
private double doubleValue;
54+
private StoredFieldDataInput dataInput;
5155
private BytesRef binaryValue;
5256
private String stringValue;
5357

@@ -81,7 +85,13 @@ public StoredValue(BytesRef value) {
8185
binaryValue = Objects.requireNonNull(value);
8286
}
8387

84-
/** Ctor for binary values. */
88+
/** Ctor for data input values. */
89+
public StoredValue(StoredFieldDataInput value) {
90+
type = Type.DATA_INPUT;
91+
dataInput = Objects.requireNonNull(value);
92+
}
93+
94+
/** Ctor for string values. */
8595
public StoredValue(String value) {
8696
type = Type.STRING;
8797
stringValue = Objects.requireNonNull(value);
@@ -132,6 +142,14 @@ public void setBinaryValue(BytesRef value) {
132142
binaryValue = Objects.requireNonNull(value);
133143
}
134144

145+
/** Set a data input value. */
146+
public void setDataInputValue(StoredFieldDataInput value) {
147+
if (type != Type.DATA_INPUT) {
148+
throw new IllegalArgumentException("Cannot set a data input value on a " + type + " value");
149+
}
150+
dataInput = Objects.requireNonNull(value);
151+
}
152+
135153
/** Set a string value. */
136154
public void setStringValue(String value) {
137155
if (type != Type.STRING) {
@@ -180,6 +198,14 @@ public BytesRef getBinaryValue() {
180198
return binaryValue;
181199
}
182200

201+
/** Retrieve a data input value. */
202+
public StoredFieldDataInput getDataInputValue() {
203+
if (type != Type.DATA_INPUT) {
204+
throw new IllegalArgumentException("Cannot get a data input value on a " + type + " value");
205+
}
206+
return dataInput;
207+
}
208+
183209
/** Retrieve a string value. */
184210
public String getStringValue() {
185211
if (type != Type.STRING) {

lucene/core/src/java/org/apache/lucene/document/package-info.java

Lines changed: 56 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -46,10 +46,61 @@
4646
* make the process of taking a file and converting it into a Lucene {@link
4747
* org.apache.lucene.document.Document}.
4848
*
49-
* <p>The {@link org.apache.lucene.document.DateTools} is a utility class to make dates and times
50-
* searchable. {@link org.apache.lucene.document.IntPoint}, {@link
51-
* org.apache.lucene.document.LongPoint}, {@link org.apache.lucene.document.FloatPoint} and {@link
52-
* org.apache.lucene.document.DoublePoint} enable indexing of numeric values (and also dates) for
53-
* fast range queries using {@link org.apache.lucene.search.PointRangeQuery}
49+
* <h2>How to index ...</h2>
50+
*
51+
* <h3>Strings</h3>
52+
*
53+
* <p>{@link org.apache.lucene.document.TextField} allows indexing tokens from a String so that one
54+
* can perform full-text search on it. The way that the input is tokenized depends on the {@link
55+
* org.apache.lucene.analysis.Analyzer} that is configured on the {@link
56+
* org.apache.lucene.index.IndexWriterConfig}. TextField can also be optionally stored.
57+
*
58+
* <p>{@link org.apache.lucene.document.KeywordField} indexes whole values as a single term so that
59+
* one can perform exact search on it. It also records doc values to enable sorting or faceting on
60+
* this field. Finally, it also supports optionally storing the value.
61+
*
62+
* <p>If faceting or sorting are not required, {@link org.apache.lucene.document.StringField} is a
63+
* variant of {@link org.apache.lucene.document.KeywordField} that does not index doc values.
64+
*
65+
* <h3>Numbers</h3>
66+
*
67+
* <p>If a numeric field represents an identifier rather than a quantity and is more commonly
68+
* searched on single values than on ranges of values, it is generally recommended to index its
69+
* string representation via {@link org.apache.lucene.document.KeywordField} (or {@link
70+
* org.apache.lucene.document.StringField} if doc values are not necessary).
71+
*
72+
* <p>{@link org.apache.lucene.document.LongField}, {@link org.apache.lucene.document.IntField},
73+
* {@link org.apache.lucene.document.DoubleField} and {@link org.apache.lucene.document.FloatField}
74+
* index values in a points index for efficient range queries, and also create doc-values for these
75+
* fields for efficient sorting and faceting.
76+
*
77+
* <p>If the field is aimed at being used to tune the score, {@link
78+
* org.apache.lucene.document.FeatureField} helps internally store numeric data as term frequencies
79+
* in a way that makes it efficient to influence scoring at search time.
80+
*
81+
* <h3>Other types of structured data</h3>
82+
*
83+
* <p>It is recommended to index dates as a {@link org.apache.lucene.document.LongField} that stores
84+
* the number of milliseconds since Epoch.
85+
*
86+
* <p>IP fields can be indexed via {@link org.apache.lucene.document.InetAddressPoint} in addition
87+
* to a {@link org.apache.lucene.document.SortedDocValuesField} (if the field is single-valued) or
88+
* {@link org.apache.lucene.document.SortedSetDocValuesField} that stores the result of {@link
89+
* org.apache.lucene.document.InetAddressPoint#encode}.
90+
*
91+
* <h3>Dense numeric vectors</h3>
92+
*
93+
* <p>Dense numeric vectors can be indexed with {@link
94+
* org.apache.lucene.document.KnnFloatVectorField} if their dimensions are floating-point numbers or
95+
* {@link org.apache.lucene.document.KnnByteVectorField} if its dimensions are bytes. This allows
96+
* searching for nearest neighbors at search time.
97+
*
98+
* <h3>Sparse numeric vectors</h3>
99+
*
100+
* <p>To perform nearest-neighbor search on sparse vectors rather than dense vectors, each dimension
101+
* of the sparse vector should be indexed as a {@link org.apache.lucene.document.FeatureField}.
102+
* Queries can then be constructed as a {@link org.apache.lucene.search.BooleanQuery} with {@link
103+
* org.apache.lucene.document.FeatureField#newLinearQuery(String, String, float) linear queries} as
104+
* {@link org.apache.lucene.search.BooleanClause.Occur#SHOULD} clauses.
54105
*/
55106
package org.apache.lucene.document;

lucene/core/src/java/org/apache/lucene/index/SortingStoredFieldsConsumer.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -138,8 +138,8 @@ private static class CopyVisitor extends StoredFieldVisitor {
138138
}
139139

140140
@Override
141-
public void binaryField(FieldInfo fieldInfo, DataInput value, int length) throws IOException {
142-
writer.writeField(fieldInfo, value, length);
141+
public void binaryField(FieldInfo fieldInfo, StoredFieldDataInput value) throws IOException {
142+
writer.writeField(fieldInfo, value);
143143
}
144144

145145
@Override

0 commit comments

Comments
 (0)