Skip to content

Commit 5bf7bfe

Browse files
committed
iter
1 parent 07298cf commit 5bf7bfe

40 files changed

+4339
-754
lines changed

lucene/CHANGES.txt

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,8 +75,12 @@ Improvements
7575
mergeFactor segments together when the merge is below the min merge size.
7676
(Adrien Grand)
7777

78+
* GITHUB#14154: Add UnwrappingReuseStrategy for AnalyzerWrapper that consults
79+
the wrapped analyzer's strategy to decide if components can be reused or need
80+
to be updated. (Mayya Sharipova)
81+
7882
* GITHUB#14192: Added support for RegExp to handle case insensitive matching
79-
across the full set of Unicode characters
83+
across the full set of Unicode characters. (John Wagster)
8084

8185
Optimizations
8286
---------------------
@@ -89,7 +93,11 @@ Optimizations
8993
* GITHUB#14133: Dense blocks of postings are now encoded as bit sets.
9094
(Adrien Grand)
9195

92-
# GITHUB#14169: Optimize ContextQuery with big number of contexts. (Mayya Sharipova)
96+
* GITHUB#14169: Optimize ContextQuery with big number of contexts. (Mayya Sharipova)
97+
98+
* GITHUB#14181: Add updateable random scorer interface for knn vector index building. This allows
99+
for fewer objects to be created during indexing and simplifies internally used interfaces.
100+
(Ben Trent)
93101

94102
Bug Fixes
95103
---------------------

lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
import org.apache.lucene.util.IOUtils;
3636
import org.apache.lucene.util.hnsw.RandomVectorScorer;
3737
import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier;
38+
import org.apache.lucene.util.hnsw.UpdateableRandomVectorScorer;
3839
import org.openjdk.jmh.annotations.*;
3940

4041
@BenchmarkMode(Mode.Throughput)
@@ -57,7 +58,7 @@ public class VectorScorerBenchmark {
5758
IndexInput in;
5859
KnnVectorValues vectorValues;
5960
byte[] vec1, vec2;
60-
RandomVectorScorer scorer;
61+
UpdateableRandomVectorScorer scorer;
6162

6263
@Setup(Level.Iteration)
6364
public void init() throws IOException {
@@ -76,7 +77,8 @@ public void init() throws IOException {
7677
scorer =
7778
FlatVectorScorerUtil.getLucene99FlatVectorsScorer()
7879
.getRandomVectorScorerSupplier(DOT_PRODUCT, vectorValues)
79-
.scorer(0);
80+
.scorer();
81+
scorer.setScoringOrdinal(0);
8082
}
8183

8284
@TearDown

lucene/codecs/src/java/org/apache/lucene/codecs/bitvectors/FlatBitVectorsScorer.java

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -26,14 +26,14 @@
2626
import org.apache.lucene.util.VectorUtil;
2727
import org.apache.lucene.util.hnsw.RandomVectorScorer;
2828
import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier;
29+
import org.apache.lucene.util.hnsw.UpdateableRandomVectorScorer;
2930

3031
/** A bit vector scorer for scoring byte vectors. */
3132
public class FlatBitVectorsScorer implements FlatVectorsScorer {
3233
@Override
3334
public RandomVectorScorerSupplier getRandomVectorScorerSupplier(
3435
VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues)
3536
throws IOException {
36-
assert vectorValues instanceof ByteVectorValues;
3737
if (vectorValues instanceof ByteVectorValues byteVectorValues) {
3838
return new BitRandomVectorScorerSupplier(byteVectorValues);
3939
}
@@ -51,14 +51,13 @@ public RandomVectorScorer getRandomVectorScorer(
5151
public RandomVectorScorer getRandomVectorScorer(
5252
VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues, byte[] target)
5353
throws IOException {
54-
assert vectorValues instanceof ByteVectorValues;
5554
if (vectorValues instanceof ByteVectorValues byteVectorValues) {
5655
return new BitRandomVectorScorer(byteVectorValues, target);
5756
}
5857
throw new IllegalArgumentException("vectorValues must be an instance of ByteVectorValues");
5958
}
6059

61-
static class BitRandomVectorScorer implements RandomVectorScorer {
60+
static class BitRandomVectorScorer implements UpdateableRandomVectorScorer {
6261
private final ByteVectorValues vectorValues;
6362
private final int bitDimensions;
6463
private final byte[] query;
@@ -80,6 +79,11 @@ public int maxOrd() {
8079
return vectorValues.size();
8180
}
8281

82+
@Override
83+
public void setScoringOrdinal(int node) throws IOException {
84+
System.arraycopy(vectorValues.vectorValue(node), 0, query, 0, query.length);
85+
}
86+
8387
@Override
8488
public int ordToDoc(int ord) {
8589
return vectorValues.ordToDoc(ord);
@@ -93,24 +97,22 @@ public Bits getAcceptOrds(Bits acceptDocs) {
9397

9498
static class BitRandomVectorScorerSupplier implements RandomVectorScorerSupplier {
9599
protected final ByteVectorValues vectorValues;
96-
protected final ByteVectorValues vectorValues1;
97-
protected final ByteVectorValues vectorValues2;
100+
protected final ByteVectorValues targetVectors;
98101

99102
public BitRandomVectorScorerSupplier(ByteVectorValues vectorValues) throws IOException {
100103
this.vectorValues = vectorValues;
101-
this.vectorValues1 = vectorValues.copy();
102-
this.vectorValues2 = vectorValues.copy();
104+
this.targetVectors = vectorValues.copy();
103105
}
104106

105107
@Override
106-
public RandomVectorScorer scorer(int ord) throws IOException {
107-
byte[] query = vectorValues1.vectorValue(ord);
108-
return new BitRandomVectorScorer(vectorValues2, query);
108+
public UpdateableRandomVectorScorer scorer() throws IOException {
109+
byte[] query = new byte[vectorValues.dimension()];
110+
return new BitRandomVectorScorer(vectorValues, query);
109111
}
110112

111113
@Override
112114
public RandomVectorScorerSupplier copy() throws IOException {
113-
return new BitRandomVectorScorerSupplier(vectorValues.copy());
115+
return new BitRandomVectorScorerSupplier(vectorValues);
114116
}
115117
}
116118

lucene/core/src/java/org/apache/lucene/analysis/Analyzer.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -342,7 +342,7 @@ public void close() {
342342
* TokenFilter} which also serves as the {@link TokenStream} returned by {@link
343343
* Analyzer#tokenStream(String, Reader)}.
344344
*/
345-
public static final class TokenStreamComponents {
345+
public static class TokenStreamComponents {
346346
/** Original source of the tokens. */
347347
protected final Consumer<Reader> source;
348348

lucene/core/src/java/org/apache/lucene/analysis/AnalyzerWrapper.java

Lines changed: 67 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,6 @@
4141
* @since 4.0.0
4242
*/
4343
public abstract class AnalyzerWrapper extends Analyzer {
44-
4544
/**
4645
* Creates a new AnalyzerWrapper with the given reuse strategy.
4746
*
@@ -53,7 +52,10 @@ public abstract class AnalyzerWrapper extends Analyzer {
5352
* @see #getReuseStrategy()
5453
*/
5554
protected AnalyzerWrapper(ReuseStrategy reuseStrategy) {
56-
super(reuseStrategy);
55+
super(
56+
reuseStrategy instanceof DelegatingAnalyzerWrapper.DelegatingReuseStrategy
57+
? reuseStrategy
58+
: new UnwrappingReuseStrategy(reuseStrategy));
5759
}
5860

5961
/**
@@ -117,7 +119,10 @@ protected Reader wrapReaderForNormalization(String fieldName, Reader reader) {
117119

118120
@Override
119121
protected final TokenStreamComponents createComponents(String fieldName) {
120-
return wrapComponents(fieldName, getWrappedAnalyzer(fieldName).createComponents(fieldName));
122+
TokenStreamComponents wrappedComponents =
123+
getWrappedAnalyzer(fieldName).createComponents(fieldName);
124+
TokenStreamComponents wrapperComponents = wrapComponents(fieldName, wrappedComponents);
125+
return new TokenStreamComponentsWrapper(wrapperComponents, wrappedComponents);
121126
}
122127

123128
@Override
@@ -151,4 +156,63 @@ protected final Reader initReaderForNormalization(String fieldName, Reader reade
151156
protected final AttributeFactory attributeFactory(String fieldName) {
152157
return getWrappedAnalyzer(fieldName).attributeFactory(fieldName);
153158
}
159+
160+
/**
161+
* A {@link org.apache.lucene.analysis.Analyzer.ReuseStrategy} that checks the wrapped analyzer's
162+
* strategy for reusability. If the wrapped analyzer's strategy returns null, components need to
163+
* be re-created.
164+
*/
165+
public static final class UnwrappingReuseStrategy extends ReuseStrategy {
166+
private final ReuseStrategy reuseStrategy;
167+
168+
public UnwrappingReuseStrategy(ReuseStrategy reuseStrategy) {
169+
this.reuseStrategy = reuseStrategy;
170+
}
171+
172+
@Override
173+
public TokenStreamComponents getReusableComponents(Analyzer analyzer, String fieldName) {
174+
if (analyzer instanceof AnalyzerWrapper wrapper) {
175+
final Analyzer wrappedAnalyzer = wrapper.getWrappedAnalyzer(fieldName);
176+
if (wrappedAnalyzer.getReuseStrategy().getReusableComponents(wrappedAnalyzer, fieldName)
177+
== null) {
178+
return null;
179+
}
180+
}
181+
return reuseStrategy.getReusableComponents(analyzer, fieldName);
182+
}
183+
184+
@Override
185+
public void setReusableComponents(
186+
Analyzer analyzer, String fieldName, TokenStreamComponents components) {
187+
reuseStrategy.setReusableComponents(analyzer, fieldName, components);
188+
189+
if (analyzer instanceof AnalyzerWrapper wrapper) {
190+
assert components instanceof TokenStreamComponentsWrapper;
191+
final TokenStreamComponentsWrapper wrapperComponents =
192+
(TokenStreamComponentsWrapper) components;
193+
final Analyzer wrappedAnalyzer = wrapper.getWrappedAnalyzer(fieldName);
194+
wrappedAnalyzer
195+
.getReuseStrategy()
196+
.setReusableComponents(
197+
wrappedAnalyzer, fieldName, wrapperComponents.getWrappedComponents());
198+
}
199+
}
200+
}
201+
202+
/**
203+
* A {@link Analyzer.TokenStreamComponents} that decorates the wrapper with access to the wrapped
204+
* components.
205+
*/
206+
static final class TokenStreamComponentsWrapper extends TokenStreamComponents {
207+
private final TokenStreamComponents wrapped;
208+
209+
TokenStreamComponentsWrapper(TokenStreamComponents wrapper, TokenStreamComponents wrapped) {
210+
super(wrapper.getSource(), wrapper.getTokenStream());
211+
this.wrapped = wrapped;
212+
}
213+
214+
TokenStreamComponents getWrappedComponents() {
215+
return wrapped;
216+
}
217+
}
154218
}

lucene/core/src/java/org/apache/lucene/analysis/DelegatingAnalyzerWrapper.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,11 @@ protected final Reader wrapReaderForNormalization(String fieldName, Reader reade
6969
return super.wrapReaderForNormalization(fieldName, reader);
7070
}
7171

72-
private static final class DelegatingReuseStrategy extends ReuseStrategy {
72+
/**
73+
* A {@link org.apache.lucene.analysis.Analyzer.ReuseStrategy} that delegates to the wrapped
74+
* analyzer's strategy for reusability of components.
75+
*/
76+
static final class DelegatingReuseStrategy extends ReuseStrategy {
7377
DelegatingAnalyzerWrapper wrapper;
7478
private final ReuseStrategy fallbackStrategy;
7579

lucene/core/src/java/org/apache/lucene/analysis/package-info.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -343,7 +343,7 @@
343343
* undefined as long as they both leave from the same position. As a result, all synonyms of a
344344
* token would be considered to appear in exactly the same position as that token, and so
345345
* would they be seen by phrase and proximity searches. For multi-token synonyms to work
346-
* correctly, you should use {@code SynoymGraphFilter} at search time only.
346+
* correctly, you should use {@code SynonymGraphFilter} at search time only.
347347
* </ol>
348348
*
349349
* <h3>Token Position Length</h3>

lucene/core/src/java/org/apache/lucene/codecs/hnsw/DefaultFlatVectorScorer.java

Lines changed: 24 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import org.apache.lucene.index.VectorSimilarityFunction;
2525
import org.apache.lucene.util.hnsw.RandomVectorScorer;
2626
import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier;
27+
import org.apache.lucene.util.hnsw.UpdateableRandomVectorScorer;
2728

2829
/**
2930
* Default implementation of {@link FlatVectorsScorer}.
@@ -89,24 +90,29 @@ public String toString() {
8990
/** RandomVectorScorerSupplier for bytes vector */
9091
private static final class ByteScoringSupplier implements RandomVectorScorerSupplier {
9192
private final ByteVectorValues vectors;
92-
private final ByteVectorValues vectors1;
93-
private final ByteVectorValues vectors2;
93+
private final ByteVectorValues targetVectors;
9494
private final VectorSimilarityFunction similarityFunction;
9595

9696
private ByteScoringSupplier(
9797
ByteVectorValues vectors, VectorSimilarityFunction similarityFunction) throws IOException {
9898
this.vectors = vectors;
99-
vectors1 = vectors.copy();
100-
vectors2 = vectors.copy();
99+
targetVectors = vectors.copy();
101100
this.similarityFunction = similarityFunction;
102101
}
103102

104103
@Override
105-
public RandomVectorScorer scorer(int ord) {
106-
return new RandomVectorScorer.AbstractRandomVectorScorer(vectors) {
104+
public UpdateableRandomVectorScorer scorer() throws IOException {
105+
byte[] vector = new byte[vectors.dimension()];
106+
return new UpdateableRandomVectorScorer.AbstractUpdateableRandomVectorScorer(vectors) {
107+
108+
@Override
109+
public void setScoringOrdinal(int node) throws IOException {
110+
System.arraycopy(targetVectors.vectorValue(node), 0, vector, 0, vector.length);
111+
}
112+
107113
@Override
108114
public float score(int node) throws IOException {
109-
return similarityFunction.compare(vectors1.vectorValue(ord), vectors2.vectorValue(node));
115+
return similarityFunction.compare(vector, targetVectors.vectorValue(node));
110116
}
111117
};
112118
}
@@ -125,24 +131,28 @@ public String toString() {
125131
/** RandomVectorScorerSupplier for Float vector */
126132
private static final class FloatScoringSupplier implements RandomVectorScorerSupplier {
127133
private final FloatVectorValues vectors;
128-
private final FloatVectorValues vectors1;
129-
private final FloatVectorValues vectors2;
134+
private final FloatVectorValues targetVectors;
130135
private final VectorSimilarityFunction similarityFunction;
131136

132137
private FloatScoringSupplier(
133138
FloatVectorValues vectors, VectorSimilarityFunction similarityFunction) throws IOException {
134139
this.vectors = vectors;
135-
vectors1 = vectors.copy();
136-
vectors2 = vectors.copy();
140+
targetVectors = vectors.copy();
137141
this.similarityFunction = similarityFunction;
138142
}
139143

140144
@Override
141-
public RandomVectorScorer scorer(int ord) {
142-
return new RandomVectorScorer.AbstractRandomVectorScorer(vectors) {
145+
public UpdateableRandomVectorScorer scorer() throws IOException {
146+
float[] vector = new float[vectors.dimension()];
147+
return new UpdateableRandomVectorScorer.AbstractUpdateableRandomVectorScorer(vectors) {
143148
@Override
144149
public float score(int node) throws IOException {
145-
return similarityFunction.compare(vectors1.vectorValue(ord), vectors2.vectorValue(node));
150+
return similarityFunction.compare(vector, targetVectors.vectorValue(node));
151+
}
152+
153+
@Override
154+
public void setScoringOrdinal(int node) throws IOException {
155+
System.arraycopy(targetVectors.vectorValue(node), 0, vector, 0, vector.length);
146156
}
147157
};
148158
}

lucene/core/src/java/org/apache/lucene/codecs/hnsw/ScalarQuantizedVectorScorer.java

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import org.apache.lucene.util.VectorUtil;
2525
import org.apache.lucene.util.hnsw.RandomVectorScorer;
2626
import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier;
27+
import org.apache.lucene.util.hnsw.UpdateableRandomVectorScorer;
2728
import org.apache.lucene.util.quantization.QuantizedByteVectorValues;
2829
import org.apache.lucene.util.quantization.ScalarQuantizedVectorSimilarity;
2930
import org.apache.lucene.util.quantization.ScalarQuantizer;
@@ -147,11 +148,18 @@ private ScalarQuantizedRandomVectorScorerSupplier(
147148
}
148149

149150
@Override
150-
public RandomVectorScorer scorer(int ord) throws IOException {
151+
public UpdateableRandomVectorScorer scorer() throws IOException {
151152
final QuantizedByteVectorValues vectorsCopy = values.copy();
152-
final byte[] queryVector = values.vectorValue(ord);
153-
final float queryOffset = values.getScoreCorrectionConstant(ord);
154-
return new RandomVectorScorer.AbstractRandomVectorScorer(vectorsCopy) {
153+
byte[] queryVector = new byte[values.dimension()];
154+
return new UpdateableRandomVectorScorer.AbstractUpdateableRandomVectorScorer(vectorsCopy) {
155+
float queryOffset = 0;
156+
157+
@Override
158+
public void setScoringOrdinal(int node) throws IOException {
159+
System.arraycopy(vectorsCopy.vectorValue(node), 0, queryVector, 0, queryVector.length);
160+
queryOffset = vectorsCopy.getScoreCorrectionConstant(node);
161+
}
162+
155163
@Override
156164
public float score(int node) throws IOException {
157165
byte[] nodeVector = vectorsCopy.vectorValue(node);

lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,8 @@
5252
import org.apache.lucene.util.IOUtils;
5353
import org.apache.lucene.util.RamUsageEstimator;
5454
import org.apache.lucene.util.hnsw.CloseableRandomVectorScorerSupplier;
55-
import org.apache.lucene.util.hnsw.RandomVectorScorer;
5655
import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier;
56+
import org.apache.lucene.util.hnsw.UpdateableRandomVectorScorer;
5757

5858
/**
5959
* Writes vector values to index segments.
@@ -507,8 +507,8 @@ static final class FlatCloseableRandomVectorScorerSupplier
507507
}
508508

509509
@Override
510-
public RandomVectorScorer scorer(int ord) throws IOException {
511-
return supplier.scorer(ord);
510+
public UpdateableRandomVectorScorer scorer() throws IOException {
511+
return supplier.scorer();
512512
}
513513

514514
@Override

0 commit comments

Comments
 (0)