Skip to content

Commit

Permalink
Merge branch 'main' into vector_bpv24
Browse files Browse the repository at this point in the history
  • Loading branch information
gf2121 authored Feb 10, 2025
2 parents 6273aa9 + fe50684 commit 285ae58
Show file tree
Hide file tree
Showing 115 changed files with 2,723 additions and 1,319 deletions.
130 changes: 130 additions & 0 deletions .github/labeler.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
# This file defines module label mappings for the Lucene project.
# Each module is associated with a set of file globs that, when matched,
# will trigger the corresponding label to be applied to pull requests.
#
# This configuration is used by the workflow defined in .github/workflows/label-pull-request.yml.
# If we add new labels or refactor modules, we will need to update the file globs here to ensure that the correct labels are applied.

# For more information on how to define globs, visit: https://github.com/actions/labeler

module:analysis:
- changed-files:
- any-glob-to-any-file: 'lucene/analysis/**'

module:benchmark:
- changed-files:
- any-glob-to-any-file: 'lucene/benchmark/**'

module:classification:
- changed-files:
- any-glob-to-any-file: 'lucene/classification/**'

module:core/codecs:
- changed-files:
- any-glob-to-any-file: ['lucene/core/src/java/org/apache/lucene/codecs/**', 'lucene/core/src/test/org/apache/lucene/codecs/**']

module:core/FSTs:
- changed-files:
- any-glob-to-any-file: ['lucene/core/src/java/org/apache/lucene/util/fst/**', 'lucene/core/src/test/org/apache/lucene/util/fst/**']

module:core/hnsw:
- changed-files:
- any-glob-to-any-file: ['lucene/core/src/java/org/apache/lucene/util/hnsw/**', 'lucene/core/src/test/org/apache/lucene/util/hnsw/**']

module:core/index:
- changed-files:
- any-glob-to-any-file: ['lucene/core/src/java/org/apache/lucene/index/**', 'lucene/core/src/test/org/apache/lucene/index/**']

module:core/search:
- changed-files:
- any-glob-to-any-file: ['lucene/core/src/java/org/apache/lucene/search/**', 'lucene/core/src/test/org/apache/lucene/search/**']

module:core/store:
- changed-files:
- any-glob-to-any-file: ['lucene/core/src/java/org/apache/lucene/store/**', 'lucene/core/src/test/org/apache/lucene/store/**']

module:core/other:
- all:
- changed-files:
- any-glob-to-any-file: ['lucene/core/**']
- all-globs-to-all-files:
- '!lucene/core/src/java/org/apache/lucene/codecs/**'
- '!lucene/core/src/test/org/apache/lucene/codecs/**'
- '!lucene/core/src/java/org/apache/lucene/util/fst/**'
- '!lucene/core/src/test/org/apache/lucene/util/fst/**'
- '!lucene/core/src/java/org/apache/lucene/util/hnsw/**'
- '!lucene/core/src/test/org/apache/lucene/util/hnsw/**'
- '!lucene/core/src/java/org/apache/lucene/index/**'
- '!lucene/core/src/test/org/apache/lucene/index/**'
- '!lucene/core/src/java/org/apache/lucene/search/**'
- '!lucene/core/src/test/org/apache/lucene/search/**'
- '!lucene/core/src/java/org/apache/lucene/store/**'
- '!lucene/core/src/test/org/apache/lucene/store/**'

module:demo:
- changed-files:
- any-glob-to-any-file: 'lucene/demo/**'

module:expressions:
- changed-files:
- any-glob-to-any-file: 'lucene/expressions/**'

module:facet:
- changed-files:
- any-glob-to-any-file: 'lucene/facet/**'

module:grouping:
- changed-files:
- any-glob-to-any-file: 'lucene/grouping/**'

module:highlighter:
- changed-files:
- any-glob-to-any-file: 'lucene/highlighter/**'

module:join:
- changed-files:
- any-glob-to-any-file: 'lucene/join/**'

module:luke:
- changed-files:
- any-glob-to-any-file: 'lucene/luke/**'

module:misc:
- changed-files:
- any-glob-to-any-file: 'lucene/misc/**'

module:monitor:
- changed-files:
- any-glob-to-any-file: 'lucene/monitor/**'

module:queries:
- changed-files:
- any-glob-to-any-file: 'lucene/queries/**'

module:queryparser:
- changed-files:
- any-glob-to-any-file: 'lucene/queryparser/**'

module:replicator:
- changed-files:
- any-glob-to-any-file: 'lucene/replicator/**'

module:sandbox:
- changed-files:
- any-glob-to-any-file: 'lucene/sandbox/**'

module:spatial:
- changed-files:
- any-glob-to-any-file: ['lucene/spatial-extras/**', 'lucene/spatial-test-fixtures/**']

module:spatial3d:
- changed-files:
- any-glob-to-any-file: 'lucene/spatial3d/**'

module:suggest:
- changed-files:
- any-glob-to-any-file: 'lucene/suggest/**'

module:test-framework:
- changed-files:
- any-glob-to-any-file: 'lucene/test-framework/**'
23 changes: 23 additions & 0 deletions .github/workflows/label-pull-request.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# This file defines the workflow for labeling pull requests with module tags based on the changed files in the PR.
# It uses the `actions/labeler` GitHub Action to achieve the same.
#
# The workflow is triggered on the `pull_request_target` event, which ensures the workflow is only run in the context of the pull request's base branch.
# The job `labeler` runs on `ubuntu-latest` and has permissions to read contents and write pull requests.
#
# For more information on the `actions/labeler` GitHub Action, refer to https://github.com/actions/labeler

name: "Pull Request Labeler"
run-name: Labelling pull request with module tags based on changed files in the PR
on:
- pull_request_target

jobs:
labeler:
permissions:
contents: read
pull-requests: write
runs-on: ubuntu-latest
steps:
- uses: actions/labeler@v5
with:
sync-labels: true
4 changes: 3 additions & 1 deletion gradle/testing/randomization.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,9 @@ allprojects {
[propName: 'tests.asserts', value: "true", description: "Enables or disables assertions mode."],
[propName: 'tests.infostream', value: false, description: "Enables or disables infostream logs."],
[propName: 'tests.leaveTemporary', value: false, description: "Leave temporary directories after tests complete."],
[propName: 'tests.useSecurityManager', value: true, description: "Control security manager in tests.", buildOnly: true],
[propName: 'tests.useSecurityManager',
value: { -> rootProject.ext.runtimeJavaVersion <= JavaVersion.VERSION_23 ? 'true' : 'false' },
description: "Control security manager in tests.", buildOnly: true],
// component randomization
[propName: 'tests.codec', value: "random", description: "Sets the codec tests should run with."],
[propName: 'tests.directory', value: "random", description: "Sets the Directory implementation tests should run with."],
Expand Down
34 changes: 30 additions & 4 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,16 @@ API Changes
---------------------
* GITHUB#11023: Removing deprecated parameters from CheckIndex. (Jakub Slowinski)

* GITHUB#14165: TieredMergePolicy's maxMergeAtOnce parameter was removed. (Adrien Grand)

New Features
---------------------
* GITHUB#14097: Binary partitioning merge policy over float-valued vector field. (Mike Sokolov)

Improvements
---------------------

* GITHUB#266: TieredMergePolicy's maxMergeAtOnce default value was changed from 10 to 30. (Adrien Grand)
(No changes)

Optimizations
---------------------
Expand Down Expand Up @@ -44,11 +46,14 @@ API Changes
* GITHUB#14134: Added Bits#applyMask API to help apply live docs as a mask on a
bit set of matches. (Adrien Grand)

* GITHUB#14209: Deprecate Operations.union(Automaton,Automaton) and
concatenate(Automaton,Automaton) in favor of the methods taking List. (Robert Muir)

New Features
---------------------

* GITHUB#14084, GITHUB#13635, GITHUB#13634: Adds new `SeededKnnByteVectorQuery` and `SeededKnnFloatVectorQuery`
queries. These queries allow for the vector search entry points to be initialized via a `seed` query. This follows
* GITHUB#14084, GITHUB#13635, GITHUB#13634, GITHUB#14170: Adds new `SeededKnnVectorQuery` query.
This query allows the vector search entry points to be initialized via a `seed` query. This follows
the research provided via https://arxiv.org/abs/2307.16779. (Sean MacAvaney, Ben Trent).


Expand All @@ -69,6 +74,15 @@ Improvements
individual and bulk data retrieval overloads; avoid double buffering with
slices. (Chris Hegarty)

* GITHUB#14166: Log(ByteSize|Doc)MergePolicy now allow merging more than
mergeFactor segments together when the merge is below the min merge size.
(Adrien Grand)

* GITHUB#14154: Add UnwrappingReuseStrategy for AnalyzerWrapper that consults
the wrapped analyzer's strategy to decide if components can be reused or need
to be updated. (Mayya Sharipova)


Optimizations
---------------------

Expand All @@ -80,7 +94,16 @@ Optimizations
* GITHUB#14133: Dense blocks of postings are now encoded as bit sets.
(Adrien Grand)

# GITHUB#14169: Optimize ContextQuery with big number of contexts. (Mayya Sharipova)
* GITHUB#14169: Optimize ContextQuery with big number of contexts. (Mayya Sharipova)

* GITHUB#14181: Add updateable random scorer interface for knn vector index building. This allows
for fewer objects to be created during indexing and simplifies internally used interfaces.
(Ben Trent)

* GITHUB#14193: Add Automata.makeCharSet() and makeCharClass() that return minimal DFA
for lists of characters and ranges. Use them in RegExp parser. (Robert Muir)

* GITHUB#14176: Reduce virtual calls when visiting bpv24-encoded doc ids in BKD leaves. (Guo Feng)

* GITHUB#14203: Use Vector API to decode BKD docIds. (Guo Feng)

Expand All @@ -104,6 +127,9 @@ Other

* GITHUB#14091: Cover all DataType. (Lu Xugang)

* GITHUB#14130: Upgrade OpenNLP from 2.3.2 to 2.5.3, which transitively upgrades Slf4j
from 1.7.36 to 2.0.16. (Michael Froh)

======================= Lucene 10.1.0 =======================

API Changes
Expand Down
7 changes: 7 additions & 0 deletions lucene/MIGRATE.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,13 @@

# Apache Lucene Migration Guide

## Migration from Lucene 10.x to Lucene 11.0

### TieredMergePolicy#setMaxMergeAtOnce removed

This parameter has no replacement: TieredMergePolicy no longer bounds the
number of segments that may be merged together.

## Migration from Lucene 9.x to Lucene 10.0

### DataInput#readVLong() may now read negative vlongs
Expand Down
1 change: 1 addition & 0 deletions lucene/analysis.tests/src/test/module-info.java
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
requires org.apache.lucene.analysis.smartcn;
requires org.apache.lucene.analysis.stempel;
requires org.apache.lucene.test_framework;
requires org.apache.commons.codec;

exports org.apache.lucene.analysis.tests;
}
29 changes: 29 additions & 0 deletions lucene/analysis/opennlp/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,32 @@ dependencies {

moduleTestImplementation project(':lucene:test-framework')
}

ext {
testModelDataDir = file('src/tools/test-model-data')
testsUserDir = file('src/test-files')
testModelDir = file("${testsUserDir}/org/apache/lucene/analysis/opennlp")
}

// Registers the 'trainTestModels' task, which trains the small models used by the unit tests.
// Each model is produced by one trainModel(...) call; outputs are written under testModelDir.
tasks.register('trainTestModels') {
description = 'Train all small test models for unit tests'
doLast {
// Create the output directory before any model file is written into it.
mkdir testModelDir
// Arguments: trainer command, language, training data file, output model file name.
trainModel('SentenceDetectorTrainer', 'en', 'sentences.txt', 'en-test-sent.bin')
trainModel('TokenizerTrainer', 'en', 'tokenizer.txt', 'en-test-tokenizer.bin')
trainModel('POSTaggerTrainer', 'en', 'pos.txt', 'en-test-pos-maxent.bin')
trainModel('ChunkerTrainerME', 'en', 'chunks.txt', 'en-test-chunker.bin')
// The name finder is the only trainer that receives extra arguments (a '-params' file).
trainModel('TokenNameFinderTrainer', 'en', 'ner.txt', 'en-test-ner.bin', ['-params', 'ner_TrainerParams.txt'])
trainModel('LemmatizerTrainerME', 'en', 'lemmas.txt', 'en-test-lemmatizer.bin')
}
}

// Trains a single model by launching the OpenNLP command-line entry point in a separate JVM.
//   command   - trainer command, passed as the first CLI argument
//   lang      - value passed after '-lang'
//   data      - value passed after '-data'; the process runs with testModelDataDir as working directory
//   model     - output file name; written under testModelDir and passed after '-model'
//   extraArgs - additional CLI arguments appended after the standard ones (defaults to none)
def trainModel(String command, String lang, String data, String model, List extraArgs = []) {
javaexec {
// Uses the main source set's compile classpath to locate the CLI class.
classpath = sourceSets.main.compileClasspath
mainClass = 'opennlp.tools.cmdline.CLI'
workingDir = testModelDataDir
args = [command, '-lang', lang, '-data', data, '-model', "${testModelDir}/${model}"] + extraArgs
}
}

Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@

package org.apache.lucene.analysis.opennlp.tools;

import java.io.IOException;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTagFormat;
import opennlp.tools.postag.POSTagger;
import opennlp.tools.postag.POSTaggerME;

Expand All @@ -29,8 +29,8 @@
public class NLPPOSTaggerOp {
private final POSTagger tagger;

public NLPPOSTaggerOp(POSModel model) throws IOException {
tagger = new POSTaggerME(model);
public NLPPOSTaggerOp(POSModel model) {
tagger = new POSTaggerME(model, POSTagFormat.PENN);
}

public synchronized String[] getPOSTags(String[] words) {
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ public class TestOpenNLPChunkerFilterFactory extends BaseTokenStreamTestCase {
8, 15, 17, 21, 23, 29, 30, 39, 46, 48, 49, 51, 57, 58
};
private static final String[] SENTENCES_chunks = {
"B-NP", "I-NP", "I-NP", "I-NP", "I-NP", "I-NP", "O", "B-NP", "I-NP", "I-NP", "O", "B-NP",
"B-NP", "I-NP", "I-NP", "B-VP", "B-NP", "I-NP", "O", "B-NP", "I-NP", "I-NP", "O", "B-NP",
"I-NP", "O"
};

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.hnsw.RandomVectorScorer;
import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier;
import org.apache.lucene.util.hnsw.UpdateableRandomVectorScorer;
import org.openjdk.jmh.annotations.*;

@BenchmarkMode(Mode.Throughput)
Expand All @@ -57,7 +58,7 @@ public class VectorScorerBenchmark {
IndexInput in;
KnnVectorValues vectorValues;
byte[] vec1, vec2;
RandomVectorScorer scorer;
UpdateableRandomVectorScorer scorer;

@Setup(Level.Iteration)
public void init() throws IOException {
Expand All @@ -76,7 +77,8 @@ public void init() throws IOException {
scorer =
FlatVectorScorerUtil.getLucene99FlatVectorsScorer()
.getRandomVectorScorerSupplier(DOT_PRODUCT, vectorValues)
.scorer(0);
.scorer();
scorer.setScoringOrdinal(0);
}

@TearDown
Expand Down
Loading

0 comments on commit 285ae58

Please sign in to comment.