Merge branch 'main' into vector_bpv24

apache · Feb 10, 2025 · 285ae58 · 285ae58
2 parents 6273aa9 + fe50684
commit 285ae58
Show file tree

Hide file tree

Showing 115 changed files with 2,723 additions and 1,319 deletions.
diff --git a/.github/labeler.yml b/.github/labeler.yml
@@ -0,0 +1,130 @@
+# This file defines module label mappings for the Lucene project.
+# Each module is associated with a set of file globs that, when matched,
+# will trigger the corresponding label to be applied to pull requests.
+#
+# This configuration is used by the workflow defined in .github/workflows/label-pull-request.yml.
+# When adding new labels or refactoring modules, update the file globs in this file to ensure that the correct labels are applied.
+
+# For more information on how to define globs, visit: https://github.com/actions/labeler
+
+module:analysis:
+  - changed-files:
+      - any-glob-to-any-file: 'lucene/analysis/**'
+
+module:benchmark:
+  - changed-files:
+      - any-glob-to-any-file: 'lucene/benchmark/**'
+
+module:classification:
+  - changed-files:
+      - any-glob-to-any-file: 'lucene/classification/**'
+
+module:core/codecs:
+  - changed-files:
+      - any-glob-to-any-file: ['lucene/core/src/java/org/apache/lucene/codecs/**', 'lucene/core/src/test/org/apache/lucene/codecs/**']
+
+module:core/FSTs:
+  - changed-files:
+      - any-glob-to-any-file: ['lucene/core/src/java/org/apache/lucene/util/fst/**', 'lucene/core/src/test/org/apache/lucene/util/fst/**']
+
+module:core/hnsw:
+  - changed-files:
+      - any-glob-to-any-file: ['lucene/core/src/java/org/apache/lucene/util/hnsw/**', 'lucene/core/src/test/org/apache/lucene/util/hnsw/**']
+
+module:core/index:
+  - changed-files:
+      - any-glob-to-any-file: ['lucene/core/src/java/org/apache/lucene/index/**', 'lucene/core/src/test/org/apache/lucene/index/**']
+
+module:core/search:
+  - changed-files:
+      - any-glob-to-any-file: ['lucene/core/src/java/org/apache/lucene/search/**', 'lucene/core/src/test/org/apache/lucene/search/**']
+
+module:core/store:
+  - changed-files:
+      - any-glob-to-any-file: ['lucene/core/src/java/org/apache/lucene/store/**', 'lucene/core/src/test/org/apache/lucene/store/**']
+
+module:core/other:
+  - all:
+      - changed-files:
+          - any-glob-to-any-file: ['lucene/core/**']
+          - all-globs-to-all-files:
+              - '!lucene/core/src/java/org/apache/lucene/codecs/**'
+              - '!lucene/core/src/test/org/apache/lucene/codecs/**'
+              - '!lucene/core/src/java/org/apache/lucene/util/fst/**'
+              - '!lucene/core/src/test/org/apache/lucene/util/fst/**'
+              - '!lucene/core/src/java/org/apache/lucene/util/hnsw/**'
+              - '!lucene/core/src/test/org/apache/lucene/util/hnsw/**'
+              - '!lucene/core/src/java/org/apache/lucene/index/**'
+              - '!lucene/core/src/test/org/apache/lucene/index/**'
+              - '!lucene/core/src/java/org/apache/lucene/search/**'
+              - '!lucene/core/src/test/org/apache/lucene/search/**'
+              - '!lucene/core/src/java/org/apache/lucene/store/**'
+              - '!lucene/core/src/test/org/apache/lucene/store/**'
+
+module:demo:
+  - changed-files:
+      - any-glob-to-any-file: 'lucene/demo/**'
+
+module:expressions:
+  - changed-files:
+      - any-glob-to-any-file: 'lucene/expressions/**'
+
+module:facet:
+  - changed-files:
+      - any-glob-to-any-file: 'lucene/facet/**'
+
+module:grouping:
+  - changed-files:
+      - any-glob-to-any-file: 'lucene/grouping/**'
+
+module:highlighter:
+  - changed-files:
+      - any-glob-to-any-file: 'lucene/highlighter/**'
+
+module:join:
+  - changed-files:
+      - any-glob-to-any-file: 'lucene/join/**'
+
+module:luke:
+  - changed-files:
+      - any-glob-to-any-file: 'lucene/luke/**'
+
+module:misc:
+  - changed-files:
+      - any-glob-to-any-file: 'lucene/misc/**'
+
+module:monitor:
+  - changed-files:
+      - any-glob-to-any-file: 'lucene/monitor/**'
+
+module:queries:
+  - changed-files:
+      - any-glob-to-any-file: 'lucene/queries/**'
+
+module:queryparser:
+  - changed-files:
+      - any-glob-to-any-file: 'lucene/queryparser/**'
+
+module:replicator:
+  - changed-files:
+      - any-glob-to-any-file: 'lucene/replicator/**'
+
+module:sandbox:
+  - changed-files:
+      - any-glob-to-any-file: 'lucene/sandbox/**'
+
+module:spatial:
+  - changed-files:
+      - any-glob-to-any-file: ['lucene/spatial-extras/**', 'lucene/spatial-test-fixtures/**']
+
+module:spatial3d:
+  - changed-files:
+      - any-glob-to-any-file: 'lucene/spatial3d/**'
+
+module:suggest:
+  - changed-files:
+      - any-glob-to-any-file: 'lucene/suggest/**'
+
+module:test-framework:
+  - changed-files:
+      - any-glob-to-any-file: 'lucene/test-framework/**'
diff --git a/.github/workflows/label-pull-request.yml b/.github/workflows/label-pull-request.yml
@@ -0,0 +1,23 @@
+# This file defines the workflow for labeling pull requests with module tags based on the changed files in the PR.
+# It uses the `actions/labeler` GitHub Action to achieve the same.
+#
+# The workflow is triggered on the `pull_request_target` event, which ensures the workflow is run from the base branch (`main`) rather than from the pull request's branch.
+# The job `labeler` runs on `ubuntu-latest` and has permissions to read contents and write pull requests.
+#
+# For more information on the `actions/labeler` GitHub Action, refer to https://github.com/actions/labeler
+
+name: "Pull Request Labeler"
+run-name: Labelling pull request with module tags based on changed files in the PR
+on:
+  - pull_request_target
+
+jobs:
+  labeler:
+    permissions:
+      contents: read
+      pull-requests: write
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/labeler@v5
+        with:
+          sync-labels: true
diff --git a/gradle/testing/randomization.gradle b/gradle/testing/randomization.gradle
@@ -76,7 +76,9 @@ allprojects {
           [propName: 'tests.asserts', value: "true", description: "Enables or disables assertions mode."],
           [propName: 'tests.infostream', value: false, description: "Enables or disables infostream logs."],
           [propName: 'tests.leaveTemporary', value: false, description: "Leave temporary directories after tests complete."],
-          [propName: 'tests.useSecurityManager', value: true, description: "Control security manager in tests.", buildOnly: true],
+          [propName: 'tests.useSecurityManager',
+           value: { -> rootProject.ext.runtimeJavaVersion <= JavaVersion.VERSION_23 ? 'true' : 'false' },
+           description: "Control security manager in tests.", buildOnly: true],
           // component randomization
           [propName: 'tests.codec', value: "random", description: "Sets the codec tests should run with."],
           [propName: 'tests.directory', value: "random", description: "Sets the Directory implementation tests should run with."],

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
@@ -9,14 +9,16 @@ API Changes
 ---------------------
 * GITHUB#11023: Removing deprecated parameters from CheckIndex. (Jakub Slowinski)
 
+* GITHUB#14165: TieredMergePolicy's maxMergeAtOnce parameter was removed. (Adrien Grand)
+
 New Features
 ---------------------
 * GITHUB#14097: Binary partitioning merge policy over float-valued vector field. (Mike Sokolov)
 
 Improvements
 ---------------------
 
-* GITHUB#266: TieredMergePolicy's maxMergeAtOnce default value was changed from 10 to 30. (Adrien Grand)
+(No changes)
 
 Optimizations
 ---------------------
@@ -44,11 +46,14 @@ API Changes
 * GITHUB#14134: Added Bits#applyMask API to help apply live docs as a mask on a
   bit set of matches. (Adrien Grand)
 
+* GITHUB#14209: Deprecate Operations.union(Automaton,Automaton) and 
+  concatenate(Automaton,Automaton) in favor of the methods taking List.  (Robert Muir)
+
 New Features
 ---------------------
 
-* GITHUB#14084, GITHUB#13635, GITHUB#13634: Adds new `SeededKnnByteVectorQuery` and `SeededKnnFloatVectorQuery`
-  queries. These queries allow for the vector search entry points to be initialized via a `seed` query. This follows
+* GITHUB#14084, GITHUB#13635, GITHUB#13634, GITHUB#14170: Adds new `SeededKnnVectorQuery` query.
+  This query allows the vector search entry points to be initialized via a `seed` query. This follows
   the research provided via https://arxiv.org/abs/2307.16779. (Sean MacAvaney, Ben Trent).
 
 
@@ -69,6 +74,15 @@ Improvements
   individual and bulk data retrieval overloads; avoid double buffering with
   slices. (Chris Hegarty)
 
+* GITHUB#14166: Log(ByteSize|Doc)MergePolicy now allow merging more than
+  mergeFactor segments together when the merge is below the min merge size.
+  (Adrien Grand)
+
+* GITHUB#14154: Add UnwrappingReuseStrategy for AnalyzerWrapper that consults
+  the wrapped analyzer's strategy to decide if components can be reused or need
+  to be updated. (Mayya Sharipova)
+
+
 Optimizations
 ---------------------
 
@@ -80,7 +94,16 @@ Optimizations
 * GITHUB#14133: Dense blocks of postings are now encoded as bit sets.
   (Adrien Grand)
 
-# GITHUB#14169: Optimize ContextQuery with big number of contexts. (Mayya Sharipova)
+* GITHUB#14169: Optimize ContextQuery with big number of contexts. (Mayya Sharipova)
+
+* GITHUB#14181: Add updateable random scorer interface for knn vector index building. This allows
+  for fewer objects to be created during indexing and simplifies internally used interfaces.
+  (Ben Trent)
+
+* GITHUB#14193: Add Automata.makeCharSet() and makeCharClass() that return minimal DFA
+  for lists of characters and ranges. Use them in RegExp parser.  (Robert Muir)
+
+* GITHUB#14176: Reduce virtual calls when visiting bpv24-encoded doc ids in BKD leaves. (Guo Feng)
 
 * GITHUB#14203: Use Vector API to decode BKD docIds. (Guo Feng)
 
@@ -104,6 +127,9 @@ Other
 
 * GITHUB#14091: Cover all DataType. (Lu Xugang)
 
+* GITHUB#14130: Upgrade OpenNLP from 2.3.2 to 2.5.3, which transitively upgrades Slf4j 
+  from 1.7.36 to 2.0.16. (Michael Froh)
+
 ======================= Lucene 10.1.0 =======================
 
 API Changes

diff --git a/lucene/MIGRATE.md b/lucene/MIGRATE.md
@@ -17,6 +17,13 @@
 
 # Apache Lucene Migration Guide
 
+## Migration from Lucene 10.x to Lucene 11.0
+
+### TieredMergePolicy#setMaxMergeAtOnce removed
+
+This parameter has no replacement: TieredMergePolicy no longer bounds the
+number of segments that may be merged together.
+
 ## Migration from Lucene 9.x to Lucene 10.0
 
 ### DataInput#readVLong() may now read negative vlongs

diff --git a/lucene/analysis.tests/src/test/module-info.java b/lucene/analysis.tests/src/test/module-info.java
@@ -33,6 +33,7 @@
   requires org.apache.lucene.analysis.smartcn;
   requires org.apache.lucene.analysis.stempel;
   requires org.apache.lucene.test_framework;
+  requires org.apache.commons.codec;
 
   exports org.apache.lucene.analysis.tests;
 }
diff --git a/lucene/analysis/opennlp/build.gradle b/lucene/analysis/opennlp/build.gradle
@@ -26,3 +26,32 @@ dependencies {
 
   moduleTestImplementation project(':lucene:test-framework')
 }
+
+ext {
+  testModelDataDir = file('src/tools/test-model-data')
+  testsUserDir = file('src/test-files')
+  testModelDir = file("${testsUserDir}/org/apache/lucene/analysis/opennlp")
+}
+
+tasks.register('trainTestModels') {
+  description = 'Train all small test models for unit tests'
+  doLast {
+    mkdir testModelDir
+    trainModel('SentenceDetectorTrainer', 'en', 'sentences.txt', 'en-test-sent.bin')
+    trainModel('TokenizerTrainer', 'en', 'tokenizer.txt', 'en-test-tokenizer.bin')
+    trainModel('POSTaggerTrainer', 'en', 'pos.txt', 'en-test-pos-maxent.bin')
+    trainModel('ChunkerTrainerME', 'en', 'chunks.txt', 'en-test-chunker.bin')
+    trainModel('TokenNameFinderTrainer', 'en', 'ner.txt', 'en-test-ner.bin', ['-params', 'ner_TrainerParams.txt'])
+    trainModel('LemmatizerTrainerME', 'en', 'lemmas.txt', 'en-test-lemmatizer.bin')
+  }
+}
+
+def trainModel(String command, String lang, String data, String model, List extraArgs = []) {
+  javaexec {
+    classpath = sourceSets.main.compileClasspath
+    mainClass = 'opennlp.tools.cmdline.CLI'
+    workingDir = testModelDataDir
+    args = [command, '-lang', lang, '-data', data, '-model', "${testModelDir}/${model}"] + extraArgs
+  }
+}
+
diff --git a/...ne/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPPOSTaggerOp.java b/...ne/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPPOSTaggerOp.java
@@ -17,8 +17,8 @@
 
 package org.apache.lucene.analysis.opennlp.tools;
 
-import java.io.IOException;
 import opennlp.tools.postag.POSModel;
+import opennlp.tools.postag.POSTagFormat;
 import opennlp.tools.postag.POSTagger;
 import opennlp.tools.postag.POSTaggerME;
 
@@ -29,8 +29,8 @@
 public class NLPPOSTaggerOp {
   private final POSTagger tagger;
 
-  public NLPPOSTaggerOp(POSModel model) throws IOException {
-    tagger = new POSTaggerME(model);
+  public NLPPOSTaggerOp(POSModel model) {
+    tagger = new POSTaggerME(model, POSTagFormat.PENN);
   }
 
   public synchronized String[] getPOSTags(String[] words) {

diff --git a/...ne/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-chunker.bin b/...ne/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-chunker.bin
diff --git a/...analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-lemmatizer.bin b/...analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-lemmatizer.bin
diff --git a/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-ner.bin b/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-ner.bin
diff --git a/...analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-pos-maxent.bin b/...analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-pos-maxent.bin
diff --git a/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-sent.bin b/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-sent.bin
diff --git a/.../analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-tokenizer.bin b/.../analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-tokenizer.bin
diff --git a/.../opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPChunkerFilterFactory.java b/.../opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPChunkerFilterFactory.java
@@ -58,7 +58,7 @@ public class TestOpenNLPChunkerFilterFactory extends BaseTokenStreamTestCase {
     8, 15, 17, 21, 23, 29, 30, 39, 46, 48, 49, 51, 57, 58
   };
   private static final String[] SENTENCES_chunks = {
-    "B-NP", "I-NP", "I-NP", "I-NP", "I-NP", "I-NP", "O", "B-NP", "I-NP", "I-NP", "O", "B-NP",
+    "B-NP", "I-NP", "I-NP", "B-VP", "B-NP", "I-NP", "O", "B-NP", "I-NP", "I-NP", "O", "B-NP",
     "I-NP", "O"
   };
 

diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorScorerBenchmark.java
@@ -35,6 +35,7 @@
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.hnsw.RandomVectorScorer;
 import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier;
+import org.apache.lucene.util.hnsw.UpdateableRandomVectorScorer;
 import org.openjdk.jmh.annotations.*;
 
 @BenchmarkMode(Mode.Throughput)
@@ -57,7 +58,7 @@ public class VectorScorerBenchmark {
   IndexInput in;
   KnnVectorValues vectorValues;
   byte[] vec1, vec2;
-  RandomVectorScorer scorer;
+  UpdateableRandomVectorScorer scorer;
 
   @Setup(Level.Iteration)
   public void init() throws IOException {
@@ -76,7 +77,8 @@ public void init() throws IOException {
     scorer =
         FlatVectorScorerUtil.getLucene99FlatVectorsScorer()
             .getRandomVectorScorerSupplier(DOT_PRODUCT, vectorValues)
-            .scorer(0);
+            .scorer();
+    scorer.setScoringOrdinal(0);
   }
 
   @TearDown