Skip to content

Commit

Permalink
Solr 9 Update (#221)
Browse files Browse the repository at this point in the history
* Don't publish 10k test suite data yet

* Add Jython fork as a submodule

I needed to patch an issue in PyUnicode so I made a new fork; will remove if/when it gets upstreamed.

* Ignore .DS_Store files

* Add Jython submodule

* Include Jython Gradle project in Montysolr

* Add test for failing name parser test case

* Handle fully unparsable inputs in the Python code

Prior to this the code would error out because it expected at least 1 part. In cases where there are additional parentheses around the string (it happens), and other miscellaneous inputs, there can be 0 parts instead.

* Handle unparsable author names in the Java code

This causes the unparsed author name to pass through the system. In previous versions there would be (incorrect) null characters added to the output of this pass if the author name couldn't be parsed.

* Use NFKC normalization for author names

This normalization pass helps to consolidate the Unicode code points in the string prior to other passes. Without this step some important parts of certain code points can be eliminated, resulting in mangled output.

* Add test case for parenthesized author names

* Allow `start` in position queries to be >= 0

* Add 10k test suite test case

* Un-ignore the dataset

* Add the query dataset

* Initial Solr 9 commit

Most tests are passing at this point, but I need to merge in the name normalization changes to fix issues caused by the Jython Unicode split bug.

* Fix final round of Solr 9 test failures

* Upgrade to Java 17 to match Solr 9

* Match Lucene version to Solr dependency version

* Update luceneMatchVersion to fix warning

We were getting warnings about deprecated behavior at launch

* Fix locale issue in Jython with JDK 17

* Add test case to match adshelp email

* Remove lib directives that match modules

* Add test case for replication URL handling
  • Loading branch information
JCRPaquin authored Dec 5, 2024
1 parent 78dde23 commit c6a9f9b
Show file tree
Hide file tree
Showing 126 changed files with 861 additions and 526 deletions.
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
.DS_Store
*.class
*.pyc
*.log
Expand Down
21 changes: 6 additions & 15 deletions deploy/adsabs/server/solr/collection1/conf/solrconfig.xml
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,10 @@

<config>

<luceneMatchVersion>6.0</luceneMatchVersion>
<luceneMatchVersion>9.0</luceneMatchVersion>

<lib dir="../lib" />

<lib dir="../../../dist/" regex="apache-solr-cell-\d.*\.jar" />
<lib dir="../../../contrib/extraction/lib" regex=".*\.jar" />

<lib dir="../../../dist/" regex="apache-solr-clustering-\d.*\.jar" />
<lib dir="../../../contrib/clustering/lib/" regex=".*\.jar" />

<lib dir="../../../dist/" regex="apache-solr-langid-\d.*\.jar" />
<lib dir="../../../contrib/langid/lib/" regex=".*\.jar" />

<lib dir="../../../dist/" regex="apache-solr-velocity-\d.*\.jar" />
<lib dir="../../../contrib/velocity/lib" regex=".*\.jar" />

Expand Down Expand Up @@ -45,7 +36,7 @@
make sure that only one writer is modifying it; and other
instances are using 'single' lockType -->

<lockType>${montysolr.locktype:native}</lockType>
<lockType>${montysolr.locktype:single}</lockType>


<deletionPolicy class="solr.SolrDeletionPolicy">
Expand Down Expand Up @@ -124,20 +115,20 @@

<slowQueryThresholdMillis>1000</slowQueryThresholdMillis>

<filterCache class="solr.FastLRUCache"
<filterCache class="solr.CaffeineCache"
size="${solr.filterCache.size:512}"
initialSize="${solr.filterCache.initialSize:512}"
autowarmCount="${solr.filterCache.autowarmCount:128}"
/>

<queryResultCache class="solr.LRUCache"
<queryResultCache class="solr.CaffeineCache"
size="${solr.queryResultCache.size:512}"
initialSize="${solr.queryResultCache.initialSize:512}"
autowarmCount="${solr.queryResultCache.autowarmCount:128}"
maxRamMB="${solr.queryResultCache.maxRamMB:128}"
/>

<documentCache class="solr.LRUCache"
<documentCache class="solr.CaffeineCache"
size="${solr.documentCache.size:512}"
initialSize="${solr.documentCache.initialSize:512}"
autowarmCount="${solr.documentCache.autowarmCount:128}"
Expand Down Expand Up @@ -679,7 +670,7 @@

<queryResponseWriter name="json" class="solr.JSONResponseWriter" />

<queryResponseWriter name="xslt" class="solr.XSLTResponseWriter">
<queryResponseWriter name="xslt" class="solr.scripting.xslt.XSLTResponseWriter">
<int name="xsltCacheLifetimeSeconds">5</int>
</queryResponseWriter>

Expand Down
2 changes: 1 addition & 1 deletion jython
21 changes: 15 additions & 6 deletions montysolr/build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -17,24 +17,33 @@ dependencies {
antlr("org.antlr:antlr:3.5.2")

implementation("org.antlr:antlr-runtime:3.5.2")
implementation("org.apache.solr:solr-core:7.7.3")
implementation("org.apache.lucene:lucene-core:7.7.3")
implementation("org.apache.lucene:lucene-queryparser:7.7.3")
implementation("org.apache.solr:solr-core:9.6.1")
implementation("org.apache.lucene:lucene-core:9.10.0")
implementation("org.apache.lucene:lucene-queryparser:9.10.0")
implementation("org.apache.lucene:lucene-join:9.10.0")
implementation("org.apache.lucene:lucene-misc:9.10.0")
implementation("org.apache.lucene:lucene-suggest:9.10.0")
implementation("org.apache.commons:commons-lang3:3.14.0")

implementation("com.google.guava:guava:33.2.1-jre")
implementation("com.anyascii:anyascii:0.3.2")
//implementation("org.python:jython-standalone:2.7.3")
implementation(project(":jython"))

testImplementation("junit:junit:4.13.2")
testImplementation("org.antlr:stringtemplate:3.2.1")
testImplementation("org.apache.solr:solr-test-framework:7.7.3")
testImplementation("org.apache.lucene:lucene-test-framework:7.7.3")
testImplementation("org.apache.solr:solr-test-framework:9.6.1")
testImplementation("org.apache.solr:solr-scripting:9.6.1")
testImplementation("org.apache.lucene:lucene-test-framework:9.10.0")
testImplementation("org.apache.lucene:lucene-backward-codecs:9.10.0")
testImplementation("org.apache.lucene:lucene-codecs:9.10.0")
testImplementation("com.univocity:univocity-parsers:2.9.1")
testImplementation("commons-io:commons-io:2.16.1")
}

java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(11))
languageVersion.set(JavaLanguageVersion.of(17))
}
}

Expand Down
7 changes: 5 additions & 2 deletions montysolr/src/main/java/org/adsabs/InvenioBitSet.java
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
package org.adsabs;

import org.apache.solr.common.util.Base64;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.BitSet;
import java.util.zip.DataFormatException;
import java.util.zip.DeflaterOutputStream;
Expand Down Expand Up @@ -114,7 +116,8 @@ public static String getHexString(byte[] b) throws Exception {
*/
public String toBase64() throws IOException {
byte[] data = this.fastDump();
return Base64.byteArrayToBase64(data, 0, data.length);
return new String(Base64.getEncoder().encode(ByteBuffer.wrap(data, 0, data.length)).array(),
StandardCharsets.ISO_8859_1);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,10 @@

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.util.ResourceLoader;
import org.apache.lucene.util.ResourceLoaderAware;
import org.apache.lucene.analysis.TokenFilterFactory;

import java.io.IOException;
import java.util.Map;
Expand Down Expand Up @@ -71,7 +72,7 @@ public void inform(ResourceLoader loader) throws IOException {
if (null != format) {
throw new IllegalArgumentException("'format' can not be specified w/o an explicit 'words' file: " + format);
}
stopWords = new CharArraySet(StopAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase);
stopWords = new CharArraySet(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,8 @@
*/

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.AbstractAnalysisFactory;
import org.apache.lucene.analysis.TokenFilterFactory;

import java.util.Map;

Expand All @@ -34,7 +33,7 @@
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*/
public class SelectiveLowerCaseFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
public class SelectiveLowerCaseFilterFactory extends TokenFilterFactory {
public SelectiveLowerCaseFilterFactory(Map<String, String> args) {
super(args);
if (!args.isEmpty()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,8 @@
*/

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.AbstractAnalysisFactory;
import org.apache.lucene.analysis.TokenFilterFactory;

import java.util.Map;

Expand All @@ -34,7 +33,7 @@
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*/
public class AdsSpecialCharactersFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
public class AdsSpecialCharactersFilterFactory extends TokenFilterFactory {
public AdsSpecialCharactersFilterFactory(Map<String, String> args) {
super(args);
if (!args.isEmpty()) {
Expand All @@ -45,10 +44,5 @@ public AdsSpecialCharactersFilterFactory(Map<String, String> args) {
public AdsSpecialCharactersFilter create(TokenStream input) {
return new AdsSpecialCharactersFilter(input);
}

@Override
public AbstractAnalysisFactory getMultiTermComponent() {
return this;
}
}

Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.util.ResourceLoader;
import org.apache.lucene.util.ResourceLoaderAware;
import org.apache.lucene.analysis.TokenFilterFactory;

import java.io.IOException;
import java.util.*;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ public final class SimplePatternSplitTokenizer extends Tokenizer {
* See {@link RegExp} for the accepted syntax.
*/
public SimplePatternSplitTokenizer(String regexp) {
this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, regexp, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, regexp, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
}

/**
Expand Down Expand Up @@ -95,7 +95,7 @@ public SimplePatternSplitTokenizer(AttributeFactory factory, Automaton dfa) {
throw new IllegalArgumentException("please determinize the incoming automaton first");
}

runDFA = new CharacterRunAutomaton(dfa, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
runDFA = new CharacterRunAutomaton(dfa, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
}

private void fillToken(int offsetStart) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,14 @@

package org.apache.lucene.analysis.pattern;

import org.apache.lucene.analysis.util.TokenizerFactory;
import com.google.j2objc.annotations.UsedByReflection;
import org.apache.lucene.analysis.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.RegExp;

import java.util.Collections;
import java.util.Map;

/**
Expand Down Expand Up @@ -56,16 +58,21 @@
* @see SimplePatternSplitTokenizer
*/
public class SimplePatternSplitTokenizerFactory extends TokenizerFactory {
public static final String NAME = "simplePatternSplit";
public static final String PATTERN = "pattern";
private final Automaton dfa;
private final int maxDeterminizedStates;

public SimplePatternSplitTokenizerFactory() {
this(Collections.emptyMap());
}

/**
* Creates a new SimpleSplitPatternTokenizerFactory
*/
public SimplePatternSplitTokenizerFactory(Map<String, String> args) {
super(args);
maxDeterminizedStates = getInt(args, "maxDeterminizedStates", Operations.DEFAULT_MAX_DETERMINIZED_STATES);
maxDeterminizedStates = getInt(args, "maxDeterminizedStates", Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
dfa = Operations.determinize(new RegExp(require(args, PATTERN)).toAutomaton(), maxDeterminizedStates);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.ResourceLoader;
import org.apache.lucene.util.ResourceLoaderAware;
import org.apache.lucene.analysis.TokenFilterFactory;
import org.apache.lucene.analysis.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.CharsRef;

Expand Down Expand Up @@ -150,6 +150,7 @@ public SynonymMap create(ResourceLoader loader) throws IOException, ParseExcepti

protected Analyzer getAnalyzer(ResourceLoader loader) throws IOException {
final boolean ignoreCase = getBoolean(args, "ignoreCase", false);
args.put("ignoreCase", ignoreCase ? "true" : "false");

String tf = args.get("tokenizerFactory");

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.automaton.LevenshteinAutomata;

import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
Expand Down Expand Up @@ -488,7 +489,7 @@ public float getFuzzyMinSim() {
ConfigurationKeys.FUZZY_CONFIG);

if (fuzzyConfig == null) {
return FuzzyQuery.defaultMinSimilarity;
return LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE;
} else {
return fuzzyConfig.getMinSimilarity();
}
Expand Down
Loading

0 comments on commit c6a9f9b

Please sign in to comment.