Skip to content

Commit

Permalink
Repeatable hunspell tests (#14246)
Browse files Browse the repository at this point in the history
  • Loading branch information
dweiss committed Feb 16, 2025
1 parent 24057d8 commit 8ed8524
Show file tree
Hide file tree
Showing 5 changed files with 136 additions and 37 deletions.
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: "Run checks: module lucene/analysis/common"
name: "Run checks: module lucene/analysis/common (hunspell)"

on:
workflow_dispatch:
Expand All @@ -24,7 +24,7 @@ env:

jobs:
test:
name: Extra regression tests
name: Extra Hunspell regression tests
timeout-minutes: 15

runs-on: ubuntu-latest
Expand All @@ -33,5 +33,5 @@ jobs:
- uses: actions/checkout@v4
- uses: ./.github/actions/prepare-for-build

- name: Run 'gradlew lucene/analysis/common check testRegressions'
run: ./gradlew -p lucene/analysis/common check testRegressions
- name: Run Hunspell regression tests
run: ./gradlew -p lucene/analysis/common -Ptests.hunspell.regressions=true -Ptests.verbose=true test --tests "TestAllDictionaries"
32 changes: 32 additions & 0 deletions .github/workflows/run-scheduled-hunspell.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
name: "Run scheduled checks: Hunspell tests against latest dictionaries"

on:
workflow_dispatch:

schedule:
# 04:13 UTC on Mondays
- cron: '13 4 * * 1'

env:
DEVELOCITY_ACCESS_KEY: ${{ secrets.DEVELOCITY_ACCESS_KEY }}

jobs:
test:
name: Hunspell regression tests against latest dictionaries
timeout-minutes: 15

runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v4
- uses: ./.github/actions/prepare-for-build

- name: Run Hunspell regression tests against latest commits in dictionary repositories
run: >
./gradlew -p lucene/analysis/common
-Ptests.hunspell.regressions=true
-Ptests.verbose=true
-Ptests.hunspell.libreoffice.ref=master
-Ptests.hunspell.woorm.ref=main
test
--tests "TestAllDictionaries"
121 changes: 88 additions & 33 deletions lucene/analysis/common/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
* limitations under the License.
*/

import java.nio.file.Files;

apply plugin: 'java-library'

description = 'Analyzers for indexing content in different languages and domains'
Expand All @@ -24,51 +26,104 @@ dependencies {
moduleTestImplementation project(':lucene:test-framework')
}

// Fetch the data and enable regression tests against woorm/ libreoffice dictionaries.
task checkoutHunspellRegressionRepos() {
ext {
checkoutDir = file("${buildDir}/hunspell-regressions")
}
// Enable Hunspell tests against the LibreOffice and Woorm dictionaries. We pull
// these dictionaries dynamically from each project's git repository. To keep
// things consistent across pull requests and re-runs, we pin a fixed git commit
// for each project (see https://github.com/apache/lucene/issues/14235), while
// a periodic workflow runs against the latest commit on each project's
// development branch.

// A gradle property with the parent directory for all git clones for each project.
def cloneDirProperty = project.layout.buildDirectory.dir("hunspell-regressions")

outputs.dir checkoutDir
doFirst {
// Clone the repositories we need if they don't exist.
// The list of dictionary projects to pull and check against, each with the
// full commit hash it is pinned to. These hashes should be updated from time
// to time to match the head of the branch noted next to each reference.
def dictionaryProjects = [
[
"libreoffice": "https://github.com/LibreOffice/dictionaries",
"woorm": "https://github.com/wooorm/dictionaries"
].each { name, repo ->
if (!file("${checkoutDir}/${name}").exists()) {
checkoutDir.mkdirs()
// This will work only if git is available, but I assume it is.
project.exec {
"name": "libreoffice",
"url": "https://github.com/LibreOffice/dictionaries",
"ref": "762abe74008b94b2ff06db6f4024b59a8254c467" // head: master
],
[
"name": "woorm",
"url": "https://github.com/wooorm/dictionaries",
"ref": "8cfea406b505e4d7df52d5a19bce525df98c54ab" // head: main
]
]

// Holder interface used to obtain an injected ExecOperations service: the cloning
// task instantiates it with objects.newInstance(...) and launches git through the
// returned service. We need this for the new gradle configuration cache.
interface ExecOperationsProvider {
@javax.inject.Inject
ExecOperations getExecOperations()
}

// Generate a set of tasks cloning the git repository of each project
// and setting it to the given reference (hash or named).
def cloningTasks = dictionaryProjects.collect { repoSpec ->
def cloningTask = tasks.register(repoSpec.name + "CloneAndSetRef", {task ->
def targetDir = cloneDirProperty.map { dir -> dir.dir(repoSpec.name) }

// The repository reference should be taken from a gradle property or default to the stable
// reference above.
def ref = providers.gradleProperty("tests.hunspell." + repoSpec.name + ".ref").orElse(repoSpec.ref)

// register task outputs.
outputs.dir(targetDir)

// register task inputs; we care about the url and ref.
inputs.property("ref", ref)
inputs.properties(repoSpec)

doFirst {
def execOps = objects.newInstance(ExecOperationsProvider).execOperations

def logger = task.logger
def dotGitPath = targetDir.get().asFile.toPath().toAbsolutePath().resolve(".git")
def gitExec = (List<String> cmdArgs) -> {
logger.lifecycle("Executing git " + cmdArgs.join(" "))
execOps.exec {
executable "git"
ignoreExitValue false
workingDir checkoutDir
args = [ "clone", "--depth=1", repo, name ]
workingDir dotGitPath.getParent().toString()
args = cmdArgs

// An explicit GIT_DIR to prevent .git upward scanning if something goes wrong.
environment("GIT_DIR", dotGitPath.toString())
}
}
}
}
}

task testRegressions(type: Test) {
group "Verification"
description "Run Hunspell regression tests against Woorm/ LibreOffice git repositories."

dependsOn checkoutHunspellRegressionRepos

failFast = true
include "**/TestAllDictionaries*"
// If the target doesn't exist yet, create an empty repository and set
// the remote URL (origin), but don't fetch anything at this point.
if (!Files.exists(dotGitPath)) {
Files.createDirectories(dotGitPath.getParent())
gitExec(["init", "--initial-branch", "irrelevant"])
gitExec(["config", "advice.detachedHead", "false"])
gitExec(["remote", "add", "origin", repoSpec.url])
}

systemProperty "hunspell.dictionaries", checkoutHunspellRegressionRepos.checkoutDir
// Fetch just the revision we're interested in.
// This avoids downloading the entire repo.
gitExec(["fetch", "--depth", "1", "origin", ref.get()])
gitExec(["checkout", "FETCH_HEAD"])
}
})
return cloningTask
}

doFirst {
logger.lifecycle("Running Hunspell regression tests...")
// Check if we should run tests with hunspell dictionaries. The flag is read from
// the "tests.hunspell.regressions" property, with "false" passed as the fallback.
def withDictionaries = Boolean.parseBoolean(propertyOrDefault("tests.hunspell.regressions", "false"))
if (withDictionaries) {
tasks.withType(Test).configureEach {
// Every test task depends on the git cloning tasks registered above.
dependsOn cloningTasks
// Register the parent directory of all clones as a task input.
inputs.dir cloneDirProperty
// Point the tests at the clone directory (as an absolute path).
systemProperty "hunspell.dictionaries", cloneDirProperty.map { dir -> dir.asFile.absolutePath }.get()
}
}

// Pass all hunspell-tests-specific project properties to tests as system properties.
tasks.withType(Test) {
// Pass all hunspell-tests-specific project properties to tests as system properties, if they're specified.
tasks.withType(Test).configureEach {
[
"hunspell.dictionaries",
"hunspell.corpora",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,10 @@
* property and prints their memory usage. All *.aff files are traversed recursively inside the
* given directory. Each *.aff file must have a same-named sibling *.dic file. For examples of such
* directories, refer to the {@link org.apache.lucene.analysis.hunspell package documentation}.
*
* <p>The build contains tasks that automatically download certain public dictionary files and test
* against them. Take a look at the GitHub workflows under {@code .github/workflows} to see how
* these tests are launched if you'd like to run them locally.
*/
@SuppressSysoutChecks(bugUrl = "prints important memory utilization stats per dictionary")
public class TestAllDictionaries extends LuceneTestCase {
Expand Down
8 changes: 8 additions & 0 deletions lucene/core/src/java/module-info.java
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,14 @@
// Open certain packages for the test framework (ram usage tester).
opens org.apache.lucene.document to
org.apache.lucene.test_framework;
opens org.apache.lucene.util.fst to
org.apache.lucene.test_framework;
opens org.apache.lucene.store to
org.apache.lucene.test_framework;
opens org.apache.lucene.util.automaton to
org.apache.lucene.test_framework;
opens org.apache.lucene.util to
org.apache.lucene.test_framework;

exports org.apache.lucene.util.quantization;
exports org.apache.lucene.codecs.hnsw;
Expand Down

0 comments on commit 8ed8524

Please sign in to comment.