diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java index 3a87fe40733..fdd6242abcc 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java @@ -516,7 +516,7 @@ public static Automaton union(Collection list) { result.finishState(); - return removeDeadStates(result); + return mergeAcceptStatesWithNoTransition(removeDeadStates(result)); } // Simple custom ArrayList @@ -1060,6 +1060,82 @@ public static Automaton removeDeadStates(Automaton a) { return result; } + /** + * Merge all accept states that don't have outgoing transitions to a single shared state. This is + * a subset of minimization that is much cheaper. This helper is useful because operations like + * concatenation need to connect accept states of an automaton with the start state of the next + * one, so having fewer accept states makes the produced automata simpler. + */ + static Automaton mergeAcceptStatesWithNoTransition(Automaton a) { + int numStates = a.getNumStates(); + + int numAcceptStatesWithNoTransition = 0; + int[] acceptStatesWithNoTransition = new int[0]; + + BitSet acceptStates = a.getAcceptStates(); + for (int i = 0; i < numStates; ++i) { + if (acceptStates.get(i) && a.getNumTransitions(i) == 0) { + acceptStatesWithNoTransition = + ArrayUtil.grow(acceptStatesWithNoTransition, 1 + numAcceptStatesWithNoTransition); + acceptStatesWithNoTransition[numAcceptStatesWithNoTransition++] = i; + } + } + + if (numAcceptStatesWithNoTransition <= 1) { + // At most one such state, so there is nothing to merge: return the input as-is + return a; + } + + // Trim to the used length; entries are in increasing order, which remap() relies on. + acceptStatesWithNoTransition = + ArrayUtil.copyOfSubArray(acceptStatesWithNoTransition, 0, numAcceptStatesWithNoTransition); + + // Now copy states, preserving accept states. 
+ Automaton result = new Automaton(); + for (int s = 0; s < numStates; s++) { + int remappedS = remap(s, acceptStatesWithNoTransition); + while (result.getNumStates() <= remappedS) { + result.createState(); + } + if (acceptStates.get(s)) { + result.setAccept(remappedS, true); + } + } + + // Now copy transitions, making sure to remap states. + Transition t = new Transition(); + for (int s = 0; s < numStates; ++s) { + int remappedSource = remap(s, acceptStatesWithNoTransition); + int numTransitions = a.initTransition(s, t); + for (int j = 0; j < numTransitions; j++) { + a.getNextTransition(t); + int remappedDest = remap(t.dest, acceptStatesWithNoTransition); + result.addTransition(remappedSource, remappedDest, t.min, t.max); + } + } + + result.finishState(); + return result; + } + + private static int remap(int s, int[] combinedStates) { + int idx = Arrays.binarySearch(combinedStates, s); + if (idx >= 0) { + // This state is part of the states that get combined, remap to the first one. + return combinedStates[0]; + } else { + idx = -1 - idx; + if (idx <= 1) { + // There is either no combined state before the current state, or only the first one, which + // we're preserving: no renumbering needed. + return s; + } else { + // Subtract the number of states before s that get merged into the first combined state. + return s - (idx - 1); + } + } + } + /** * Returns the longest string that is a prefix of all accepted strings and visits each state at * most once. The automaton must not have dead states. 
If this automaton has already been diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java index c6fde722290..7fb1d679946 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java @@ -374,6 +374,54 @@ public void testRepeat() { Operations.determinize(Operations.repeat(aOrAb), Integer.MAX_VALUE))); } + public void testMergeAcceptStatesWithNoTransition() { + Automaton emptyLanguage = Automata.makeEmpty(); + assertSame(emptyLanguage, Operations.mergeAcceptStatesWithNoTransition(emptyLanguage)); + + Automaton a = Automata.makeString("a"); + assertSame(a, Operations.mergeAcceptStatesWithNoTransition(a)); + + // All accept states get combined + Automaton aOrC = new Automaton(); + aOrC.createState(); + aOrC.createState(); + aOrC.createState(); + aOrC.addTransition(0, 1, 'a'); + aOrC.setAccept(1, true); + aOrC.addTransition(0, 2, 'c'); + aOrC.setAccept(2, true); + Automaton aOrCSingleAcceptState = Operations.mergeAcceptStatesWithNoTransition(aOrC); + assertEquals(1, aOrCSingleAcceptState.getAcceptStates().cardinality()); + assertTrue(AutomatonTestUtil.sameLanguage(aOrC, aOrCSingleAcceptState)); + + // Two accept states get combined, but not the 3rd one since it has an outgoing transition + Automaton aOrCOrXStar = new Automaton(); + aOrCOrXStar.createState(); + aOrCOrXStar.createState(); + aOrCOrXStar.createState(); + aOrCOrXStar.createState(); + aOrCOrXStar.addTransition(0, 1, 'a'); + aOrCOrXStar.setAccept(1, true); + aOrCOrXStar.addTransition(0, 2, 'c'); + aOrCOrXStar.setAccept(2, true); + aOrCOrXStar.addTransition(0, 3, 'x'); + aOrCOrXStar.addTransition(3, 3, 'x'); + aOrCOrXStar.setAccept(3, true); + Automaton aOrCOrXStarSingleAcceptState = + Operations.mergeAcceptStatesWithNoTransition(aOrCOrXStar); + assertEquals(2, 
aOrCOrXStarSingleAcceptState.getAcceptStates().cardinality()); + assertTrue(AutomatonTestUtil.sameLanguage(aOrCOrXStar, aOrCOrXStarSingleAcceptState)); + + int iters = atLeast(100); + for (int iter = 0; iter < iters; iter++) { + // sameLanguage requires a deterministic automaton + Automaton expected = + Operations.determinize(AutomatonTestUtil.randomAutomaton(random()), Integer.MAX_VALUE); + Automaton actual = Operations.mergeAcceptStatesWithNoTransition(expected); + assertTrue(AutomatonTestUtil.sameLanguage(expected, actual)); + } + } + public void testDuelRepeat() { final int iters = atLeast(1_000); for (int iter = 0; iter < iters; ++iter) {