Skip to content

Commit

Permalink
Fix Operations.reverse() to not add non-deterministic dead states (#1…
Browse files Browse the repository at this point in the history
…4212)

Operations.reverse() can create dead states, but ones that have
non-determinism, which is worse than just creating dead states since it
causes Automaton.isDeterministic() to return false, e.g. treated as NFA.
This can lead to unnecessary det() calls expecially if automaton gets
bigger or more complex.

Operations.reverse() serves multiple use-cases today:
* Search engine use-cases trying to speed up leading wildcards
* Testing/academic use-case (Brzozowski minimize)

In the search engine use-case, it is used by both Lucene and Solr.

Lucene uses this method for infinite
automata (e.g. leading wildcard) to compute a common suffix. if the
expression has one (e.g. "*foo"), then we'll need to evaluate many
candidates: so we reverse the automaton as part of computing the common
suffix. Then memcmp can be used to filter out candidates quickly.

Solr uses this method, where users can opt-in to also indexing the
reversed form of every term, with a special marker to prevent
false-positives from the extra reversed terms. At query-time, the
reversed wildcard queries can be turned into something that looks more
like a prefix query: https://github.com/apache/solr/blob/bca4cd630b9cff66ecc0431397a99f5289a6462b/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java#L1291-L1324

Move Operations.reverse(Automaton, Set) to AutomatonTestUtil, since it
is too difficult to improve while also supporting this hook.

Fix Operations.reverse(Automaton) to remove dead states.
  • Loading branch information
rmuir authored Feb 10, 2025
1 parent fe50684 commit ad7ff1f
Show file tree
Hide file tree
Showing 4 changed files with 65 additions and 43 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -129,8 +129,14 @@ public Transition[][] getSortedTransitions() {
return transitions;
}

/** Returns accept states. If the bit is set then that state is an accept state. */
BitSet getAcceptStates() {
/**
* Returns accept states. If the bit is set then that state is an accept state.
*
* <p>expert: Use {@link #isAccept(int)} instead, unless you really need to scan bits.
*
* @lucene.internal This method signature may change in the future
*/
public BitSet getAcceptStates() {
return isAccept;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.internal.hppc.BitMixer;
import org.apache.lucene.internal.hppc.IntCursor;
import org.apache.lucene.internal.hppc.IntHashSet;
Expand Down Expand Up @@ -979,7 +978,7 @@ private static BitSet getLiveStatesFromInitial(Automaton a) {
private static BitSet getLiveStatesToAccept(Automaton a) {
Automaton.Builder builder = new Automaton.Builder();

// NOTE: not quite the same thing as what SpecialOperations.reverse does:
// NOTE: not quite the same thing as what reverse() does:
Transition t = new Transition();
int numStates = a.getNumStates();
for (int s = 0; s < numStates; s++) {
Expand Down Expand Up @@ -1187,7 +1186,7 @@ public static IntsRef getSingleton(Automaton a) {
*/
public static BytesRef getCommonSuffixBytesRef(Automaton a) {
// reverse the language of the automaton, then reverse its common prefix.
Automaton r = removeDeadStates(reverse(a));
Automaton r = reverse(a);
BytesRef ref = getCommonPrefixBytesRef(r);
reverseBytes(ref);
return ref;
Expand All @@ -1205,12 +1204,6 @@ private static void reverseBytes(BytesRef ref) {

/** Returns an automaton accepting the reverse language. */
public static Automaton reverse(Automaton a) {
return reverse(a, null);
}

/** Reverses the automaton, returning the new initial states. */
public static Automaton reverse(Automaton a, Set<Integer> initialStates) {

if (Operations.isEmpty(a)) {
return new Automaton();
}
Expand Down Expand Up @@ -1246,15 +1239,12 @@ public static Automaton reverse(Automaton a, Set<Integer> initialStates) {
BitSet acceptStates = a.getAcceptStates();
while (s < numStates && (s = acceptStates.nextSetBit(s)) != -1) {
result.addEpsilon(0, s + 1);
if (initialStates != null) {
initialStates.add(s + 1);
}
s++;
}

result.finishState();

return result;
return removeDeadStates(result);
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -143,32 +143,6 @@ public void testCommonPrefixTrailingKleenStar() throws Exception {
assertEquals("boo", Operations.getCommonPrefix(a));
}

public void testCommonPrefixDeadStates() throws Exception {
Automaton a =
Operations.concatenate(List.of(Automata.makeAnyString(), Automata.makeString("boo")));
// reverse it twice, to create some dead states
// TODO: is it possible to fix reverse() to not create dead states?!
Automaton withDeadStates = Operations.reverse(Operations.reverse(a));
IllegalArgumentException expected =
expectThrows(
IllegalArgumentException.class,
() -> {
Operations.getCommonPrefix(withDeadStates);
});
assertEquals("input automaton has dead states", expected.getMessage());
}

public void testCommonPrefixRemoveDeadStates() throws Exception {
Automaton a =
Operations.concatenate(List.of(Automata.makeAnyString(), Automata.makeString("boo")));
// reverse it twice, to create some dead states
// TODO: is it possible to fix reverse() to not create dead states?!
Automaton withDeadStates = Operations.reverse(Operations.reverse(a));
// now remove the deadstates
Automaton withoutDeadStates = Operations.removeDeadStates(withDeadStates);
assertEquals("", Operations.getCommonPrefix(withoutDeadStates));
}

public void testCommonPrefixOptional() throws Exception {
Automaton a = new Automaton();
int init = a.createState();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -329,12 +329,64 @@ public static Automaton randomAutomaton(Random random) {
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/**
* Original brics implementation of reverse(). It tries to satisfy multiple use-cases by
* populating a set of initial states too.
*/
public static Automaton reverseOriginal(Automaton a, Set<Integer> initialStates) {

if (Operations.isEmpty(a)) {
return new Automaton();
}

int numStates = a.getNumStates();

// Build a new automaton with all edges reversed
Automaton.Builder builder = new Automaton.Builder();

// Initial node; we'll add epsilon transitions in the end:
builder.createState();

for (int s = 0; s < numStates; s++) {
builder.createState();
}

// Old initial state becomes new accept state:
builder.setAccept(1, true);

Transition t = new Transition();
for (int s = 0; s < numStates; s++) {
int numTransitions = a.getNumTransitions(s);
a.initTransition(s, t);
for (int i = 0; i < numTransitions; i++) {
a.getNextTransition(t);
builder.addTransition(t.dest + 1, s + 1, t.min, t.max);
}
}

Automaton result = builder.finish();

int s = 0;
BitSet acceptStates = a.getAcceptStates();
while (s < numStates && (s = acceptStates.nextSetBit(s)) != -1) {
result.addEpsilon(0, s + 1);
if (initialStates != null) {
initialStates.add(s + 1);
}
s++;
}

result.finishState();

return result;
}

/** Simple, original brics implementation of Brzozowski minimize() */
public static Automaton minimizeSimple(Automaton a) {
Set<Integer> initialSet = new HashSet<Integer>();
a = determinizeSimple(Operations.reverse(a, initialSet), initialSet);
a = determinizeSimple(reverseOriginal(a, initialSet), initialSet);
initialSet.clear();
a = determinizeSimple(Operations.reverse(a, initialSet), initialSet);
a = determinizeSimple(reverseOriginal(a, initialSet), initialSet);
return a;
}

Expand Down

0 comments on commit ad7ff1f

Please sign in to comment.