From fe42efc59188c4fad4c299faf2e2df5b157c8007 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Thu, 6 Feb 2025 08:59:47 -0500 Subject: [PATCH] Add Automata.makeCharSet/makeCharClass to optimize regexp (#14193) Add Automata.makeCharSet(int[])/makeCharClass(int[],int[]) to optimize regexp. * Add new "character class" node, which was previously composed by union of many nodes. * Remove "predefined class" node, which previously built an internal separate regex on the fly, it is just another character class. * RegExp no longer uses union() internally, except for union (|) operator. * format codepoints in the internal parse tree output with U+%04X * Fix concatenate to remove the dead states it creates, just like intersection/union/etc do * fix dead-states-test to explicitly create dead states, rather than relying on some function to create a mess. it doesn't anymore. --- .../lucene/util/automaton/Automata.java | 27 ++ .../lucene/util/automaton/Operations.java | 3 +- .../apache/lucene/util/automaton/RegExp.java | 286 ++++++++++++------ .../lucene/util/automaton/TestAutomaton.java | 47 ++- .../util/automaton/TestRegExpParsing.java | 70 ++++- 5 files changed, 329 insertions(+), 104 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java index 9ecf748418f6..9b6198fb04c5 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java @@ -32,6 +32,7 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Collection; +import java.util.Objects; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefIterator; import org.apache.lucene.util.StringHelper; @@ -140,6 +141,32 @@ public static Automaton makeCharRange(int min, int max) { return a; } + /** Returns a new minimal automaton that accepts any of the provided codepoints */ + public static 
Automaton makeCharSet(int[] codepoints) { + return makeCharClass(codepoints, codepoints); + } + + /** Returns a new minimal automaton that accepts any of the codepoint ranges */ + public static Automaton makeCharClass(int[] starts, int[] ends) { + Objects.requireNonNull(starts); + Objects.requireNonNull(ends); + if (starts.length != ends.length) { + throw new IllegalArgumentException("starts must match ends"); + } + if (starts.length == 0) { + return makeEmpty(); + } + Automaton a = new Automaton(); + int s1 = a.createState(); + int s2 = a.createState(); + a.setAccept(s2, true); + for (int i = 0; i < starts.length; i++) { + a.addTransition(s1, s2, starts[i], ends[i]); + } + a.finishState(); + return a; + } + /** * Constructs sub-automaton corresponding to decimal numbers of length x.substring(n).length(). */ diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java index 7c2b164aa107..fb255f8a02e4 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java @@ -148,8 +148,7 @@ public static Automaton concatenate(List l) { } result.finishState(); - - return result; + return Operations.removeDeadStates(result); } /** diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java b/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java index 92bfe41b462b..aaf50c8c1802 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java @@ -34,6 +34,7 @@ import java.util.HashSet; import java.util.Iterator; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Set; import java.util.function.BooleanSupplier; @@ -374,6 +375,8 @@ public enum Kind { REGEXP_CHAR, /** A Character range */ REGEXP_CHAR_RANGE, + /** A Character class (list of 
ranges) */ + REGEXP_CHAR_CLASS, /** Any Character allowed */ REGEXP_ANYCHAR, /** An empty expression */ @@ -386,8 +389,6 @@ public enum Kind { REGEXP_AUTOMATON, /** An Interval expression */ REGEXP_INTERVAL, - /** An expression for a pre-defined class e.g. \w */ - REGEXP_PRE_CLASS, /** * The complement of an expression. * @@ -453,7 +454,7 @@ public enum Kind { public final int min, max, digits; /** Extents for range type expressions */ - public final int from, to; + public final int from[], to[]; // Parser variables private final String originalString; @@ -528,8 +529,8 @@ public RegExp(String s, int syntax_flags, int match_flags) throws IllegalArgumen int min, int max, int digits, - int from, - int to) { + int from[], + int to[]) { this.originalString = null; this.kind = kind; this.flags = flags; @@ -546,17 +547,17 @@ public RegExp(String s, int syntax_flags, int match_flags) throws IllegalArgumen // Simplified construction of container nodes static RegExp newContainerNode(int flags, Kind kind, RegExp exp1, RegExp exp2) { - return new RegExp(flags, kind, exp1, exp2, null, 0, 0, 0, 0, 0, 0); + return new RegExp(flags, kind, exp1, exp2, null, 0, 0, 0, 0, null, null); } // Simplified construction of repeating nodes static RegExp newRepeatingNode(int flags, Kind kind, RegExp exp, int min, int max) { - return new RegExp(flags, kind, exp, null, null, 0, min, max, 0, 0, 0); + return new RegExp(flags, kind, exp, null, null, 0, min, max, 0, null, null); } // Simplified construction of leaf nodes static RegExp newLeafNode( - int flags, Kind kind, String s, int c, int min, int max, int digits, int from, int to) { + int flags, Kind kind, String s, int c, int min, int max, int digits, int from[], int to[]) { return new RegExp(flags, kind, null, null, s, c, min, max, digits, from, to); } @@ -598,10 +599,6 @@ private Automaton toAutomaton( List list; Automaton a = null; switch (kind) { - case REGEXP_PRE_CLASS: - RegExp expanded = expandPredefined(); - a = 
expanded.toAutomaton(automata, automaton_provider); - break; case REGEXP_UNION: list = new ArrayList<>(); findLeaves(exp1, Kind.REGEXP_UNION, list, automata, automaton_provider); @@ -648,13 +645,16 @@ private Automaton toAutomaton( break; case REGEXP_CHAR: if (check(ASCII_CASE_INSENSITIVE)) { - a = toCaseInsensitiveChar(c); + a = Automata.makeCharSet(toCaseInsensitiveChar(c)); } else { a = Automata.makeChar(c); } break; case REGEXP_CHAR_RANGE: - a = Automata.makeCharRange(from, to); + a = Automata.makeCharRange(from[0], to[0]); + break; + case REGEXP_CHAR_CLASS: + a = Automata.makeCharClass(from, to); break; case REGEXP_ANYCHAR: a = Automata.makeAnyChar(); @@ -696,23 +696,20 @@ private Automaton toAutomaton( return a; } - private Automaton toCaseInsensitiveChar(int codepoint) { - Automaton case1 = Automata.makeChar(codepoint); + private int[] toCaseInsensitiveChar(int codepoint) { // For now we only work with ASCII characters if (codepoint > 128) { - return case1; + return new int[] {codepoint}; } int altCase = Character.isLowerCase(codepoint) ? 
Character.toUpperCase(codepoint) : Character.toLowerCase(codepoint); - Automaton result; if (altCase != codepoint) { - result = Operations.union(case1, Automata.makeChar(altCase)); + return new int[] {codepoint, altCase}; } else { - result = case1; + return new int[] {codepoint}; } - return result; } private Automaton toCaseInsensitiveString() { @@ -720,7 +717,8 @@ private Automaton toCaseInsensitiveString() { Iterator iter = s.codePoints().iterator(); while (iter.hasNext()) { - list.add(toCaseInsensitiveChar(iter.next())); + int points[] = toCaseInsensitiveChar(iter.next()); + list.add(Automata.makeCharSet(points)); } return Operations.concatenate(list); } @@ -802,7 +800,19 @@ void toStringBuilder(StringBuilder b) { b.append("\\").appendCodePoint(c); break; case REGEXP_CHAR_RANGE: - b.append("[\\").appendCodePoint(from).append("-\\").appendCodePoint(to).append("]"); + b.append("[\\").appendCodePoint(from[0]).append("-\\").appendCodePoint(to[0]).append("]"); + break; + case REGEXP_CHAR_CLASS: + b.append("["); + for (int i = 0; i < from.length; i++) { + if (from[i] == to[i]) { + b.append("\\").appendCodePoint(from[i]); + } else { + b.append("\\").appendCodePoint(from[i]); + b.append("-\\").appendCodePoint(to[i]); + } + } + b.append("]"); break; case REGEXP_ANYCHAR: b.append("."); @@ -828,13 +838,10 @@ void toStringBuilder(StringBuilder b) { if (digits > 0) for (int i = s2.length(); i < digits; i++) b.append('0'); b.append(s2).append(">"); break; - case REGEXP_PRE_CLASS: - b.append("\\").appendCodePoint(from); - break; } } - /** Like to string, but more verbose (shows the higherchy more clearly). */ + /** Like to string, but more verbose (shows the hierarchy more clearly). 
*/ public String toStringTree() { StringBuilder b = new StringBuilder(); toStringTree(b, ""); @@ -888,20 +895,22 @@ void toStringTree(StringBuilder b, String indent) { b.appendCodePoint(c); b.append('\n'); break; - case REGEXP_PRE_CLASS: + case REGEXP_CHAR_RANGE: b.append(indent); b.append(kind); - b.append(" class=\\"); - b.appendCodePoint(from); + b.append(" from="); + b.appendCodePoint(from[0]); + b.append(" to="); + b.appendCodePoint(to[0]); b.append('\n'); break; - case REGEXP_CHAR_RANGE: + case REGEXP_CHAR_CLASS: b.append(indent); b.append(kind); - b.append(" from="); - b.appendCodePoint(from); - b.append(" to="); - b.appendCodePoint(to); + b.append(" starts="); + b.append(toHexString(from)); + b.append(" ends="); + b.append(toHexString(to)); b.append('\n'); break; case REGEXP_ANYCHAR: @@ -942,6 +951,20 @@ void toStringTree(StringBuilder b, String indent) { } } + /** prints like [U+002A U+FD72 U+1FFFF] */ + private StringBuilder toHexString(int[] range) { + StringBuilder sb = new StringBuilder(); + sb.append('['); + for (int codepoint : range) { + if (sb.length() > 1) { + sb.append(' '); + } + sb.append(String.format(Locale.ROOT, "U+%04X", codepoint)); + } + sb.append(']'); + return sb; + } + /** Returns set of automaton identifiers that occur in this regular expression. 
*/ public Set getIdentifiers() { HashSet set = new HashSet<>(); @@ -972,9 +995,9 @@ void getIdentifiers(Set set) { case REGEXP_ANYSTRING: case REGEXP_CHAR: case REGEXP_CHAR_RANGE: + case REGEXP_CHAR_CLASS: case REGEXP_EMPTY: case REGEXP_INTERVAL: - case REGEXP_PRE_CLASS: case REGEXP_STRING: default: } @@ -1050,14 +1073,33 @@ static RegExp makeDeprecatedComplement(int flags, RegExp exp) { } static RegExp makeChar(int flags, int c) { - return newLeafNode(flags, Kind.REGEXP_CHAR, null, c, 0, 0, 0, 0, 0); + return newLeafNode(flags, Kind.REGEXP_CHAR, null, c, 0, 0, 0, null, null); } static RegExp makeCharRange(int flags, int from, int to) { if (from > to) throw new IllegalArgumentException( "invalid range: from (" + from + ") cannot be > to (" + to + ")"); - return newLeafNode(flags, Kind.REGEXP_CHAR_RANGE, null, 0, 0, 0, 0, from, to); + return newLeafNode( + flags, Kind.REGEXP_CHAR_RANGE, null, 0, 0, 0, 0, new int[] {from}, new int[] {to}); + } + + static RegExp makeCharClass(int flags, int from[], int to[]) { + if (from.length != to.length) { + throw new IllegalStateException( + String.format( + Locale.ROOT, + "invalid class: from.length (%d) != to.length (%d)", + from.length, + to.length)); + } + for (int i = 0; i < from.length; i++) { + if (from[i] > to[i]) { + throw new IllegalArgumentException( + "invalid range: from (" + from[i] + ") cannot be > to (" + to[i] + ")"); + } + } + return newLeafNode(flags, Kind.REGEXP_CHAR_CLASS, null, 0, 0, 0, 0, from, to); } static RegExp makeAnyChar(int flags) { @@ -1069,7 +1111,7 @@ static RegExp makeEmpty(int flags) { } static RegExp makeString(int flags, String s) { - return newLeafNode(flags, Kind.REGEXP_STRING, s, 0, 0, 0, 0, 0, 0); + return newLeafNode(flags, Kind.REGEXP_STRING, s, 0, 0, 0, 0, null, null); } static RegExp makeAnyString(int flags) { @@ -1077,11 +1119,11 @@ static RegExp makeAnyString(int flags) { } static RegExp makeAutomaton(int flags, String s) { - return newLeafNode(flags, Kind.REGEXP_AUTOMATON, s, 0, 0, 
0, 0, 0, 0); + return newLeafNode(flags, Kind.REGEXP_AUTOMATON, s, 0, 0, 0, 0, null, null); } static RegExp makeInterval(int flags, int min, int max, int digits) { - return newLeafNode(flags, Kind.REGEXP_INTERVAL, null, 0, min, max, digits, 0, 0); + return newLeafNode(flags, Kind.REGEXP_INTERVAL, null, 0, min, max, digits, null, null); } private boolean peek(String s) { @@ -1195,60 +1237,132 @@ final RegExp parseCharClassExp() throws IllegalArgumentException { } final RegExp parseCharClasses() throws IllegalArgumentException { - RegExp e = parseCharClass(); - while (more() && !peek("]")) e = makeUnion(flags, e, parseCharClass()); - return e; - } + ArrayList starts = new ArrayList<>(); + ArrayList ends = new ArrayList<>(); - final RegExp parseCharClass() throws IllegalArgumentException { - RegExp predefinedExp = matchPredefinedCharacterClass(); - if (predefinedExp != null) { - return predefinedExp; - } - - int c = parseCharExp(); - if (match('-')) return makeCharRange(flags, c, parseCharExp()); - else return makeChar(flags, c); - } - - RegExp expandPredefined() { - // See https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html - switch (from) { - case 'd': - return new RegExp("[0-9]"); // digit - case 'D': - return new RegExp("[^0-9]"); // non-digit - case 's': - return new RegExp("[ \t\n\r]"); // whitespace - case 'S': - return new RegExp("[^\\s]"); // non-whitespace - case 'w': - return new RegExp("[a-zA-Z_0-9]"); // word - case 'W': - return new RegExp("[^\\w]"); // non-word - default: - throw new IllegalArgumentException("invalid character class " + from); - } - } - - final RegExp matchPredefinedCharacterClass() { - // See https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html - if (match('\\')) { - if (peek("dDwWsS")) { - return newLeafNode(flags, Kind.REGEXP_PRE_CLASS, null, 0, 0, 0, 0, next(), 0); + do { + // look for escape + if (match('\\')) { + expandPreDefined(starts, ends); + } else { + // parse a character + 
int c = parseCharExp(); + + if (match('-')) { + // range from c-d + starts.add(c); + ends.add(parseCharExp()); + } else if (check(ASCII_CASE_INSENSITIVE)) { + // single case-insensitive character + for (int form : toCaseInsensitiveChar(c)) { + starts.add(form); + ends.add(form); + } + } else { + // single character + starts.add(c); + ends.add(c); + } } + } while (more() && !peek("]")); - if (peek("\\")) { - return makeChar(flags, next()); + // not sure why we bother optimizing nodes, same automaton... + // definitely saves time vs fixing toString()-based tests. + if (starts.size() == 1) { + if (starts.get(0).intValue() == ends.get(0).intValue()) { + return makeChar(flags, starts.get(0)); + } else { + return makeCharRange(flags, starts.get(0), ends.get(0)); } + } else { + return makeCharClass( + flags, + starts.stream().mapToInt(Integer::intValue).toArray(), + ends.stream().mapToInt(Integer::intValue).toArray()); + } + } + void expandPreDefined(List starts, List ends) { + if (peek("\\")) { + // escape + starts.add((int) '\\'); + ends.add((int) '\\'); + next(); + } else if (peek("d")) { + // digit: [0-9] + starts.add((int) '0'); + ends.add((int) '9'); + next(); + } else if (peek("D")) { + // non-digit: [^0-9] + starts.add(Character.MIN_CODE_POINT); + ends.add('0' - 1); + starts.add('9' + 1); + ends.add(Character.MAX_CODE_POINT); + next(); + } else if (peek("s")) { + // whitespace: [\t-\n\r ] + starts.add((int) '\t'); + ends.add((int) '\n'); + starts.add((int) '\r'); + ends.add((int) '\r'); + starts.add((int) ' '); + ends.add((int) ' '); + next(); + } else if (peek("S")) { + // non-whitespace: [^\t-\n\r ] + starts.add(Character.MIN_CODE_POINT); + ends.add('\t' - 1); + starts.add('\n' + 1); + ends.add('\r' - 1); + starts.add('\r' + 1); + ends.add(' ' - 1); + starts.add(' ' + 1); + ends.add(Character.MAX_CODE_POINT); + next(); + } else if (peek("w")) { + // word: [0-9A-Z_a-z] + starts.add((int) '0'); + ends.add((int) '9'); + starts.add((int) 'A'); + ends.add((int) 'Z'); 
+ starts.add((int) '_'); + ends.add((int) '_'); + starts.add((int) 'a'); + ends.add((int) 'z'); + next(); + } else if (peek("W")) { + // non-word: [^0-9A-Z_a-z] + starts.add(Character.MIN_CODE_POINT); + ends.add('0' - 1); + starts.add('9' + 1); + ends.add('A' - 1); + starts.add('Z' + 1); + ends.add('_' - 1); + starts.add('_' + 1); + ends.add('a' - 1); + starts.add('z' + 1); + ends.add(Character.MAX_CODE_POINT); + next(); + } else if (peek("abcefghijklmnopqrtuvxyz") || peek("ABCEFGHIJKLMNOPQRTUVXYZ")) { // From https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html#bs // "It is an error to use a backslash prior to any alphabetic character that does not denote // an escaped // construct;" - if (peek("abcefghijklmnopqrtuvxyz") || peek("ABCEFGHIJKLMNOPQRTUVXYZ")) { - throw new IllegalArgumentException("invalid character class \\" + next()); - } + throw new IllegalArgumentException("invalid character class \\" + next()); + } + } + + final RegExp matchPredefinedCharacterClass() { + // See https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html + if (match('\\') && peek("\\ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz")) { + var starts = new ArrayList(); + var ends = new ArrayList(); + expandPreDefined(starts, ends); + return makeCharClass( + flags, + starts.stream().mapToInt(Integer::intValue).toArray(), + ends.stream().mapToInt(Integer::intValue).toArray()); } return null; diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java index 3c7d6eea198a..e9e35416db4f 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java @@ -667,11 +667,14 @@ public void testConcatenatePreservesDet() throws Exception { } public void testRemoveDeadStates() throws Exception { - Automaton a = - 
Operations.concatenate(Arrays.asList(Automata.makeString("x"), Automata.makeString("y"))); - assertEquals(4, a.getNumStates()); + Automaton a = new Automaton(); + int s1 = a.createState(); + a.createState(); // create dead state + a.setAccept(s1, true); + a.finishState(); + assertEquals(2, a.getNumStates()); a = Operations.removeDeadStates(a); - assertEquals(3, a.getNumStates()); + assertEquals(1, a.getNumStates()); } public void testRemoveDeadStatesEmpty1() throws Exception { @@ -1682,4 +1685,40 @@ public void testDeterminizeTooMuchEffort() { Operations.determinize(a, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT); }); } + + public void testMakeCharSetEmpty() { + Automaton expected = Automata.makeEmpty(); + Automaton actual = Automata.makeCharSet(new int[] {}); + assertTrue(AutomatonTestUtil.sameLanguage(expected, actual)); + assertTrue(actual.isDeterministic()); + assertEquals(0, actual.getNumStates()); + assertEquals(0, actual.getNumTransitions()); + } + + public void testMakeCharSetOne() { + Automaton expected = Automata.makeChar('a'); + Automaton actual = Automata.makeCharSet(new int[] {'a'}); + assertTrue(AutomatonTestUtil.sameLanguage(expected, actual)); + assertTrue(actual.isDeterministic()); + assertEquals(2, actual.getNumStates()); + assertEquals(1, actual.getNumTransitions()); + } + + public void testMakeCharSetTwo() { + Automaton expected = Operations.union(Automata.makeChar('a'), Automata.makeChar('A')); + Automaton actual = Automata.makeCharSet(new int[] {'a', 'A'}); + assertTrue(AutomatonTestUtil.sameLanguage(expected, actual)); + assertTrue(actual.isDeterministic()); + assertEquals(2, actual.getNumStates()); + assertEquals(2, actual.getNumTransitions()); + } + + public void testMakeCharSetDups() { + Automaton expected = Automata.makeChar('a'); + Automaton actual = Automata.makeCharSet(new int[] {'a', 'a', 'a'}); + assertTrue(AutomatonTestUtil.sameLanguage(expected, actual)); + assertTrue(actual.isDeterministic()); + assertEquals(2, 
actual.getNumStates()); + assertEquals(1, actual.getNumTransitions()); + } } diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExpParsing.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExpParsing.java index 74fb08cb7188..e0aec1e53516 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExpParsing.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExpParsing.java @@ -78,6 +78,20 @@ public void testCaseInsensitiveChar() { assertSameLanguage(expected, actual); } + // individual characters (only) inside a class are treated as case insensitive. + public void testCaseInsensitiveClassChar() { + RegExp re = new RegExp("[c]", RegExp.NONE, RegExp.ASCII_CASE_INSENSITIVE); + assertEquals( + "REGEXP_CHAR_CLASS starts=[U+0063 U+0043] ends=[U+0063 U+0043]\n", re.toStringTree()); + } + + // ranges aren't treated as case-insensitive, but maybe ok with charclass + // instead of adding range, expand it: iterate each codepoint, adding its alternatives + public void testCaseInsensitiveClassRange() { + RegExp re = new RegExp("[c-d]", RegExp.NONE, RegExp.ASCII_CASE_INSENSITIVE); + assertEquals("REGEXP_CHAR_RANGE from=c to=d\n", re.toStringTree()); + } + public void testCaseInsensitiveCharUpper() { RegExp re = new RegExp("C", RegExp.NONE, RegExp.ASCII_CASE_INSENSITIVE); assertEquals("\\C", re.toString()); @@ -136,6 +150,22 @@ public void testNegatedChar() { assertSameLanguage(expected, actual); } + public void testNegatedClass() { + RegExp re = new RegExp("[^c-da]"); + assertEquals( + String.join( + "\n", + "REGEXP_INTERSECTION", + " REGEXP_ANYCHAR", + " REGEXP_COMPLEMENT", + " REGEXP_CHAR_CLASS starts=[U+0063 U+0061] ends=[U+0064 U+0061]\n"), + re.toStringTree()); + + Automaton actual = re.toAutomaton(); + assertTrue(actual.isDeterministic()); + assertEquals(2, actual.getNumStates()); + } + public void testCharRange() { RegExp re = new RegExp("[b-d]"); assertEquals("[\\b-\\d]", re.toString()); @@ 
-180,8 +210,8 @@ public void testIllegalCharRange() { public void testCharClassDigit() { RegExp re = new RegExp("[\\d]"); - assertEquals("\\d", re.toString()); - assertEquals("REGEXP_PRE_CLASS class=\\d\n", re.toStringTree()); + assertEquals("[\\0-\\9]", re.toString()); + assertEquals("REGEXP_CHAR_RANGE from=0 to=9\n", re.toStringTree()); Automaton actual = re.toAutomaton(); assertTrue(actual.isDeterministic()); @@ -192,8 +222,8 @@ public void testCharClassDigit() { public void testCharClassNonDigit() { RegExp re = new RegExp("[\\D]"); - assertEquals("\\D", re.toString()); - assertEquals("REGEXP_PRE_CLASS class=\\D\n", re.toStringTree()); + assertEquals( + "REGEXP_CHAR_CLASS starts=[U+0000 U+003A] ends=[U+002F U+10FFFF]\n", re.toStringTree()); Automaton actual = re.toAutomaton(); assertTrue(actual.isDeterministic()); @@ -208,8 +238,9 @@ public void testCharClassNonDigit() { public void testCharClassWhitespace() { RegExp re = new RegExp("[\\s]"); - assertEquals("\\s", re.toString()); - assertEquals("REGEXP_PRE_CLASS class=\\s\n", re.toStringTree()); + assertEquals( + "REGEXP_CHAR_CLASS starts=[U+0009 U+000D U+0020] ends=[U+000A U+000D U+0020]\n", + re.toStringTree()); Automaton actual = re.toAutomaton(); assertTrue(actual.isDeterministic()); @@ -223,8 +254,9 @@ public void testCharClassWhitespace() { public void testCharClassNonWhitespace() { RegExp re = new RegExp("[\\S]"); - assertEquals("\\S", re.toString()); - assertEquals("REGEXP_PRE_CLASS class=\\S\n", re.toStringTree()); + assertEquals( + "REGEXP_CHAR_CLASS starts=[U+0000 U+000B U+000E U+0021] ends=[U+0008 U+000C U+001F U+10FFFF]\n", + re.toStringTree()); Automaton actual = re.toAutomaton(); assertTrue(actual.isDeterministic()); @@ -247,8 +279,10 @@ public void testCharClassNonWhitespace() { public void testCharClassWord() { RegExp re = new RegExp("[\\w]"); - assertEquals("\\w", re.toString()); - assertEquals("REGEXP_PRE_CLASS class=\\w\n", re.toStringTree()); + assertEquals("[\\0-\\9\\A-\\Z\\_\\a-\\z]", 
re.toString()); + assertEquals( + "REGEXP_CHAR_CLASS starts=[U+0030 U+0041 U+005F U+0061] ends=[U+0039 U+005A U+005F U+007A]\n", + re.toStringTree()); Automaton actual = re.toAutomaton(); assertTrue(actual.isDeterministic()); @@ -262,8 +296,9 @@ public void testCharClassWord() { public void testCharClassNonWord() { RegExp re = new RegExp("[\\W]"); - assertEquals("\\W", re.toString()); - assertEquals("REGEXP_PRE_CLASS class=\\W\n", re.toStringTree()); + assertEquals( + "REGEXP_CHAR_CLASS starts=[U+0000 U+003A U+005B U+0060 U+007B] ends=[U+002F U+0040 U+005E U+0060 U+10FFFF]\n", + re.toStringTree()); Automaton actual = re.toAutomaton(); assertTrue(actual.isDeterministic()); @@ -284,6 +319,17 @@ public void testCharClassNonWord() { assertSameLanguage(expected, actual); } + // char class with a couple of ranges, predefined, and individual chars + public void testJumboCharClass() { + RegExp re = new RegExp("[0-5a\\sbc-d]"); + assertEquals( + "REGEXP_CHAR_CLASS starts=[U+0030 U+0061 U+0009 U+000D U+0020 U+0062 U+0063] ends=[U+0035 U+0061 U+000A U+000D U+0020 U+0062 U+0064]\n", + re.toStringTree()); + Automaton actual = re.toAutomaton(); + assertTrue(actual.isDeterministic()); + assertEquals(2, actual.getNumStates()); + } + public void testTruncatedCharClass() { expectThrows( IllegalArgumentException.class,