From fe42efc59188c4fad4c299faf2e2df5b157c8007 Mon Sep 17 00:00:00 2001
From: Robert Muir <rmuir@apache.org>
Date: Thu, 6 Feb 2025 08:59:47 -0500
Subject: [PATCH] Add Automata.makeCharSet/makeCharClass to optimize regexp
 (#14193)

Add Automata.makeCharSet(int[])/makeCharClass(int[],int[]) to optimize regexp.

* Add new "character class" node, which was previously composed by union
  of many nodes.
* Remove "predefined class" node, which previously built an internal
  separate regex on the fly, it is just another character class.
* RegExp no longer uses union() internally, except for union (|) operator.
* Format codepoints in the internal parse tree output with U+%04X
* Fix concatenate to remove the dead states it creates, just like
  intersection/union/etc. do
* Fix the dead-states test to explicitly create dead states, rather than
  relying on concatenate to leave some behind; it no longer does.
---
 .../lucene/util/automaton/Automata.java       |  27 ++
 .../lucene/util/automaton/Operations.java     |   3 +-
 .../apache/lucene/util/automaton/RegExp.java  | 286 ++++++++++++------
 .../lucene/util/automaton/TestAutomaton.java  |  47 ++-
 .../util/automaton/TestRegExpParsing.java     |  70 ++++-
 5 files changed, 329 insertions(+), 104 deletions(-)
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java
index 9ecf748418f6..9b6198fb04c5 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java
@@ -32,6 +32,7 @@
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collection;
+import java.util.Objects;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefIterator;
 import org.apache.lucene.util.StringHelper;
@@ -140,6 +141,32 @@ public static Automaton makeCharRange(int min, int max) {
     return a;
   }
 
+  /** Returns a new minimal automaton that accepts any of the provided codepoints */
+  public static Automaton makeCharSet(int[] codepoints) {
+    return makeCharClass(codepoints, codepoints);
+  }
+
+  /** Returns a new minimal automaton that accepts any of the codepoint ranges */
+  public static Automaton makeCharClass(int[] starts, int[] ends) {
+    Objects.requireNonNull(starts);
+    Objects.requireNonNull(ends);
+    if (starts.length != ends.length) {
+      throw new IllegalArgumentException("starts must match ends");
+    }
+    if (starts.length == 0) {
+      return makeEmpty();
+    }
+    Automaton a = new Automaton();
+    int s1 = a.createState();
+    int s2 = a.createState();
+    a.setAccept(s2, true);
+    for (int i = 0; i < starts.length; i++) {
+      a.addTransition(s1, s2, starts[i], ends[i]);
+    }
+    a.finishState();
+    return a;
+  }
+
   /**
    * Constructs sub-automaton corresponding to decimal numbers of length x.substring(n).length().
    */
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java
index 7c2b164aa107..fb255f8a02e4 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java
@@ -148,8 +148,7 @@ public static Automaton concatenate(List<Automaton> l) {
     }
 
     result.finishState();
-
-    return result;
+    return Operations.removeDeadStates(result);
   }
 
   /**
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java b/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java
index 92bfe41b462b..aaf50c8c1802 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java
@@ -34,6 +34,7 @@
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
+import java.util.Locale;
 import java.util.Map;
 import java.util.Set;
 import java.util.function.BooleanSupplier;
@@ -374,6 +375,8 @@ public enum Kind {
     REGEXP_CHAR,
     /** A Character range */
     REGEXP_CHAR_RANGE,
+    /** A Character class (list of ranges) */
+    REGEXP_CHAR_CLASS,
     /** Any Character allowed */
     REGEXP_ANYCHAR,
     /** An empty expression */
@@ -386,8 +389,6 @@ public enum Kind {
     REGEXP_AUTOMATON,
     /** An Interval expression */
     REGEXP_INTERVAL,
-    /** An expression for a pre-defined class e.g. \w */
-    REGEXP_PRE_CLASS,
     /**
      * The complement of an expression.
      *
@@ -453,7 +454,7 @@ public enum Kind {
   public final int min, max, digits;
 
   /** Extents for range type expressions */
-  public final int from, to;
+  public final int from[], to[];
 
   // Parser variables
   private final String originalString;
@@ -528,8 +529,8 @@ public RegExp(String s, int syntax_flags, int match_flags) throws IllegalArgumen
       int min,
       int max,
       int digits,
-      int from,
-      int to) {
+      int from[],
+      int to[]) {
     this.originalString = null;
     this.kind = kind;
     this.flags = flags;
@@ -546,17 +547,17 @@ public RegExp(String s, int syntax_flags, int match_flags) throws IllegalArgumen
 
   // Simplified construction of container nodes
   static RegExp newContainerNode(int flags, Kind kind, RegExp exp1, RegExp exp2) {
-    return new RegExp(flags, kind, exp1, exp2, null, 0, 0, 0, 0, 0, 0);
+    return new RegExp(flags, kind, exp1, exp2, null, 0, 0, 0, 0, null, null);
   }
 
   // Simplified construction of repeating nodes
   static RegExp newRepeatingNode(int flags, Kind kind, RegExp exp, int min, int max) {
-    return new RegExp(flags, kind, exp, null, null, 0, min, max, 0, 0, 0);
+    return new RegExp(flags, kind, exp, null, null, 0, min, max, 0, null, null);
   }
 
   // Simplified construction of leaf nodes
   static RegExp newLeafNode(
-      int flags, Kind kind, String s, int c, int min, int max, int digits, int from, int to) {
+      int flags, Kind kind, String s, int c, int min, int max, int digits, int from[], int to[]) {
     return new RegExp(flags, kind, null, null, s, c, min, max, digits, from, to);
   }
 
@@ -598,10 +599,6 @@ private Automaton toAutomaton(
     List<Automaton> list;
     Automaton a = null;
     switch (kind) {
-      case REGEXP_PRE_CLASS:
-        RegExp expanded = expandPredefined();
-        a = expanded.toAutomaton(automata, automaton_provider);
-        break;
       case REGEXP_UNION:
         list = new ArrayList<>();
         findLeaves(exp1, Kind.REGEXP_UNION, list, automata, automaton_provider);
@@ -648,13 +645,16 @@ private Automaton toAutomaton(
         break;
       case REGEXP_CHAR:
         if (check(ASCII_CASE_INSENSITIVE)) {
-          a = toCaseInsensitiveChar(c);
+          a = Automata.makeCharSet(toCaseInsensitiveChar(c));
         } else {
           a = Automata.makeChar(c);
         }
         break;
       case REGEXP_CHAR_RANGE:
-        a = Automata.makeCharRange(from, to);
+        a = Automata.makeCharRange(from[0], to[0]);
+        break;
+      case REGEXP_CHAR_CLASS:
+        a = Automata.makeCharClass(from, to);
         break;
       case REGEXP_ANYCHAR:
         a = Automata.makeAnyChar();
@@ -696,23 +696,20 @@ private Automaton toAutomaton(
     return a;
   }
 
-  private Automaton toCaseInsensitiveChar(int codepoint) {
-    Automaton case1 = Automata.makeChar(codepoint);
+  private int[] toCaseInsensitiveChar(int codepoint) {
     // For now we only work with ASCII characters
     if (codepoint > 128) {
-      return case1;
+      return new int[] {codepoint};
     }
     int altCase =
         Character.isLowerCase(codepoint)
             ? Character.toUpperCase(codepoint)
             : Character.toLowerCase(codepoint);
-    Automaton result;
     if (altCase != codepoint) {
-      result = Operations.union(case1, Automata.makeChar(altCase));
+      return new int[] {codepoint, altCase};
     } else {
-      result = case1;
+      return new int[] {codepoint};
     }
-    return result;
   }
 
   private Automaton toCaseInsensitiveString() {
@@ -720,7 +717,8 @@ private Automaton toCaseInsensitiveString() {
 
     Iterator<Integer> iter = s.codePoints().iterator();
     while (iter.hasNext()) {
-      list.add(toCaseInsensitiveChar(iter.next()));
+      int points[] = toCaseInsensitiveChar(iter.next());
+      list.add(Automata.makeCharSet(points));
     }
     return Operations.concatenate(list);
   }
@@ -802,7 +800,19 @@ void toStringBuilder(StringBuilder b) {
         b.append("\\").appendCodePoint(c);
         break;
       case REGEXP_CHAR_RANGE:
-        b.append("[\\").appendCodePoint(from).append("-\\").appendCodePoint(to).append("]");
+        b.append("[\\").appendCodePoint(from[0]).append("-\\").appendCodePoint(to[0]).append("]");
+        break;
+      case REGEXP_CHAR_CLASS:
+        b.append("[");
+        for (int i = 0; i < from.length; i++) {
+          if (from[i] == to[i]) {
+            b.append("\\").appendCodePoint(from[i]);
+          } else {
+            b.append("\\").appendCodePoint(from[i]);
+            b.append("-\\").appendCodePoint(to[i]);
+          }
+        }
+        b.append("]");
         break;
       case REGEXP_ANYCHAR:
         b.append(".");
@@ -828,13 +838,10 @@ void toStringBuilder(StringBuilder b) {
         if (digits > 0) for (int i = s2.length(); i < digits; i++) b.append('0');
         b.append(s2).append(">");
         break;
-      case REGEXP_PRE_CLASS:
-        b.append("\\").appendCodePoint(from);
-        break;
     }
   }
 
-  /** Like to string, but more verbose (shows the higherchy more clearly). */
+  /** Like to string, but more verbose (shows the hierarchy more clearly). */
   public String toStringTree() {
     StringBuilder b = new StringBuilder();
     toStringTree(b, "");
@@ -888,20 +895,22 @@ void toStringTree(StringBuilder b, String indent) {
         b.appendCodePoint(c);
         b.append('\n');
         break;
-      case REGEXP_PRE_CLASS:
+      case REGEXP_CHAR_RANGE:
         b.append(indent);
         b.append(kind);
-        b.append(" class=\\");
-        b.appendCodePoint(from);
+        b.append(" from=");
+        b.appendCodePoint(from[0]);
+        b.append(" to=");
+        b.appendCodePoint(to[0]);
         b.append('\n');
         break;
-      case REGEXP_CHAR_RANGE:
+      case REGEXP_CHAR_CLASS:
         b.append(indent);
         b.append(kind);
-        b.append(" from=");
-        b.appendCodePoint(from);
-        b.append(" to=");
-        b.appendCodePoint(to);
+        b.append(" starts=");
+        b.append(toHexString(from));
+        b.append(" ends=");
+        b.append(toHexString(to));
         b.append('\n');
         break;
       case REGEXP_ANYCHAR:
@@ -942,6 +951,20 @@ void toStringTree(StringBuilder b, String indent) {
     }
   }
 
+  /** prints like <code>[U+002A U+FD72 U+1FFFF]</code> */
+  private StringBuilder toHexString(int[] range) {
+    StringBuilder sb = new StringBuilder();
+    sb.append('[');
+    for (int codepoint : range) {
+      if (sb.length() > 1) {
+        sb.append(' ');
+      }
+      sb.append(String.format(Locale.ROOT, "U+%04X", codepoint));
+    }
+    sb.append(']');
+    return sb;
+  }
+
   /** Returns set of automaton identifiers that occur in this regular expression. */
   public Set<String> getIdentifiers() {
     HashSet<String> set = new HashSet<>();
@@ -972,9 +995,9 @@ void getIdentifiers(Set<String> set) {
       case REGEXP_ANYSTRING:
       case REGEXP_CHAR:
       case REGEXP_CHAR_RANGE:
+      case REGEXP_CHAR_CLASS:
       case REGEXP_EMPTY:
       case REGEXP_INTERVAL:
-      case REGEXP_PRE_CLASS:
       case REGEXP_STRING:
       default:
     }
@@ -1050,14 +1073,33 @@ static RegExp makeDeprecatedComplement(int flags, RegExp exp) {
   }
 
   static RegExp makeChar(int flags, int c) {
-    return newLeafNode(flags, Kind.REGEXP_CHAR, null, c, 0, 0, 0, 0, 0);
+    return newLeafNode(flags, Kind.REGEXP_CHAR, null, c, 0, 0, 0, null, null);
   }
 
   static RegExp makeCharRange(int flags, int from, int to) {
     if (from > to)
       throw new IllegalArgumentException(
           "invalid range: from (" + from + ") cannot be > to (" + to + ")");
-    return newLeafNode(flags, Kind.REGEXP_CHAR_RANGE, null, 0, 0, 0, 0, from, to);
+    return newLeafNode(
+        flags, Kind.REGEXP_CHAR_RANGE, null, 0, 0, 0, 0, new int[] {from}, new int[] {to});
+  }
+
+  static RegExp makeCharClass(int flags, int from[], int to[]) {
+    if (from.length != to.length) {
+      throw new IllegalStateException(
+          String.format(
+              Locale.ROOT,
+              "invalid class: from.length (%d) != to.length (%d)",
+              from.length,
+              to.length));
+    }
+    for (int i = 0; i < from.length; i++) {
+      if (from[i] > to[i]) {
+        throw new IllegalArgumentException(
+            "invalid range: from (" + from[i] + ") cannot be > to (" + to[i] + ")");
+      }
+    }
+    return newLeafNode(flags, Kind.REGEXP_CHAR_CLASS, null, 0, 0, 0, 0, from, to);
   }
 
   static RegExp makeAnyChar(int flags) {
@@ -1069,7 +1111,7 @@ static RegExp makeEmpty(int flags) {
   }
 
   static RegExp makeString(int flags, String s) {
-    return newLeafNode(flags, Kind.REGEXP_STRING, s, 0, 0, 0, 0, 0, 0);
+    return newLeafNode(flags, Kind.REGEXP_STRING, s, 0, 0, 0, 0, null, null);
   }
 
   static RegExp makeAnyString(int flags) {
@@ -1077,11 +1119,11 @@ static RegExp makeAnyString(int flags) {
   }
 
   static RegExp makeAutomaton(int flags, String s) {
-    return newLeafNode(flags, Kind.REGEXP_AUTOMATON, s, 0, 0, 0, 0, 0, 0);
+    return newLeafNode(flags, Kind.REGEXP_AUTOMATON, s, 0, 0, 0, 0, null, null);
   }
 
   static RegExp makeInterval(int flags, int min, int max, int digits) {
-    return newLeafNode(flags, Kind.REGEXP_INTERVAL, null, 0, min, max, digits, 0, 0);
+    return newLeafNode(flags, Kind.REGEXP_INTERVAL, null, 0, min, max, digits, null, null);
   }
 
   private boolean peek(String s) {
@@ -1195,60 +1237,132 @@ final RegExp parseCharClassExp() throws IllegalArgumentException {
   }
 
   final RegExp parseCharClasses() throws IllegalArgumentException {
-    RegExp e = parseCharClass();
-    while (more() && !peek("]")) e = makeUnion(flags, e, parseCharClass());
-    return e;
-  }
+    ArrayList<Integer> starts = new ArrayList<>();
+    ArrayList<Integer> ends = new ArrayList<>();
 
-  final RegExp parseCharClass() throws IllegalArgumentException {
-    RegExp predefinedExp = matchPredefinedCharacterClass();
-    if (predefinedExp != null) {
-      return predefinedExp;
-    }
-
-    int c = parseCharExp();
-    if (match('-')) return makeCharRange(flags, c, parseCharExp());
-    else return makeChar(flags, c);
-  }
-
-  RegExp expandPredefined() {
-    // See https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html
-    switch (from) {
-      case 'd':
-        return new RegExp("[0-9]"); // digit
-      case 'D':
-        return new RegExp("[^0-9]"); // non-digit
-      case 's':
-        return new RegExp("[ \t\n\r]"); // whitespace
-      case 'S':
-        return new RegExp("[^\\s]"); // non-whitespace
-      case 'w':
-        return new RegExp("[a-zA-Z_0-9]"); // word
-      case 'W':
-        return new RegExp("[^\\w]"); // non-word
-      default:
-        throw new IllegalArgumentException("invalid character class " + from);
-    }
-  }
-
-  final RegExp matchPredefinedCharacterClass() {
-    // See https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html
-    if (match('\\')) {
-      if (peek("dDwWsS")) {
-        return newLeafNode(flags, Kind.REGEXP_PRE_CLASS, null, 0, 0, 0, 0, next(), 0);
+    do {
+      // look for escape
+      if (match('\\')) {
+        expandPreDefined(starts, ends);
+      } else {
+        // parse a character
+        int c = parseCharExp();
+
+        if (match('-')) {
+          // range from c-d
+          starts.add(c);
+          ends.add(parseCharExp());
+        } else if (check(ASCII_CASE_INSENSITIVE)) {
+          // single case-insensitive character
+          for (int form : toCaseInsensitiveChar(c)) {
+            starts.add(form);
+            ends.add(form);
+          }
+        } else {
+          // single character
+          starts.add(c);
+          ends.add(c);
+        }
       }
+    } while (more() && !peek("]"));
 
-      if (peek("\\")) {
-        return makeChar(flags, next());
+    // not sure why we bother optimizing nodes, same automaton...
+    // definitely saves time vs fixing toString()-based tests.
+    if (starts.size() == 1) {
+      if (starts.get(0).intValue() == ends.get(0).intValue()) {
+        return makeChar(flags, starts.get(0));
+      } else {
+        return makeCharRange(flags, starts.get(0), ends.get(0));
       }
+    } else {
+      return makeCharClass(
+          flags,
+          starts.stream().mapToInt(Integer::intValue).toArray(),
+          ends.stream().mapToInt(Integer::intValue).toArray());
+    }
+  }
 
+  void expandPreDefined(List<Integer> starts, List<Integer> ends) {
+    if (peek("\\")) {
+      // escape
+      starts.add((int) '\\');
+      ends.add((int) '\\');
+      next();
+    } else if (peek("d")) {
+      // digit: [0-9]
+      starts.add((int) '0');
+      ends.add((int) '9');
+      next();
+    } else if (peek("D")) {
+      // non-digit: [^0-9]
+      starts.add(Character.MIN_CODE_POINT);
+      ends.add('0' - 1);
+      starts.add('9' + 1);
+      ends.add(Character.MAX_CODE_POINT);
+      next();
+    } else if (peek("s")) {
+      // whitespace: [\t-\n\r ]
+      starts.add((int) '\t');
+      ends.add((int) '\n');
+      starts.add((int) '\r');
+      ends.add((int) '\r');
+      starts.add((int) ' ');
+      ends.add((int) ' ');
+      next();
+    } else if (peek("S")) {
+      // non-whitespace: [^\t-\n\r ]
+      starts.add(Character.MIN_CODE_POINT);
+      ends.add('\t' - 1);
+      starts.add('\n' + 1);
+      ends.add('\r' - 1);
+      starts.add('\r' + 1);
+      ends.add(' ' - 1);
+      starts.add(' ' + 1);
+      ends.add(Character.MAX_CODE_POINT);
+      next();
+    } else if (peek("w")) {
+      // word: [0-9A-Z_a-z]
+      starts.add((int) '0');
+      ends.add((int) '9');
+      starts.add((int) 'A');
+      ends.add((int) 'Z');
+      starts.add((int) '_');
+      ends.add((int) '_');
+      starts.add((int) 'a');
+      ends.add((int) 'z');
+      next();
+    } else if (peek("W")) {
+      // non-word: [^0-9A-Z_a-z]
+      starts.add(Character.MIN_CODE_POINT);
+      ends.add('0' - 1);
+      starts.add('9' + 1);
+      ends.add('A' - 1);
+      starts.add('Z' + 1);
+      ends.add('_' - 1);
+      starts.add('_' + 1);
+      ends.add('a' - 1);
+      starts.add('z' + 1);
+      ends.add(Character.MAX_CODE_POINT);
+      next();
+    } else if (peek("abcefghijklmnopqrtuvxyz") || peek("ABCEFGHIJKLMNOPQRTUVXYZ")) {
       // From https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html#bs
       // "It is an error to use a backslash prior to any alphabetic character that does not denote
       // an escaped
       // construct;"
-      if (peek("abcefghijklmnopqrtuvxyz") || peek("ABCEFGHIJKLMNOPQRTUVXYZ")) {
-        throw new IllegalArgumentException("invalid character class \\" + next());
-      }
+      throw new IllegalArgumentException("invalid character class \\" + next());
+    }
+  }
+
+  final RegExp matchPredefinedCharacterClass() {
+    // See https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html
+    if (match('\\') && peek("\\ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz")) {
+      var starts = new ArrayList<Integer>();
+      var ends = new ArrayList<Integer>();
+      expandPreDefined(starts, ends);
+      return makeCharClass(
+          flags,
+          starts.stream().mapToInt(Integer::intValue).toArray(),
+          ends.stream().mapToInt(Integer::intValue).toArray());
     }
 
     return null;
diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java
index 3c7d6eea198a..e9e35416db4f 100644
--- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java
+++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java
@@ -667,11 +667,14 @@ public void testConcatenatePreservesDet() throws Exception {
   }
 
   public void testRemoveDeadStates() throws Exception {
-    Automaton a =
-        Operations.concatenate(Arrays.asList(Automata.makeString("x"), Automata.makeString("y")));
-    assertEquals(4, a.getNumStates());
+    Automaton a = new Automaton();
+    int s1 = a.createState();
+    a.createState(); // create dead state
+    a.setAccept(s1, true);
+    a.finishState();
+    assertEquals(2, a.getNumStates());
     a = Operations.removeDeadStates(a);
-    assertEquals(3, a.getNumStates());
+    assertEquals(1, a.getNumStates());
   }
 
   public void testRemoveDeadStatesEmpty1() throws Exception {
@@ -1682,4 +1685,40 @@ public void testDeterminizeTooMuchEffort() {
           Operations.determinize(a, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
         });
   }
+
+  public void testMakeCharSetEmpty() {
+    Automaton expected = Automata.makeEmpty();
+    Automaton actual = Automata.makeCharSet(new int[] {});
+    assertTrue(AutomatonTestUtil.sameLanguage(expected, actual));
+    assertTrue(actual.isDeterministic());
+    assertEquals(0, actual.getNumStates());
+    assertEquals(0, actual.getNumTransitions());
+  }
+
+  public void testMakeCharSetOne() {
+    Automaton expected = Automata.makeChar('a');
+    Automaton actual = Automata.makeCharSet(new int[] {'a'});
+    assertTrue(AutomatonTestUtil.sameLanguage(expected, actual));
+    assertTrue(actual.isDeterministic());
+    assertEquals(2, actual.getNumStates());
+    assertEquals(1, actual.getNumTransitions());
+  }
+
+  public void testMakeCharSetTwo() {
+    Automaton expected = Operations.union(Automata.makeChar('a'), Automata.makeChar('A'));
+    Automaton actual = Automata.makeCharSet(new int[] {'a', 'A'});
+    assertTrue(AutomatonTestUtil.sameLanguage(expected, actual));
+    assertTrue(actual.isDeterministic());
+    assertEquals(2, actual.getNumStates());
+    assertEquals(2, actual.getNumTransitions());
+  }
+
+  public void testMakeCharSetDups() {
+    Automaton expected = Automata.makeChar('a');
+    Automaton actual = Automata.makeCharSet(new int[] {'a', 'a', 'a'});
+    assertTrue(AutomatonTestUtil.sameLanguage(expected, actual));
+    assertTrue(actual.isDeterministic());
+    assertEquals(2, actual.getNumStates());
+    assertEquals(1, actual.getNumTransitions());
+  }
 }
diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExpParsing.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExpParsing.java
index 74fb08cb7188..e0aec1e53516 100644
--- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExpParsing.java
+++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExpParsing.java
@@ -78,6 +78,20 @@ public void testCaseInsensitiveChar() {
     assertSameLanguage(expected, actual);
   }
 
+  // individual characters (only) inside a class are treated as case insensitive.
+  public void testCaseInsensitiveClassChar() {
+    RegExp re = new RegExp("[c]", RegExp.NONE, RegExp.ASCII_CASE_INSENSITIVE);
+    assertEquals(
+        "REGEXP_CHAR_CLASS starts=[U+0063 U+0043] ends=[U+0063 U+0043]\n", re.toStringTree());
+  }
+
+  // ranges aren't treated as case-insensitive, but maybe ok with charclass
+  // instead of adding range, expand it: iterate each codepoint, adding its alternatives
+  public void testCaseInsensitiveClassRange() {
+    RegExp re = new RegExp("[c-d]", RegExp.NONE, RegExp.ASCII_CASE_INSENSITIVE);
+    assertEquals("REGEXP_CHAR_RANGE from=c to=d\n", re.toStringTree());
+  }
+
   public void testCaseInsensitiveCharUpper() {
     RegExp re = new RegExp("C", RegExp.NONE, RegExp.ASCII_CASE_INSENSITIVE);
     assertEquals("\\C", re.toString());
@@ -136,6 +150,22 @@ public void testNegatedChar() {
     assertSameLanguage(expected, actual);
   }
 
+  public void testNegatedClass() {
+    RegExp re = new RegExp("[^c-da]");
+    assertEquals(
+        String.join(
+            "\n",
+            "REGEXP_INTERSECTION",
+            "  REGEXP_ANYCHAR",
+            "  REGEXP_COMPLEMENT",
+            "    REGEXP_CHAR_CLASS starts=[U+0063 U+0061] ends=[U+0064 U+0061]\n"),
+        re.toStringTree());
+
+    Automaton actual = re.toAutomaton();
+    assertTrue(actual.isDeterministic());
+    assertEquals(2, actual.getNumStates());
+  }
+
   public void testCharRange() {
     RegExp re = new RegExp("[b-d]");
     assertEquals("[\\b-\\d]", re.toString());
@@ -180,8 +210,8 @@ public void testIllegalCharRange() {
 
   public void testCharClassDigit() {
     RegExp re = new RegExp("[\\d]");
-    assertEquals("\\d", re.toString());
-    assertEquals("REGEXP_PRE_CLASS class=\\d\n", re.toStringTree());
+    assertEquals("[\\0-\\9]", re.toString());
+    assertEquals("REGEXP_CHAR_RANGE from=0 to=9\n", re.toStringTree());
 
     Automaton actual = re.toAutomaton();
     assertTrue(actual.isDeterministic());
@@ -192,8 +222,8 @@ public void testCharClassDigit() {
 
   public void testCharClassNonDigit() {
     RegExp re = new RegExp("[\\D]");
-    assertEquals("\\D", re.toString());
-    assertEquals("REGEXP_PRE_CLASS class=\\D\n", re.toStringTree());
+    assertEquals(
+        "REGEXP_CHAR_CLASS starts=[U+0000 U+003A] ends=[U+002F U+10FFFF]\n", re.toStringTree());
 
     Automaton actual = re.toAutomaton();
     assertTrue(actual.isDeterministic());
@@ -208,8 +238,9 @@ public void testCharClassNonDigit() {
 
   public void testCharClassWhitespace() {
     RegExp re = new RegExp("[\\s]");
-    assertEquals("\\s", re.toString());
-    assertEquals("REGEXP_PRE_CLASS class=\\s\n", re.toStringTree());
+    assertEquals(
+        "REGEXP_CHAR_CLASS starts=[U+0009 U+000D U+0020] ends=[U+000A U+000D U+0020]\n",
+        re.toStringTree());
 
     Automaton actual = re.toAutomaton();
     assertTrue(actual.isDeterministic());
@@ -223,8 +254,9 @@ public void testCharClassWhitespace() {
 
   public void testCharClassNonWhitespace() {
     RegExp re = new RegExp("[\\S]");
-    assertEquals("\\S", re.toString());
-    assertEquals("REGEXP_PRE_CLASS class=\\S\n", re.toStringTree());
+    assertEquals(
+        "REGEXP_CHAR_CLASS starts=[U+0000 U+000B U+000E U+0021] ends=[U+0008 U+000C U+001F U+10FFFF]\n",
+        re.toStringTree());
 
     Automaton actual = re.toAutomaton();
     assertTrue(actual.isDeterministic());
@@ -247,8 +279,10 @@ public void testCharClassNonWhitespace() {
 
   public void testCharClassWord() {
     RegExp re = new RegExp("[\\w]");
-    assertEquals("\\w", re.toString());
-    assertEquals("REGEXP_PRE_CLASS class=\\w\n", re.toStringTree());
+    assertEquals("[\\0-\\9\\A-\\Z\\_\\a-\\z]", re.toString());
+    assertEquals(
+        "REGEXP_CHAR_CLASS starts=[U+0030 U+0041 U+005F U+0061] ends=[U+0039 U+005A U+005F U+007A]\n",
+        re.toStringTree());
 
     Automaton actual = re.toAutomaton();
     assertTrue(actual.isDeterministic());
@@ -262,8 +296,9 @@ public void testCharClassWord() {
 
   public void testCharClassNonWord() {
     RegExp re = new RegExp("[\\W]");
-    assertEquals("\\W", re.toString());
-    assertEquals("REGEXP_PRE_CLASS class=\\W\n", re.toStringTree());
+    assertEquals(
+        "REGEXP_CHAR_CLASS starts=[U+0000 U+003A U+005B U+0060 U+007B] ends=[U+002F U+0040 U+005E U+0060 U+10FFFF]\n",
+        re.toStringTree());
 
     Automaton actual = re.toAutomaton();
     assertTrue(actual.isDeterministic());
@@ -284,6 +319,17 @@ public void testCharClassNonWord() {
     assertSameLanguage(expected, actual);
   }
 
+  // char class with a couple of ranges, a predefined class, and individual chars
+  public void testJumboCharClass() {
+    RegExp re = new RegExp("[0-5a\\sbc-d]");
+    assertEquals(
+        "REGEXP_CHAR_CLASS starts=[U+0030 U+0061 U+0009 U+000D U+0020 U+0062 U+0063] ends=[U+0035 U+0061 U+000A U+000D U+0020 U+0062 U+0064]\n",
+        re.toStringTree());
+    Automaton actual = re.toAutomaton();
+    assertTrue(actual.isDeterministic());
+    assertEquals(2, actual.getNumStates());
+  }
+
   public void testTruncatedCharClass() {
     expectThrows(
         IllegalArgumentException.class,