opensearch-project · RyanL1997 · Oct 29, 2025 · Oct 22, 2025 · Oct 22, 2025 · Oct 23, 2025
@@ -282,10 +282,13 @@ public RelNode visitRex(Rex node, CalcitePlanContext context) {
           "Rex pattern must contain at least one named capture group");
     }
 
+    // TODO: Once JDK 20+ is supported, consider using Pattern.namedGroups() API for more efficient
+    // named group handling instead of manual parsing in RegexCommonUtils
+
     List<RexNode> newFields = new ArrayList<>();
     List<String> newFieldNames = new ArrayList<>();
 
-    for (int i = 0; i < namedGroups.size(); i++) {
+    for (String groupName : namedGroups) {
       RexNode extractCall;
       if (node.getMaxMatch().isPresent() && node.getMaxMatch().get() > 1) {
         extractCall =
@@ -294,7 +297,7 @@ public RelNode visitRex(Rex node, CalcitePlanContext context) {
                 BuiltinFunctionName.REX_EXTRACT_MULTI,
                 fieldRex,
                 context.rexBuilder.makeLiteral(patternStr),
-                context.relBuilder.literal(i + 1),
+                context.rexBuilder.makeLiteral(groupName),
                 context.relBuilder.literal(node.getMaxMatch().get()));
       } else {
         extractCall =
@@ -303,10 +306,10 @@ public RelNode visitRex(Rex node, CalcitePlanContext context) {
                 BuiltinFunctionName.REX_EXTRACT,
                 fieldRex,
                 context.rexBuilder.makeLiteral(patternStr),
-                context.relBuilder.literal(i + 1));
+                context.rexBuilder.makeLiteral(groupName));
       }
       newFields.add(extractCall);
-      newFieldNames.add(namedGroups.get(i));
+      newFieldNames.add(groupName);
     }
 
     if (node.getOffsetField().isPresent()) {

@@ -15,11 +15,13 @@
 import org.apache.calcite.linq4j.tree.Expression;
 import org.apache.calcite.linq4j.tree.Expressions;
 import org.apache.calcite.rex.RexCall;
+import org.apache.calcite.sql.type.CompositeOperandTypeChecker;
 import org.apache.calcite.sql.type.ReturnTypes;
 import org.apache.calcite.sql.type.SqlReturnTypeInference;
 import org.opensearch.sql.calcite.utils.PPLOperandTypes;
 import org.opensearch.sql.expression.function.ImplementorUDF;
 import org.opensearch.sql.expression.function.UDFOperandMetadata;
+import org.opensearch.sql.expression.parse.RegexCommonUtils;
 
 /** Custom REX_EXTRACT function for extracting regex named capture groups. */
 public final class RexExtractFunction extends ImplementorUDF {
@@ -35,7 +37,12 @@ public SqlReturnTypeInference getReturnTypeInference() {
 
   @Override
   public UDFOperandMetadata getOperandMetadata() {
-    return PPLOperandTypes.STRING_STRING_INTEGER;
+    // Accepts either (field, pattern, groupIndex) or (field, pattern, groupName) as operands
+    return UDFOperandMetadata.wrap(
+        (CompositeOperandTypeChecker)
+            PPLOperandTypes.STRING_STRING_INTEGER
+                .getInnerTypeChecker()
+                .or(PPLOperandTypes.STRING_STRING_STRING.getInnerTypeChecker()));
   }
 
   private static class RexExtractImplementor implements NotNullImplementor {
@@ -45,19 +52,80 @@ public Expression implement(
         RexToLixTranslator translator, RexCall call, List<Expression> translatedOperands) {
       Expression field = translatedOperands.get(0);
       Expression pattern = translatedOperands.get(1);
-      Expression groupIndex = translatedOperands.get(2);
+      Expression groupIndexOrName = translatedOperands.get(2);
 
-      return Expressions.call(RexExtractFunction.class, "extractGroup", field, pattern, groupIndex);
+      return Expressions.call(
+          RexExtractFunction.class, "extractGroup", field, pattern, groupIndexOrName);
     }
   }
 
+  /**
+   * Extract a regex group by index (1-based) from the first match of the pattern in the text.
+   *
+   * @param text The input text to extract from; null yields null
+   * @param pattern The regex pattern; null yields null
+   * @param groupIndex The 1-based group index to extract
+   * @return The group's text, or null when nothing matches or the index is out of range
+   */
   public static String extractGroup(String text, String pattern, int groupIndex) {
+    if (text == null || pattern == null) {
+      return null;
+    }
+
+    return executeExtraction(
+        text,
+        pattern,
+        matcher -> {
+          if (groupIndex > 0 && groupIndex <= matcher.groupCount()) {
+            return matcher.group(groupIndex);
+          }
+          return null;
+        });
+  }
+
+  /**
+   * Extract a named capture group from the first match of the pattern. Looking the group up by
+   * name avoids the index shifting issue that occurs with nested unnamed groups.
+   *
+   * @param text The input text to extract from; null yields null
+   * @param pattern The regex pattern with named capture groups; null yields null
+   * @param groupName The name of the capture group to extract; null yields null
+   * @return The group's text, or null when nothing matches or the name is not in the pattern
+   */
+  public static String extractGroup(String text, String pattern, String groupName) {
+    if (text == null || pattern == null || groupName == null) {
+      return null;
+    }
+
+    return executeExtraction(
+        text,
+        pattern,
+        matcher -> {
+          try {
+            return matcher.group(groupName);
+          } catch (IllegalArgumentException e) {
+            // Matcher.group(String) rejects names the pattern does not define; map that to null
+            return null;
+          }
+        });
+  }
+
+  /**
+   * Common extraction logic to avoid code duplication.
+   *
+   * @param text The input text
+   * @param pattern The regex pattern
+   * @param extractor Function to extract the value from the matcher
+   * @return The extracted value or null
+   */
+  private static String executeExtraction(
+      String text, String pattern, java.util.function.Function<Matcher, String> extractor) {
     try {
-      Pattern compiledPattern = Pattern.compile(pattern);
+      Pattern compiledPattern = RegexCommonUtils.getCompiledPattern(pattern);
       Matcher matcher = compiledPattern.matcher(text);
 
-      if (matcher.find() && groupIndex > 0 && groupIndex <= matcher.groupCount()) {
-        return matcher.group(groupIndex);
+      if (matcher.find()) {
+        return extractor.apply(matcher);
       }
       return null;
     } catch (PatternSyntaxException e) {

@@ -16,11 +16,15 @@
 import org.apache.calcite.linq4j.tree.Expression;
 import org.apache.calcite.linq4j.tree.Expressions;
 import org.apache.calcite.rex.RexCall;
+import org.apache.calcite.sql.type.CompositeOperandTypeChecker;
+import org.apache.calcite.sql.type.OperandTypes;
 import org.apache.calcite.sql.type.SqlReturnTypeInference;
+import org.apache.calcite.sql.type.SqlTypeFamily;
 import org.apache.calcite.sql.type.SqlTypeName;
 import org.opensearch.sql.calcite.utils.PPLOperandTypes;
 import org.opensearch.sql.expression.function.ImplementorUDF;
 import org.opensearch.sql.expression.function.UDFOperandMetadata;
+import org.opensearch.sql.expression.parse.RegexCommonUtils;
 
 /** Custom REX_EXTRACT_MULTI function for extracting multiple regex matches. */
 public final class RexExtractMultiFunction extends ImplementorUDF {
@@ -40,7 +44,17 @@ public SqlReturnTypeInference getReturnTypeInference() {
 
   @Override
   public UDFOperandMetadata getOperandMetadata() {
-    return PPLOperandTypes.STRING_STRING_INTEGER_INTEGER;
+    // Accepts (field, pattern, groupIndex, maxMatch) or (field, pattern, groupName, maxMatch)
+    return UDFOperandMetadata.wrap(
+        (CompositeOperandTypeChecker)
+            PPLOperandTypes.STRING_STRING_INTEGER_INTEGER
+                .getInnerTypeChecker()
+                .or(
+                    OperandTypes.family(
+                        SqlTypeFamily.CHARACTER,
+                        SqlTypeFamily.CHARACTER,
+                        SqlTypeFamily.CHARACTER,
+                        SqlTypeFamily.INTEGER)));
   }
 
   private static class RexExtractMultiImplementor implements NotNullImplementor {
@@ -50,35 +64,105 @@ public Expression implement(
         RexToLixTranslator translator, RexCall call, List<Expression> translatedOperands) {
       Expression field = translatedOperands.get(0);
       Expression pattern = translatedOperands.get(1);
-      Expression groupIndex = translatedOperands.get(2);
+      Expression groupIndexOrName = translatedOperands.get(2);
       Expression maxMatch = translatedOperands.get(3);
 
       return Expressions.call(
           RexExtractMultiFunction.class,
           "extractMultipleGroups",
           field,
           pattern,
-          groupIndex,
+          groupIndexOrName,
           maxMatch);
     }
   }
 
+  /**
+   * Collect the value of one group, chosen by 1-based index, from successive pattern matches.
+   *
+   * @param text The input text to extract from; a null text yields null
+   * @param pattern The regex pattern; a null pattern yields null
+   * @param groupIndex The 1-based group index; an out-of-range index yields no values
+   * @param maxMatch Maximum number of matches to return (0 = unlimited)
+   * @return List of extracted values or null if no matches found
+   */
   public static List<String> extractMultipleGroups(
       String text, String pattern, int groupIndex, int maxMatch) {
-    // Query planner already validates null inputs via NullPolicy.ARG0
+    if (text == null || pattern == null) {
+      return null;
+    }
+
+    return executeMultipleExtractions(
+        text,
+        pattern,
+        maxMatch,
+        matcher -> {
+          if (groupIndex > 0 && groupIndex <= matcher.groupCount()) {
+            return matcher.group(groupIndex);
+          }
+          return null;
+        });
+  }
+
+  /**
+   * Collect successive values of a named capture group from text. Looking the group up by name
+   * avoids the index shifting issue that occurs with nested unnamed groups.
+   *
+   * @param text The input text to extract from; a null text yields null
+   * @param pattern The regex pattern with named capture groups; a null pattern yields null
+   * @param groupName The name of the capture group to extract; a null name yields null
+   * @param maxMatch Maximum number of matches to return (0 = unlimited)
+   * @return List of extracted values or null if no matches found
+   */
+  public static List<String> extractMultipleGroups(
+      String text, String pattern, String groupName, int maxMatch) {
+    if (text == null || pattern == null || groupName == null) {
+      return null;
+    }
+
+    return executeMultipleExtractions(
+        text,
+        pattern,
+        maxMatch,
+        matcher -> {
+          try {
+            return matcher.group(groupName);
+          } catch (IllegalArgumentException e) {
+            // Name is not defined in the pattern; the null makes the shared loop stop collecting
+            return null;
+          }
+        });
+  }
+
+  /**
+   * Common extraction logic for multiple matches to avoid code duplication.
+   *
+   * @param text The input text
+   * @param pattern The regex pattern
+   * @param maxMatch Maximum matches (0 = unlimited)
+   * @param extractor Function to extract the value from the matcher
+   * @return List of extracted values or null if no matches found
+   */
+  private static List<String> executeMultipleExtractions(
+      String text,
+      String pattern,
+      int maxMatch,
+      java.util.function.Function<Matcher, String> extractor) {
     try {
-      Pattern compiledPattern = Pattern.compile(pattern);
+      Pattern compiledPattern = RegexCommonUtils.getCompiledPattern(pattern);
       Matcher matcher = compiledPattern.matcher(text);
       List<String> matches = new ArrayList<>();
 
       int matchCount = 0;
       while (matcher.find() && (maxMatch == 0 || matchCount < maxMatch)) {
-        if (groupIndex > 0 && groupIndex <= matcher.groupCount()) {
-          String match = matcher.group(groupIndex);
-          if (match != null) {
-            matches.add(match);
-            matchCount++;
-          }
+        String match = extractor.apply(matcher);
+        if (match != null) {
+          matches.add(match);
+          matchCount++;
+        } else {
+          // Null: the group index/name is invalid, or the group took no part in this match.
+          // Stop collecting (the replaced code instead skipped such matches and kept scanning).
+          break;
         }
       }
 

@@ -306,4 +306,53 @@ public void testRexMaxMatchConfigurableLimit() throws IOException {
           new ClusterSetting(PERSISTENT, Settings.Key.PPL_REX_MAX_MATCH_LIMIT.getKeyValue(), null));
     }
   }
+
+  @Test
+  public void testRexNestedCaptureGroupsBugFix() throws IOException {
+    JSONObject resultWithNested =
+        executeQuery(
+            String.format(
+                "source=%s | rex field=email"
+                    + " \\\"(?<user>[^@]+)@(?<domain>(pyrami|gmail|yahoo))\\\\\\\\.(?<tld>(com|org|net))\\\""
+                    + " | fields user, domain, tld | head 1",
+                TEST_INDEX_ACCOUNT));
+
+    assertEquals(1, resultWithNested.getJSONArray("datarows").length());
+    assertEquals(
+        "amberduke",
+        resultWithNested
+            .getJSONArray("datarows")
+            .getJSONArray(0)
+            .get(0)); // user should be "amberduke"
+    assertEquals(
+        "pyrami",
+        resultWithNested
+            .getJSONArray("datarows")
+            .getJSONArray(0)
+            .get(1)); // domain should be "pyrami", NOT "amberduke"
+    assertEquals(
+        "com",
+        resultWithNested
+            .getJSONArray("datarows")
+            .getJSONArray(0)
+            .get(2)); // tld should be "com", NOT "pyrami"
+
+    // More complex nested alternation; the assertions below are skipped if no rows come back
+    JSONObject complexNested =
+        executeQuery(
+            String.format(
+                "source=%s | rex field=firstname"
+                    + " \\\"(?<initial>(A|B|C|D|E))[a-z]*(?<suffix>(ley|nne|ber|ton|son))\\\" |"
+                    + " fields initial, suffix | head 1",
+                TEST_INDEX_ACCOUNT));
+
+    if (!complexNested.getJSONArray("datarows").isEmpty()) {
+      String initial = complexNested.getJSONArray("datarows").getJSONArray(0).getString(0);
+      String suffix = complexNested.getJSONArray("datarows").getJSONArray(0).getString(1);
+
+      assertTrue("Initial should be a single letter A-E", initial.matches("[A-E]"));
+      assertTrue(
+          "Suffix should match alternation pattern", suffix.matches("(ley|nne|ber|ton|son)"));
+    }
+  }
 }
@@ -3,8 +3,8 @@ calcite:
     LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT])
       LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], initial=[$17])
         LogicalSort(fetch=[5])
-          LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], initial=[REX_EXTRACT($10, '(?<initial>^[A-Z])', 1)])
+          LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], initial=[REX_EXTRACT($10, '(?<initial>^[A-Z])', 'initial')])
             CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]])
   physical: |
-    EnumerableCalc(expr#0..10=[{inputs}], expr#11=['(?<initial>^[A-Z])'], expr#12=[1], expr#13=[REX_EXTRACT($t10, $t11, $t12)], proj#0..10=[{exprs}], $f11=[$t13])
+    EnumerableCalc(expr#0..10=[{inputs}], expr#11=['(?<initial>^[A-Z])'], expr#12=['initial'], expr#13=[REX_EXTRACT($t10, $t11, $t12)], proj#0..10=[{exprs}], $f11=[$t13])
       CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[PROJECT->[account_number, firstname, address, balance, gender, city, employer, state, age, email, lastname], LIMIT->5, LIMIT->10000], OpenSearchRequestBuilder(sourceBuilder={"from":0,"size":5,"timeout":"1m","_source":{"includes":["account_number","firstname","address","balance","gender","city","employer","state","age","email","lastname"],"excludes":[]}}, requestedTotalSize=5, pageSize=null, startFrom=0)])
@@ -3,10 +3,10 @@ calcite:
     LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT])
       LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], initial=[$17])
         LogicalSort(fetch=[5])
-          LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], initial=[REX_EXTRACT($10, '(?<initial>^[A-Z])', 1)])
+          LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], initial=[REX_EXTRACT($10, '(?<initial>^[A-Z])', 'initial')])
             CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]])
   physical: |
     EnumerableLimit(fetch=[10000])
-      EnumerableCalc(expr#0..16=[{inputs}], expr#17=['(?<initial>^[A-Z])'], expr#18=[1], expr#19=[REX_EXTRACT($t10, $t17, $t18)], proj#0..10=[{exprs}], initial=[$t19])
+      EnumerableCalc(expr#0..16=[{inputs}], expr#17=['(?<initial>^[A-Z])'], expr#18=['initial'], expr#19=[REX_EXTRACT($t10, $t17, $t18)], proj#0..10=[{exprs}], initial=[$t19])
         EnumerableLimit(fetch=[5])
           CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]])