Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -282,10 +282,13 @@ public RelNode visitRex(Rex node, CalcitePlanContext context) {
"Rex pattern must contain at least one named capture group");
}

// TODO: Once JDK 20+ is supported, consider using Pattern.namedGroups() API for more efficient
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For now, I've added a TODO here, @dai-chen.

// named group handling instead of manual parsing in RegexCommonUtils

List<RexNode> newFields = new ArrayList<>();
List<String> newFieldNames = new ArrayList<>();

for (int i = 0; i < namedGroups.size(); i++) {
for (String groupName : namedGroups) {
RexNode extractCall;
if (node.getMaxMatch().isPresent() && node.getMaxMatch().get() > 1) {
extractCall =
Expand All @@ -294,7 +297,7 @@ public RelNode visitRex(Rex node, CalcitePlanContext context) {
BuiltinFunctionName.REX_EXTRACT_MULTI,
fieldRex,
context.rexBuilder.makeLiteral(patternStr),
context.relBuilder.literal(i + 1),
context.rexBuilder.makeLiteral(groupName),
Copy link
Collaborator

@dai-chen dai-chen Oct 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I found that Matcher has a namedGroups() API (since JDK 20?). If we can get the correct index here, do we still need to modify the UDFs below? Alternatively, could we move the capture-name -> index logic from the UDFs to here?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That is correct - the core issue is matching named groups to their correct indices, and Pattern.namedGroups() would be the perfect solution. However, I discovered that we're blocked by a compatibility constraint:

  • Pattern.namedGroups() was introduced in JDK 20
  • We must remain backward compatible with JDK 11/17 for 2.19-dev

I agree that using Pattern.namedGroups() directly is the right architectural approach - we should definitely migrate to it once we fully upgrade to JDK 20+. At that point, it would be a simple one-line change in CalciteRelNodeVisitor.

context.relBuilder.literal(node.getMaxMatch().get()));
} else {
extractCall =
Expand All @@ -303,10 +306,10 @@ public RelNode visitRex(Rex node, CalcitePlanContext context) {
BuiltinFunctionName.REX_EXTRACT,
fieldRex,
context.rexBuilder.makeLiteral(patternStr),
context.relBuilder.literal(i + 1));
context.rexBuilder.makeLiteral(groupName));
}
newFields.add(extractCall);
newFieldNames.add(namedGroups.get(i));
newFieldNames.add(groupName);
}

if (node.getOffsetField().isPresent()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,13 @@
import org.apache.calcite.linq4j.tree.Expression;
import org.apache.calcite.linq4j.tree.Expressions;
import org.apache.calcite.rex.RexCall;
import org.apache.calcite.sql.type.CompositeOperandTypeChecker;
import org.apache.calcite.sql.type.ReturnTypes;
import org.apache.calcite.sql.type.SqlReturnTypeInference;
import org.opensearch.sql.calcite.utils.PPLOperandTypes;
import org.opensearch.sql.expression.function.ImplementorUDF;
import org.opensearch.sql.expression.function.UDFOperandMetadata;
import org.opensearch.sql.expression.parse.RegexCommonUtils;

/** Custom REX_EXTRACT function for extracting regex named capture groups. */
public final class RexExtractFunction extends ImplementorUDF {
Expand All @@ -35,7 +37,12 @@ public SqlReturnTypeInference getReturnTypeInference() {

@Override
public UDFOperandMetadata getOperandMetadata() {
return PPLOperandTypes.STRING_STRING_INTEGER;
// Support both (field, pattern, groupIndex) and (field, pattern, groupName)
return UDFOperandMetadata.wrap(
(CompositeOperandTypeChecker)
PPLOperandTypes.STRING_STRING_INTEGER
.getInnerTypeChecker()
.or(PPLOperandTypes.STRING_STRING_STRING.getInnerTypeChecker()));
}

private static class RexExtractImplementor implements NotNullImplementor {
Expand All @@ -45,19 +52,80 @@ public Expression implement(
RexToLixTranslator translator, RexCall call, List<Expression> translatedOperands) {
Expression field = translatedOperands.get(0);
Expression pattern = translatedOperands.get(1);
Expression groupIndex = translatedOperands.get(2);
Expression groupIndexOrName = translatedOperands.get(2);

return Expressions.call(RexExtractFunction.class, "extractGroup", field, pattern, groupIndex);
return Expressions.call(
RexExtractFunction.class, "extractGroup", field, pattern, groupIndexOrName);
}
}

/**
 * Returns the text captured by a positional (1-based) regex group.
 *
 * <p>The match itself is delegated to {@code executeExtraction}; this overload only supplies the
 * rule for reading the group once a match has been found.
 *
 * @param text The input text to extract from
 * @param pattern The regex pattern
 * @param groupIndex The 1-based index of the group to read
 * @return The captured value, or null when an input is null or the index is outside {@code
 *     1..groupCount}
 */
public static String extractGroup(String text, String pattern, int groupIndex) {
  if (text == null) {
    return null;
  }
  if (pattern == null) {
    return null;
  }

  return executeExtraction(
      text,
      pattern,
      found -> {
        // Indices below 1 or beyond the pattern's group count yield no value.
        boolean outOfRange = groupIndex <= 0 || groupIndex > found.groupCount();
        return outOfRange ? null : found.group(groupIndex);
      });
}

/**
 * Returns the text captured by a named regex group. Looking the group up by name sidesteps the
 * index shifting that nested unnamed groups cause for positional lookups.
 *
 * @param text The input text to extract from
 * @param pattern The regex pattern containing named capture groups
 * @param groupName The name of the capture group to read
 * @return The captured value, or null when an input is null or the group name is not defined in
 *     the pattern
 */
public static String extractGroup(String text, String pattern, String groupName) {
  final boolean missingArgument = text == null || pattern == null || groupName == null;
  if (missingArgument) {
    return null;
  }

  return executeExtraction(
      text,
      pattern,
      found -> {
        String captured;
        try {
          captured = found.group(groupName);
        } catch (IllegalArgumentException ignored) {
          // The pattern has no group with this name.
          captured = null;
        }
        return captured;
      });
}

/**
* Common extraction logic to avoid code duplication.
*
* @param text The input text
* @param pattern The regex pattern
* @param extractor Function to extract the value from the matcher
* @return The extracted value or null
*/
private static String executeExtraction(
String text, String pattern, java.util.function.Function<Matcher, String> extractor) {
try {
Pattern compiledPattern = Pattern.compile(pattern);
Pattern compiledPattern = RegexCommonUtils.getCompiledPattern(pattern);
Matcher matcher = compiledPattern.matcher(text);

if (matcher.find() && groupIndex > 0 && groupIndex <= matcher.groupCount()) {
return matcher.group(groupIndex);
if (matcher.find()) {
return extractor.apply(matcher);
}
return null;
} catch (PatternSyntaxException e) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,15 @@
import org.apache.calcite.linq4j.tree.Expression;
import org.apache.calcite.linq4j.tree.Expressions;
import org.apache.calcite.rex.RexCall;
import org.apache.calcite.sql.type.CompositeOperandTypeChecker;
import org.apache.calcite.sql.type.OperandTypes;
import org.apache.calcite.sql.type.SqlReturnTypeInference;
import org.apache.calcite.sql.type.SqlTypeFamily;
import org.apache.calcite.sql.type.SqlTypeName;
import org.opensearch.sql.calcite.utils.PPLOperandTypes;
import org.opensearch.sql.expression.function.ImplementorUDF;
import org.opensearch.sql.expression.function.UDFOperandMetadata;
import org.opensearch.sql.expression.parse.RegexCommonUtils;

/** Custom REX_EXTRACT_MULTI function for extracting multiple regex matches. */
public final class RexExtractMultiFunction extends ImplementorUDF {
Expand All @@ -40,7 +44,17 @@ public SqlReturnTypeInference getReturnTypeInference() {

@Override
public UDFOperandMetadata getOperandMetadata() {
return PPLOperandTypes.STRING_STRING_INTEGER_INTEGER;
// Support both (field, pattern, groupIndex, maxMatch) and (field, pattern, groupName, maxMatch)
return UDFOperandMetadata.wrap(
(CompositeOperandTypeChecker)
PPLOperandTypes.STRING_STRING_INTEGER_INTEGER
.getInnerTypeChecker()
.or(
OperandTypes.family(
SqlTypeFamily.CHARACTER,
SqlTypeFamily.CHARACTER,
SqlTypeFamily.CHARACTER,
SqlTypeFamily.INTEGER)));
}

private static class RexExtractMultiImplementor implements NotNullImplementor {
Expand All @@ -50,35 +64,105 @@ public Expression implement(
RexToLixTranslator translator, RexCall call, List<Expression> translatedOperands) {
Expression field = translatedOperands.get(0);
Expression pattern = translatedOperands.get(1);
Expression groupIndex = translatedOperands.get(2);
Expression groupIndexOrName = translatedOperands.get(2);
Expression maxMatch = translatedOperands.get(3);

return Expressions.call(
RexExtractMultiFunction.class,
"extractMultipleGroups",
field,
pattern,
groupIndex,
groupIndexOrName,
maxMatch);
}
}

/**
 * Collects the values of a positional (1-based) regex group across repeated matches.
 *
 * <p>Iteration over the matches is delegated to {@code executeMultipleExtractions}; this overload
 * only supplies the rule for reading the group from each match.
 *
 * @param text The input text to extract from
 * @param pattern The regex pattern
 * @param groupIndex The 1-based index of the group to read
 * @param maxMatch Maximum number of matches to return (0 = unlimited)
 * @return List of extracted values or null if no matches found
 */
public static List<String> extractMultipleGroups(
    String text, String pattern, int groupIndex, int maxMatch) {
  // Query planner already validates null inputs via NullPolicy.ARG0
  boolean hasInputs = text != null && pattern != null;
  if (!hasInputs) {
    return null;
  }

  return executeMultipleExtractions(
      text,
      pattern,
      maxMatch,
      found -> {
        // Indices below 1 or beyond the pattern's group count yield no value.
        if (groupIndex <= 0 || groupIndex > found.groupCount()) {
          return null;
        }
        return found.group(groupIndex);
      });
}

/**
 * Collects the values of a named regex group across repeated matches. Looking the group up by
 * name sidesteps the index shifting that nested unnamed groups cause for positional lookups.
 *
 * @param text The input text to extract from
 * @param pattern The regex pattern containing named capture groups
 * @param groupName The name of the capture group to read
 * @param maxMatch Maximum number of matches to return (0 = unlimited)
 * @return List of extracted values or null if no matches found
 */
public static List<String> extractMultipleGroups(
    String text, String pattern, String groupName, int maxMatch) {
  if (text == null) {
    return null;
  }
  if (pattern == null || groupName == null) {
    return null;
  }

  return executeMultipleExtractions(
      text,
      pattern,
      maxMatch,
      found -> {
        String captured;
        try {
          captured = found.group(groupName);
        } catch (IllegalArgumentException ignored) {
          // The pattern has no group with this name; report "no value" for this match.
          captured = null;
        }
        return captured;
      });
}

/**
* Common extraction logic for multiple matches to avoid code duplication.
*
* @param text The input text
* @param pattern The regex pattern
* @param maxMatch Maximum matches (0 = unlimited)
* @param extractor Function to extract the value from the matcher
* @return List of extracted values or null if no matches found
*/
private static List<String> executeMultipleExtractions(
String text,
String pattern,
int maxMatch,
java.util.function.Function<Matcher, String> extractor) {
try {
Pattern compiledPattern = Pattern.compile(pattern);
Pattern compiledPattern = RegexCommonUtils.getCompiledPattern(pattern);
Matcher matcher = compiledPattern.matcher(text);
List<String> matches = new ArrayList<>();

int matchCount = 0;
while (matcher.find() && (maxMatch == 0 || matchCount < maxMatch)) {
if (groupIndex > 0 && groupIndex <= matcher.groupCount()) {
String match = matcher.group(groupIndex);
if (match != null) {
matches.add(match);
matchCount++;
}
String match = extractor.apply(matcher);
if (match != null) {
matches.add(match);
matchCount++;
} else {
// If extractor returns null, it might indicate an error (like invalid group name)
// Stop processing to avoid infinite loop
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm currently considering adding error handling here.

break;
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -306,4 +306,53 @@ public void testRexMaxMatchConfigurableLimit() throws IOException {
new ClusterSetting(PERSISTENT, Settings.Key.PPL_REX_MAX_MATCH_LIMIT.getKeyValue(), null));
}
}

/**
 * Regression test for rex extraction when named capture groups contain nested unnamed groups.
 *
 * <p>Runs two rex queries against the account test index. The first extracts user, domain and
 * tld from the email field and asserts the exact values of the first returned row. The second
 * extracts initial and suffix from the firstname field and, only if a row is returned, checks
 * that each value matches its own alternation.
 *
 * @throws IOException declared for the query execution performed by this test
 */
@Test
public void testRexNestedCaptureGroupsBugFix() throws IOException {
// Each named group here wraps an unnamed alternation group, e.g. (?<domain>(pyrami|gmail|yahoo)).
JSONObject resultWithNested =
executeQuery(
String.format(
"source=%s | rex field=email"
+ " \\\"(?<user>[^@]+)@(?<domain>(pyrami|gmail|yahoo))\\\\\\\\.(?<tld>(com|org|net))\\\""
+ " | fields user, domain, tld | head 1",
TEST_INDEX_ACCOUNT));

// "head 1" limits the result to a single row; columns follow the "fields" order.
assertEquals(1, resultWithNested.getJSONArray("datarows").length());
assertEquals(
"amberduke",
resultWithNested
.getJSONArray("datarows")
.getJSONArray(0)
.get(0)); // user should be "amberduke"
assertEquals(
"pyrami",
resultWithNested
.getJSONArray("datarows")
.getJSONArray(0)
.get(1)); // domain should be "pyrami", NOT "amberduke"
assertEquals(
"com",
resultWithNested
.getJSONArray("datarows")
.getJSONArray(0)
.get(2)); // tld should be "com", NOT "pyrami"

// More complex nested alternation
JSONObject complexNested =
executeQuery(
String.format(
"source=%s | rex field=firstname"
+ " \\\"(?<initial>(A|B|C|D|E))[a-z]*(?<suffix>(ley|nne|ber|ton|son))\\\" |"
+ " fields initial, suffix | head 1",
TEST_INDEX_ACCOUNT));

// The assertions below are skipped when the query returns no rows.
if (!complexNested.getJSONArray("datarows").isEmpty()) {
String initial = complexNested.getJSONArray("datarows").getJSONArray(0).getString(0);
String suffix = complexNested.getJSONArray("datarows").getJSONArray(0).getString(1);

assertTrue("Initial should be a single letter A-E", initial.matches("[A-E]"));
assertTrue(
"Suffix should match alternation pattern", suffix.matches("(ley|nne|ber|ton|son)"));
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@ calcite:
LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT])
LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], initial=[$17])
LogicalSort(fetch=[5])
LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], initial=[REX_EXTRACT($10, '(?<initial>^[A-Z])', 1)])
LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], initial=[REX_EXTRACT($10, '(?<initial>^[A-Z])', 'initial')])
CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]])
physical: |
EnumerableCalc(expr#0..10=[{inputs}], expr#11=['(?<initial>^[A-Z])'], expr#12=[1], expr#13=[REX_EXTRACT($t10, $t11, $t12)], proj#0..10=[{exprs}], $f11=[$t13])
EnumerableCalc(expr#0..10=[{inputs}], expr#11=['(?<initial>^[A-Z])'], expr#12=['initial'], expr#13=[REX_EXTRACT($t10, $t11, $t12)], proj#0..10=[{exprs}], $f11=[$t13])
CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[PROJECT->[account_number, firstname, address, balance, gender, city, employer, state, age, email, lastname], LIMIT->5, LIMIT->10000], OpenSearchRequestBuilder(sourceBuilder={"from":0,"size":5,"timeout":"1m","_source":{"includes":["account_number","firstname","address","balance","gender","city","employer","state","age","email","lastname"],"excludes":[]}}, requestedTotalSize=5, pageSize=null, startFrom=0)])
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@ calcite:
LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT])
LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], initial=[$17])
LogicalSort(fetch=[5])
LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], initial=[REX_EXTRACT($10, '(?<initial>^[A-Z])', 1)])
LogicalProject(account_number=[$0], firstname=[$1], address=[$2], balance=[$3], gender=[$4], city=[$5], employer=[$6], state=[$7], age=[$8], email=[$9], lastname=[$10], _id=[$11], _index=[$12], _score=[$13], _maxscore=[$14], _sort=[$15], _routing=[$16], initial=[REX_EXTRACT($10, '(?<initial>^[A-Z])', 'initial')])
CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]])
physical: |
EnumerableLimit(fetch=[10000])
EnumerableCalc(expr#0..16=[{inputs}], expr#17=['(?<initial>^[A-Z])'], expr#18=[1], expr#19=[REX_EXTRACT($t10, $t17, $t18)], proj#0..10=[{exprs}], initial=[$t19])
EnumerableCalc(expr#0..16=[{inputs}], expr#17=['(?<initial>^[A-Z])'], expr#18=['initial'], expr#19=[REX_EXTRACT($t10, $t17, $t18)], proj#0..10=[{exprs}], initial=[$t19])
EnumerableLimit(fetch=[5])
CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]])
Loading
Loading