-
Notifications
You must be signed in to change notification settings - Fork 177
[BugFix] Fix unexpected shift of extraction for rex with nested capture groups in named groups
#4641
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[BugFix] Fix unexpected shift of extraction for rex with nested capture groups in named groups
#4641
Changes from all commits
7b5ee2c
44a969b
27487a6
82bad3c
ede5716
25629e0
bf77fe2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -282,10 +282,13 @@ public RelNode visitRex(Rex node, CalcitePlanContext context) { | |
| "Rex pattern must contain at least one named capture group"); | ||
| } | ||
|
|
||
| // TODO: Once JDK 20+ is supported, consider using Pattern.namedGroups() API for more efficient | ||
| // named group handling instead of manual parsing in RegexCommonUtils | ||
|
|
||
| List<RexNode> newFields = new ArrayList<>(); | ||
| List<String> newFieldNames = new ArrayList<>(); | ||
|
|
||
| for (int i = 0; i < namedGroups.size(); i++) { | ||
| for (String groupName : namedGroups) { | ||
| RexNode extractCall; | ||
| if (node.getMaxMatch().isPresent() && node.getMaxMatch().get() > 1) { | ||
| extractCall = | ||
|
|
@@ -294,7 +297,7 @@ public RelNode visitRex(Rex node, CalcitePlanContext context) { | |
| BuiltinFunctionName.REX_EXTRACT_MULTI, | ||
| fieldRex, | ||
| context.rexBuilder.makeLiteral(patternStr), | ||
| context.relBuilder.literal(i + 1), | ||
| context.rexBuilder.makeLiteral(groupName), | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I found there is a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That is correct - the core issue is matching named groups to their correct indices, and
I agree that directly leveraging the |
||
| context.relBuilder.literal(node.getMaxMatch().get())); | ||
| } else { | ||
| extractCall = | ||
|
|
@@ -303,10 +306,10 @@ public RelNode visitRex(Rex node, CalcitePlanContext context) { | |
| BuiltinFunctionName.REX_EXTRACT, | ||
| fieldRex, | ||
| context.rexBuilder.makeLiteral(patternStr), | ||
| context.relBuilder.literal(i + 1)); | ||
| context.rexBuilder.makeLiteral(groupName)); | ||
| } | ||
| newFields.add(extractCall); | ||
| newFieldNames.add(namedGroups.get(i)); | ||
| newFieldNames.add(groupName); | ||
| } | ||
|
|
||
| if (node.getOffsetField().isPresent()) { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -16,11 +16,15 @@ | |
| import org.apache.calcite.linq4j.tree.Expression; | ||
| import org.apache.calcite.linq4j.tree.Expressions; | ||
| import org.apache.calcite.rex.RexCall; | ||
| import org.apache.calcite.sql.type.CompositeOperandTypeChecker; | ||
| import org.apache.calcite.sql.type.OperandTypes; | ||
| import org.apache.calcite.sql.type.SqlReturnTypeInference; | ||
| import org.apache.calcite.sql.type.SqlTypeFamily; | ||
| import org.apache.calcite.sql.type.SqlTypeName; | ||
| import org.opensearch.sql.calcite.utils.PPLOperandTypes; | ||
| import org.opensearch.sql.expression.function.ImplementorUDF; | ||
| import org.opensearch.sql.expression.function.UDFOperandMetadata; | ||
| import org.opensearch.sql.expression.parse.RegexCommonUtils; | ||
|
|
||
| /** Custom REX_EXTRACT_MULTI function for extracting multiple regex matches. */ | ||
| public final class RexExtractMultiFunction extends ImplementorUDF { | ||
|
|
@@ -40,7 +44,17 @@ public SqlReturnTypeInference getReturnTypeInference() { | |
|
|
||
| @Override | ||
| public UDFOperandMetadata getOperandMetadata() { | ||
| return PPLOperandTypes.STRING_STRING_INTEGER_INTEGER; | ||
| // Support both (field, pattern, groupIndex, maxMatch) and (field, pattern, groupName, maxMatch) | ||
| return UDFOperandMetadata.wrap( | ||
| (CompositeOperandTypeChecker) | ||
| PPLOperandTypes.STRING_STRING_INTEGER_INTEGER | ||
| .getInnerTypeChecker() | ||
| .or( | ||
| OperandTypes.family( | ||
| SqlTypeFamily.CHARACTER, | ||
| SqlTypeFamily.CHARACTER, | ||
| SqlTypeFamily.CHARACTER, | ||
| SqlTypeFamily.INTEGER))); | ||
| } | ||
|
|
||
| private static class RexExtractMultiImplementor implements NotNullImplementor { | ||
|
|
@@ -50,35 +64,105 @@ public Expression implement( | |
| RexToLixTranslator translator, RexCall call, List<Expression> translatedOperands) { | ||
| Expression field = translatedOperands.get(0); | ||
| Expression pattern = translatedOperands.get(1); | ||
| Expression groupIndex = translatedOperands.get(2); | ||
| Expression groupIndexOrName = translatedOperands.get(2); | ||
| Expression maxMatch = translatedOperands.get(3); | ||
|
|
||
| return Expressions.call( | ||
| RexExtractMultiFunction.class, | ||
| "extractMultipleGroups", | ||
| field, | ||
| pattern, | ||
| groupIndex, | ||
| groupIndexOrName, | ||
| maxMatch); | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Extract multiple regex groups by index (1-based). | ||
| * | ||
| * @param text The input text to extract from | ||
| * @param pattern The regex pattern | ||
| * @param groupIndex The 1-based group index to extract | ||
| * @param maxMatch Maximum number of matches to return (0 = unlimited) | ||
| * @return List of extracted values or null if no matches found | ||
| */ | ||
| public static List<String> extractMultipleGroups( | ||
| String text, String pattern, int groupIndex, int maxMatch) { | ||
| // Query planner already validates null inputs via NullPolicy.ARG0 | ||
| if (text == null || pattern == null) { | ||
| return null; | ||
| } | ||
|
|
||
| return executeMultipleExtractions( | ||
| text, | ||
| pattern, | ||
| maxMatch, | ||
| matcher -> { | ||
| if (groupIndex > 0 && groupIndex <= matcher.groupCount()) { | ||
| return matcher.group(groupIndex); | ||
| } | ||
| return null; | ||
| }); | ||
| } | ||
|
|
||
| /** | ||
| * Extract multiple occurrences of a named capture group from text. This method avoids the index | ||
| * shifting issue that occurs with nested unnamed groups. | ||
| * | ||
| * @param text The input text to extract from | ||
| * @param pattern The regex pattern with named capture groups | ||
| * @param groupName The name of the capture group to extract | ||
| * @param maxMatch Maximum number of matches to return (0 = unlimited) | ||
| * @return List of extracted values or null if no matches found | ||
| */ | ||
| public static List<String> extractMultipleGroups( | ||
| String text, String pattern, String groupName, int maxMatch) { | ||
| if (text == null || pattern == null || groupName == null) { | ||
| return null; | ||
| } | ||
|
|
||
| return executeMultipleExtractions( | ||
| text, | ||
| pattern, | ||
| maxMatch, | ||
| matcher -> { | ||
| try { | ||
| return matcher.group(groupName); | ||
| } catch (IllegalArgumentException e) { | ||
| // Group name doesn't exist in the pattern, stop processing | ||
| return null; | ||
| } | ||
| }); | ||
| } | ||
|
|
||
| /** | ||
| * Common extraction logic for multiple matches to avoid code duplication. | ||
| * | ||
| * @param text The input text | ||
| * @param pattern The regex pattern | ||
| * @param maxMatch Maximum matches (0 = unlimited) | ||
| * @param extractor Function to extract the value from the matcher | ||
| * @return List of extracted values or null if no matches found | ||
| */ | ||
| private static List<String> executeMultipleExtractions( | ||
| String text, | ||
| String pattern, | ||
| int maxMatch, | ||
| java.util.function.Function<Matcher, String> extractor) { | ||
| try { | ||
| Pattern compiledPattern = Pattern.compile(pattern); | ||
| Pattern compiledPattern = RegexCommonUtils.getCompiledPattern(pattern); | ||
| Matcher matcher = compiledPattern.matcher(text); | ||
| List<String> matches = new ArrayList<>(); | ||
|
|
||
| int matchCount = 0; | ||
| while (matcher.find() && (maxMatch == 0 || matchCount < maxMatch)) { | ||
| if (groupIndex > 0 && groupIndex <= matcher.groupCount()) { | ||
| String match = matcher.group(groupIndex); | ||
| if (match != null) { | ||
| matches.add(match); | ||
| matchCount++; | ||
| } | ||
| String match = extractor.apply(matcher); | ||
| if (match != null) { | ||
| matches.add(match); | ||
| matchCount++; | ||
| } else { | ||
| // If extractor returns null, it might indicate an error (like invalid group name) | ||
| // Stop processing to avoid infinite loop | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm currently thinking about adding an error handling here |
||
| break; | ||
| } | ||
| } | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
as for now, I added a
TODOhere @dai-chen