diff --git a/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/StringFunctions.java b/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/StringFunctions.java index 2c1f8617cb3f..61fafd85a194 100644 --- a/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/StringFunctions.java +++ b/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/StringFunctions.java @@ -18,19 +18,14 @@ */ package org.apache.pinot.common.function.scalar; -import it.unimi.dsi.fastutil.objects.ObjectLinkedOpenHashSet; -import it.unimi.dsi.fastutil.objects.ObjectSet; import java.io.UnsupportedEncodingException; import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; import java.text.Normalizer; import java.util.Base64; import java.util.UUID; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import javax.annotation.Nullable; import org.apache.commons.lang3.StringUtils; -import org.apache.pinot.common.utils.RegexpPatternConverterUtils; import org.apache.pinot.common.utils.URIUtils; import org.apache.pinot.spi.annotations.ScalarFunction; import org.apache.pinot.spi.utils.JsonUtils; @@ -48,9 +43,6 @@ public class StringFunctions { private StringFunctions() { } - private final static Pattern LTRIM = Pattern.compile("^\\s+"); - private final static Pattern RTRIM = Pattern.compile("\\s+$"); - /** * @see StringUtils#reverse(String) * @param input @@ -136,22 +128,6 @@ public static String substring(String input, int beginIndex, int length) { return StringUtils.substring(input, beginIndex, endIndex); } - /** - * Joins two input strings with separator in between. - */ - @ScalarFunction - public static String concatWS(String separator, String input1, String input2) { - return input1 + separator + input2; - } - - /** - * Joins two input strings with separator in between. 
- */ - @ScalarFunction - public static String concat(String input1, String input2, String separator) { - return input1 + separator + input2; - } - /** * Joins two input strings with no separator in between. */ @@ -208,24 +184,6 @@ public static String trim(String end, String characters, String value) { } } - /** - * @param input - * @return trim spaces from left side of the string - */ - @ScalarFunction - public static String ltrim(String input) { - return LTRIM.matcher(input).replaceAll(""); - } - - /** - * @param input - * @return trim spaces from right side of the string - */ - @ScalarFunction - public static String rtrim(String input) { - return RTRIM.matcher(input).replaceAll(""); - } - /** * @see StringUtils#left(String, int) * @param input @@ -246,48 +204,6 @@ public static String rightSubStr(String input, int length) { return StringUtils.right(input, length); } - /** - * @see #StringFunctions#regexpExtract(String, String, int, String) - * @param value - * @param regexp - * @return the matched result. - */ - @ScalarFunction - public static String regexpExtract(String value, String regexp) { - return regexpExtract(value, regexp, 0, ""); - } - - /** - * @see #StringFunctions#regexpExtract(String, String, int, String) - * @param value - * @param regexp - * @param group - * @return the matched result. - */ - @ScalarFunction - public static String regexpExtract(String value, String regexp, int group) { - return regexpExtract(value, regexp, group, ""); - } - - /** - * Regular expression that extract first matched substring. - * @param value input value - * @param regexp regular expression - * @param group the group number within the regular expression to extract. 
- * @param defaultValue the default value if no match found - * @return the matched result - */ - @ScalarFunction - public static String regexpExtract(String value, String regexp, int group, String defaultValue) { - Pattern p = Pattern.compile(regexp); - Matcher matcher = p.matcher(value); - if (matcher.find() && matcher.groupCount() >= group) { - return matcher.group(group); - } else { - return defaultValue; - } - } - /** * @see String#length() * @param input @@ -370,17 +286,6 @@ public static boolean endsWith(String input, String suffix) { return StringUtils.endsWith(input, suffix); } - /** - * @see String#replaceAll(String, String) - * @param input - * @param find target substring to replace - * @param substitute new substring to be replaced with target - */ - @ScalarFunction - public static String replace(String input, String find, String substitute) { - return StringUtils.replace(input, find, substitute); - } - /** * @see StringUtils#rightPad(String, int, char) * @param input @@ -637,43 +542,6 @@ public static String[] suffixesWithSuffix(String input, int maxlength, @Nullable return suffixArr; } - /** - * @param input an input string for ngram generations. - * @param length the max length of the ngram for the string. - * @return generate an array of unique ngram of the string that length are exactly matching the specified length. - */ - @ScalarFunction - public static String[] uniqueNgrams(String input, int length) { - if (length == 0 || length > input.length()) { - return new String[0]; - } - ObjectSet ngramSet = new ObjectLinkedOpenHashSet<>(); - for (int i = 0; i < input.length() - length + 1; i++) { - ngramSet.add(input.substring(i, i + length)); - } - return ngramSet.toArray(new String[0]); - } - - /** - * @param input an input string for ngram generations. - * @param minGram the min length of the ngram for the string. - * @param maxGram the max length of the ngram for the string. 
- * @return generate an array of ngram of the string that length are within the specified range [minGram, maxGram]. - */ - @ScalarFunction - public static String[] uniqueNgrams(String input, int minGram, int maxGram) { - ObjectSet ngramSet = new ObjectLinkedOpenHashSet<>(); - for (int n = minGram; n <= maxGram && n <= input.length(); n++) { - if (n == 0) { - continue; - } - for (int i = 0; i < input.length() - n + 1; i++) { - ngramSet.add(input.substring(i, i + n)); - } - } - return ngramSet.toArray(new String[0]); - } - /** * TODO: Revisit if index should be one-based (both Presto and Postgres use one-based index, which starts with 1) * @param input @@ -824,125 +692,6 @@ public static byte[] fromBase64(String input) { return Base64.getDecoder().decode(input); } - /** - * Replace a regular expression pattern. If matchStr is not found, inputStr will be returned. By default, all - * occurences of match pattern in the input string will be replaced. Default matching pattern is case sensitive. - * - * @param inputStr Input string to apply the regexpReplace - * @param matchStr Regexp or string to match against inputStr - * @param replaceStr Regexp or string to replace if matchStr is found - * @param matchStartPos Index of inputStr from where matching should start. Default is 0. - * @param occurence Controls which occurence of the matched pattern must be replaced. Counting starts at 0. Default - * is -1 - * @param flag Single character flag that controls how the regex finds matches in inputStr. If an incorrect flag is - * specified, the function applies default case sensitive match. Only one flag can be specified. Supported - * flags: - * i -> Case insensitive - * @return replaced input string - */ - @ScalarFunction - public static String regexpReplace(String inputStr, String matchStr, String replaceStr, int matchStartPos, - int occurence, String flag) { - Integer patternFlag; - - // TODO: Support more flags like MULTILINE, COMMENTS, etc. 
- switch (flag) { - case "i": - patternFlag = Pattern.CASE_INSENSITIVE; - break; - default: - patternFlag = null; - break; - } - - Pattern p; - if (patternFlag != null) { - p = Pattern.compile(matchStr, patternFlag); - } else { - p = Pattern.compile(matchStr); - } - - Matcher matcher = p.matcher(inputStr).region(matchStartPos, inputStr.length()); - StringBuffer sb; - - if (occurence >= 0) { - sb = new StringBuffer(inputStr); - while (occurence >= 0 && matcher.find()) { - if (occurence == 0) { - sb.replace(matcher.start(), matcher.end(), replaceStr); - break; - } - occurence--; - } - } else { - sb = new StringBuffer(); - while (matcher.find()) { - matcher.appendReplacement(sb, replaceStr); - } - matcher.appendTail(sb); - } - - return sb.toString(); - } - - /** - * See #regexpReplace(String, String, String, int, int, String). Matches against entire inputStr and replaces all - * occurences. Match is performed in case-sensitive mode. - * - * @param inputStr Input string to apply the regexpReplace - * @param matchStr Regexp or string to match against inputStr - * @param replaceStr Regexp or string to replace if matchStr is found - * @return replaced input string - */ - @ScalarFunction - public static String regexpReplace(String inputStr, String matchStr, String replaceStr) { - return regexpReplace(inputStr, matchStr, replaceStr, 0, -1, ""); - } - - /** - * See #regexpReplace(String, String, String, int, int, String). Matches against entire inputStr and replaces all - * occurences. Match is performed in case-sensitive mode. - * - * @param inputStr Input string to apply the regexpReplace - * @param matchStr Regexp or string to match against inputStr - * @param replaceStr Regexp or string to replace if matchStr is found - * @param matchStartPos Index of inputStr from where matching should start. Default is 0. 
- * @return replaced input string - */ - @ScalarFunction - public static String regexpReplace(String inputStr, String matchStr, String replaceStr, int matchStartPos) { - return regexpReplace(inputStr, matchStr, replaceStr, matchStartPos, -1, ""); - } - - /** - * See #regexpReplace(String, String, String, int, int, String). Match is performed in case-sensitive mode. - * - * @param inputStr Input string to apply the regexpReplace - * @param matchStr Regexp or string to match against inputStr - * @param replaceStr Regexp or string to replace if matchStr is found - * @param matchStartPos Index of inputStr from where matching should start. Default is 0. - * @param occurence Controls which occurence of the matched pattern must be replaced. Counting starts - * at 0. Default is -1 - * @return replaced input string - */ - @ScalarFunction - public static String regexpReplace(String inputStr, String matchStr, String replaceStr, int matchStartPos, - int occurence) { - return regexpReplace(inputStr, matchStr, replaceStr, matchStartPos, occurence, ""); - } - - @ScalarFunction - public static boolean regexpLike(String inputStr, String regexPatternStr) { - Pattern pattern = Pattern.compile(regexPatternStr, Pattern.UNICODE_CASE | Pattern.CASE_INSENSITIVE); - return pattern.matcher(inputStr).find(); - } - - @ScalarFunction - public static boolean like(String inputStr, String likePatternStr) { - String regexPatternStr = RegexpPatternConverterUtils.likeToRegexpLike(likePatternStr); - return regexpLike(inputStr, regexPatternStr); - } - /** * Checks whether the input string can be parsed into a json node or not. Useful for scenarios where we want * to filter out malformed json. 
diff --git a/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/regexp/RegexpExtractConstFunctions.java b/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/regexp/RegexpExtractConstFunctions.java new file mode 100644 index 000000000000..92e808b77e43 --- /dev/null +++ b/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/regexp/RegexpExtractConstFunctions.java @@ -0,0 +1,77 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.common.function.scalar.regexp; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import org.apache.pinot.spi.annotations.ScalarFunction; + + +/** + * Optimized implementation of regexp_extract that assumes pattern is constant. + */ +public class RegexpExtractConstFunctions { + + private Matcher _matcher; + + /** + * @param value + * @param regexp + * @return the matched result. + */ + @ScalarFunction + public String regexpExtract(String value, String regexp) { + return regexpExtract(value, regexp, 0, ""); + } + + /** + * @param value + * @param regexp + * @param group + * @return the matched result. 
+ */ + @ScalarFunction + public String regexpExtract(String value, String regexp, int group) { + return regexpExtract(value, regexp, group, ""); + } + + /** + * Regular expression that extract first matched substring. + * + * @param value input value + * @param regexp regular expression + * @param group the group number within the regular expression to extract. + * @param defaultValue the default value if no match found + * @return the matched result + */ + @ScalarFunction + public String regexpExtract(String value, String regexp, int group, String defaultValue) { + if (_matcher == null) { + Pattern p = Pattern.compile(regexp); + _matcher = p.matcher(""); + } + + _matcher.reset(value); + if (_matcher.find() && _matcher.groupCount() >= group) { + return _matcher.group(group); + } else { + return defaultValue; + } + } +} diff --git a/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/regexp/RegexpExtractVarFunctions.java b/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/regexp/RegexpExtractVarFunctions.java new file mode 100644 index 000000000000..f4ffd6fd2a53 --- /dev/null +++ b/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/regexp/RegexpExtractVarFunctions.java @@ -0,0 +1,74 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.common.function.scalar.regexp; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import org.apache.pinot.spi.annotations.ScalarFunction; + + +/** + * Implementation of regexp_extract that assumes pattern is not constant. + */ +public class RegexpExtractVarFunctions { + private RegexpExtractVarFunctions() { + } + + /** + * @see #RegexpExtractVarFunctions#regexpExtractVar(String, String, int, String) + * @param value + * @param regexp + * @return the matched result. + */ + @ScalarFunction + public static String regexpExtractVar(String value, String regexp) { + return regexpExtractVar(value, regexp, 0, ""); + } + + /** + * @see #RegexpExtractVarFunctions#regexpExtractVar(String, String, int, String) + * @param value + * @param regexp + * @param group + * @return the matched result. + */ + @ScalarFunction + public static String regexpExtractVar(String value, String regexp, int group) { + return regexpExtractVar(value, regexp, group, ""); + } + + /** + * Regular expression that extract first matched substring. + * @param value input value + * @param regexp regular expression + * @param group the group number within the regular expression to extract. 
+ * @param defaultValue the default value if no match found + * @return the matched result + */ + @ScalarFunction + public static String regexpExtractVar(String value, String regexp, int group, String defaultValue) { + Pattern p = Pattern.compile(regexp); + Matcher matcher = p.matcher(value); + if (matcher.find() && matcher.groupCount() >= group) { + return matcher.group(group); + } else { + return defaultValue; + } + } +} diff --git a/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/regexp/RegexpLikeConstFunctions.java b/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/regexp/RegexpLikeConstFunctions.java new file mode 100644 index 000000000000..6224a56d6f2c --- /dev/null +++ b/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/regexp/RegexpLikeConstFunctions.java @@ -0,0 +1,52 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.pinot.common.function.scalar.regexp; + +import org.apache.pinot.common.utils.RegexpPatternConverterUtils; +import org.apache.pinot.common.utils.regex.Matcher; +import org.apache.pinot.common.utils.regex.PatternFactory; +import org.apache.pinot.spi.annotations.ScalarFunction; + + +/** + * Optimized regexp_like implementation that assumes that pattern is constant. + */ +public class RegexpLikeConstFunctions { + + private Matcher _matcher; + + @ScalarFunction + public boolean regexpLike(String inputStr, String regexPatternStr) { + if (_matcher == null) { + _matcher = PatternFactory.compile(regexPatternStr).matcher(""); + } + + return _matcher.reset(inputStr).find(); + } + + @ScalarFunction + public boolean like(String inputStr, String likePatternStr) { + if (_matcher == null) { + String regexPatternStr = RegexpPatternConverterUtils.likeToRegexpLike(likePatternStr); + _matcher = PatternFactory.compile(regexPatternStr).matcher(""); + } + + return _matcher.reset(inputStr).find(); + } +} diff --git a/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/regexp/RegexpLikeVarFunctions.java b/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/regexp/RegexpLikeVarFunctions.java new file mode 100644 index 000000000000..41ef92938e79 --- /dev/null +++ b/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/regexp/RegexpLikeVarFunctions.java @@ -0,0 +1,45 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.common.function.scalar.regexp; + +import java.util.regex.Pattern; +import org.apache.pinot.common.utils.RegexpPatternConverterUtils; +import org.apache.pinot.spi.annotations.ScalarFunction; + + +/** + * Optimized regexp_like implementation that accepts variable pattern that needs compiling on each call. + */ +public class RegexpLikeVarFunctions { + + private RegexpLikeVarFunctions() { + } + + @ScalarFunction + public static boolean regexpLikeVar(String inputStr, String regexPatternStr) { + Pattern pattern = Pattern.compile(regexPatternStr, Pattern.UNICODE_CASE | Pattern.CASE_INSENSITIVE); + return pattern.matcher(inputStr).find(); + } + + @ScalarFunction + public static boolean likeVar(String inputStr, String likePatternStr) { + String regexPatternStr = RegexpPatternConverterUtils.likeToRegexpLike(likePatternStr); + return regexpLikeVar(inputStr, regexPatternStr); + } +} diff --git a/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/regexp/RegexpReplaceConstFunctions.java b/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/regexp/RegexpReplaceConstFunctions.java new file mode 100644 index 000000000000..0a740498575e --- /dev/null +++ b/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/regexp/RegexpReplaceConstFunctions.java @@ -0,0 +1,130 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.common.function.scalar.regexp; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import org.apache.pinot.spi.annotations.ScalarFunction; + + +/** + * Optimized regexp_replace implementations that assume pattern is constant. + */ +public class RegexpReplaceConstFunctions { + + private final StringBuilder _buffer = new StringBuilder(); + private Pattern _pattern; + private Matcher _matcher; + + /** + * Replace a regular expression pattern. If matchStr is not found, inputStr will be returned. By default, all + * occurrences of match pattern in the input string will be replaced. Default matching pattern is case sensitive. + * + * @param inputStr Input string to apply the regexpReplace + * @param matchStr Regexp or string to match against inputStr + * @param replaceStr Regexp or string to replace if matchStr is found + * @param matchStartPos Index of inputStr from where matching should start. Default is 0. + * @param occurrence Controls which occurrence of the matched pattern must be replaced. Counting starts at 0. + * Default is -1 + * @param flag Single character flag that controls how the regex finds matches in inputStr. If an + * incorrect flag + * is specified, the function applies default case-sensitive match. Only one flag can be + * specified. 
+ * Supported flags: + * i -> Case insensitive + * @return replaced input string + */ + @ScalarFunction + public String regexpReplace(String inputStr, String matchStr, String replaceStr, int matchStartPos, + int occurrence, String flag) { + if (_pattern == null) { + int patternFlag = "i".equals(flag) ? Pattern.CASE_INSENSITIVE : 0; + _pattern = Pattern.compile(matchStr, patternFlag); + _matcher = _pattern.matcher(""); + } + + _matcher.reset(inputStr).region(matchStartPos, inputStr.length()); + + if (occurrence >= 0) { + _buffer.setLength(0); + _buffer.append(inputStr); + while (occurrence >= 0 && _matcher.find()) { + if (occurrence == 0) { + _buffer.replace(_matcher.start(), _matcher.end(), replaceStr); + break; + } + occurrence--; + } + } else { + _buffer.setLength(0); + while (_matcher.find()) { + _matcher.appendReplacement(_buffer, replaceStr); + } + _matcher.appendTail(_buffer); + } + + return _buffer.toString(); + } + + /** + * See #regexpReplace(String, String, String, int, int, String). Matches against entire inputStr and replaces all + * occurrences. Match is performed in case-sensitive mode. + * + * @param inputStr Input string to apply the regexpReplace + * @param matchStr Regexp or string to match against inputStr + * @param replaceStr Regexp or string to replace if matchStr is found + * @return replaced input string + */ + @ScalarFunction + public String regexpReplace(String inputStr, String matchStr, String replaceStr) { + return regexpReplace(inputStr, matchStr, replaceStr, 0, -1, ""); + } + + /** + * See #regexpReplace(String, String, String, int, int, String). Matches against entire inputStr and replaces all + * occurences. Match is performed in case-sensitive mode. + * + * @param inputStr Input string to apply the regexpReplace + * @param matchStr Regexp or string to match against inputStr + * @param replaceStr Regexp or string to replace if matchStr is found + * @param matchStartPos Index of inputStr from where matching should start. Default is 0. 
+ * @return replaced input string + */ + @ScalarFunction + public String regexpReplace(String inputStr, String matchStr, String replaceStr, int matchStartPos) { + return regexpReplace(inputStr, matchStr, replaceStr, matchStartPos, -1, ""); + } + + /** + * See #regexpReplace(String, String, String, int, int, String). Match is performed in case-sensitive mode. + * + * @param inputStr Input string to apply the regexpReplace + * @param matchStr Regexp or string to match against inputStr + * @param replaceStr Regexp or string to replace if matchStr is found + * @param matchStartPos Index of inputStr from where matching should start. Default is 0. + * @param occurrence Controls which occurrence of the matched pattern must be replaced. Counting starts + * at 0. Default is -1 + * @return replaced input string + */ + @ScalarFunction + public String regexpReplace(String inputStr, String matchStr, String replaceStr, int matchStartPos, + int occurrence) { + return regexpReplace(inputStr, matchStr, replaceStr, matchStartPos, occurrence, ""); + } +} diff --git a/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/regexp/RegexpReplaceVarFunctions.java b/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/regexp/RegexpReplaceVarFunctions.java new file mode 100644 index 000000000000..ca32921988e2 --- /dev/null +++ b/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/regexp/RegexpReplaceVarFunctions.java @@ -0,0 +1,125 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.common.function.scalar.regexp; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import org.apache.pinot.spi.annotations.ScalarFunction; + + +/** + * Optimized regexp_like implementation that assumes that pattern is not constant . + */ +public class RegexpReplaceVarFunctions { + + private final StringBuilder _buffer = new StringBuilder(); + + /** + * Replace a regular expression pattern. If matchStr is not found, inputStr will be returned. By default, all + * occurences of match pattern in the input string will be replaced. Default matching pattern is case sensitive. + * + * @param inputStr Input string to apply the regexpReplace + * @param matchStr Regexp or string to match against inputStr + * @param replaceStr Regexp or string to replace if matchStr is found + * @param matchStartPos Index of inputStr from where matching should start. Default is 0. + * @param occurence Controls which occurence of the matched pattern must be replaced. Counting starts at 0. + * Default + * is -1 + * @param flag Single character flag that controls how the regex finds matches in inputStr. If an + * incorrect flag is + * specified, the function applies default case sensitive match. Only one flag can be + * specified. Supported + * flags: + * i -> Case insensitive + * @return replaced input string + */ + @ScalarFunction + public String regexpReplaceVar(String inputStr, String matchStr, String replaceStr, int matchStartPos, + int occurence, String flag) { + int patternFlag = "i".equals(flag) ? 
Pattern.CASE_INSENSITIVE : 0; + Pattern p = Pattern.compile(matchStr, patternFlag); + Matcher matcher = p.matcher(inputStr).region(matchStartPos, inputStr.length()); + + if (occurence >= 0) { + _buffer.setLength(0); + _buffer.append(inputStr); + while (occurence >= 0 && matcher.find()) { + if (occurence == 0) { + _buffer.replace(matcher.start(), matcher.end(), replaceStr); + break; + } + occurence--; + } + } else { + _buffer.setLength(0); + while (matcher.find()) { + matcher.appendReplacement(_buffer, replaceStr); + } + matcher.appendTail(_buffer); + } + + return _buffer.toString(); + } + + /** + * See #regexpReplace(String, String, String, int, int, String). Matches against entire inputStr and replaces all + * occurences. Match is performed in case-sensitive mode. + * + * @param inputStr Input string to apply the regexpReplace + * @param matchStr Regexp or string to match against inputStr + * @param replaceStr Regexp or string to replace if matchStr is found + * @return replaced input string + */ + @ScalarFunction + public String regexpReplaceVar(String inputStr, String matchStr, String replaceStr) { + return regexpReplaceVar(inputStr, matchStr, replaceStr, 0, -1, ""); + } + + /** + * See #regexpReplace(String, String, String, int, int, String). Matches against entire inputStr and replaces all + * occurences. Match is performed in case-sensitive mode. + * + * @param inputStr Input string to apply the regexpReplace + * @param matchStr Regexp or string to match against inputStr + * @param replaceStr Regexp or string to replace if matchStr is found + * @param matchStartPos Index of inputStr from where matching should start. Default is 0. + * @return replaced input string + */ + @ScalarFunction + public String regexpReplaceVar(String inputStr, String matchStr, String replaceStr, int matchStartPos) { + return regexpReplaceVar(inputStr, matchStr, replaceStr, matchStartPos, -1, ""); + } + + /** + * See #regexpReplace(String, String, String, int, int, String). 
Match is performed in case-sensitive mode. + * + * @param inputStr Input string to apply the regexpReplace + * @param matchStr Regexp or string to match against inputStr + * @param replaceStr Regexp or string to replace if matchStr is found + * @param matchStartPos Index of inputStr from where matching should start. Default is 0. + * @param occurence Controls which occurence of the matched pattern must be replaced. Counting starts + * at 0. Default is -1 + * @return replaced input string + */ + @ScalarFunction + public String regexpReplaceVar(String inputStr, String matchStr, String replaceStr, int matchStartPos, + int occurence) { + return regexpReplaceVar(inputStr, matchStr, replaceStr, matchStartPos, occurence, ""); + } +} diff --git a/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/string/LTrimFunction.java b/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/string/LTrimFunction.java new file mode 100644 index 000000000000..bf8c29336582 --- /dev/null +++ b/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/string/LTrimFunction.java @@ -0,0 +1,39 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.pinot.common.function.scalar.string; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import org.apache.pinot.spi.annotations.ScalarFunction; + + +public class LTrimFunction { + + private final static Pattern LTRIM = Pattern.compile("^\\s+"); + private final Matcher _matcher = LTRIM.matcher(""); + + /** + * @param input string to trim; the per-instance matcher is reset onto it on each call + * @return input with its leading whitespace (regex {@code ^\s+}) removed + */ + @ScalarFunction + public String ltrim(String input) { + return _matcher.reset(input).replaceAll(""); + } +} diff --git a/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/string/NgramFunctions.java b/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/string/NgramFunctions.java new file mode 100644 index 000000000000..b2dc67c7c6e8 --- /dev/null +++ b/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/string/NgramFunctions.java @@ -0,0 +1,68 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.pinot.common.function.scalar.string; + +import it.unimi.dsi.fastutil.objects.ObjectLinkedOpenHashSet; +import it.unimi.dsi.fastutil.objects.ObjectSet; +import org.apache.pinot.spi.annotations.ScalarFunction; + + +public class NgramFunctions { + + private final ObjectSet<String> _ngramSet = new ObjectLinkedOpenHashSet<>(); + + /** + * @param input an input string for ngram generation. + * @param length the exact ngram length; if non-positive or greater than the input length, an empty array is returned. + * @return the unique ngrams of exactly the specified length, in order of first occurrence. + */ + @ScalarFunction + public String[] uniqueNgrams(String input, int length) { + if (length <= 0 || length > input.length()) { + return new String[0]; + } + + _ngramSet.clear(); + for (int i = 0; i < input.length() - length + 1; i++) { + _ngramSet.add(input.substring(i, i + length)); + } + return _ngramSet.toArray(new String[0]); + } + + /** + * @param input an input string for ngram generation. + * @param minGram the min length of the ngram for the string; non-positive lengths are skipped. + * @param maxGram the max length of the ngram for the string; lengths beyond the input length are skipped. + * @return the unique ngrams whose length is within [minGram, maxGram], ordered by length, then first occurrence. 
+ */ + @ScalarFunction + public String[] uniqueNgrams(String input, int minGram, int maxGram) { + // Reuse the instance-level set; toArray() below copies, so the returned array is independent of it. + _ngramSet.clear(); + for (int n = minGram; n <= maxGram && n <= input.length(); n++) { + if (n <= 0) { + continue; + } + for (int i = 0; i < input.length() - n + 1; i++) { + _ngramSet.add(input.substring(i, i + n)); + } + } + return _ngramSet.toArray(new String[0]); + } +} diff --git a/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/string/RTrimFunction.java b/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/string/RTrimFunction.java new file mode 100644 index 000000000000..eb1b019eaaa9 --- /dev/null +++ b/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/string/RTrimFunction.java @@ -0,0 +1,39 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.pinot.common.function.scalar.string; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import org.apache.pinot.spi.annotations.ScalarFunction; + + +public class RTrimFunction { + + private final static Pattern RTRIM = Pattern.compile("\\s+$"); + private final Matcher _matcher = RTRIM.matcher(""); + + /** + * @param input string to trim; the per-instance matcher is reset onto it on each call + * @return input with its trailing whitespace (regex {@code \s+$}) removed + */ + @ScalarFunction + public String rtrim(String input) { + return _matcher.reset(input).replaceAll(""); + } +} diff --git a/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/string/StringFunctions.java b/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/string/StringFunctions.java new file mode 100644 index 000000000000..75bad424f113 --- /dev/null +++ b/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/string/StringFunctions.java @@ -0,0 +1,88 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.pinot.common.function.scalar.string; + +import org.apache.commons.lang3.StringUtils; +import org.apache.pinot.spi.annotations.ScalarFunction; + + +/** + * Non-static scalar functions that build their result in a single StringBuilder reused by each instance. + */ +public class StringFunctions { + + private final StringBuilder _buffer = new StringBuilder(); + + /** + * Joins input1 and input2 with separator in between; the shared buffer is cleared before each call. + */ + @ScalarFunction + public String concat(String input1, String input2, String separator) { + _buffer.setLength(0); + return _buffer.append(input1) + .append(separator) + .append(input2) + .toString(); + } + + /** + * Joins input1 and input2 with separator in between; same result as concat, with the separator passed first. + */ + @ScalarFunction + public String concatWS(String separator, String input1, String input2) { + _buffer.setLength(0); + return _buffer.append(input1) + .append(separator) + .append(input2) + .toString(); + } + + /** + * @param input string to search; returned as-is if it or searchString is empty/null, or substitute is null + * @param searchString literal (non-regex) substring to replace + * @param substitute literal text inserted in place of every occurrence of searchString + * @see String#replace(CharSequence, CharSequence) + */ + @ScalarFunction + public String replace(String input, String searchString, String substitute) { + if (StringUtils.isEmpty(input) || StringUtils.isEmpty(searchString) || substitute == null) { + return input; + } + int start = 0; + int end = StringUtils.indexOf(input, searchString, start); + if (end == StringUtils.INDEX_NOT_FOUND) { + return input; + } + final int replLength = searchString.length(); + int increase = Math.max(substitute.length() - replLength, 0) * 16; + _buffer.setLength(0); + _buffer.ensureCapacity(input.length() + increase); + int max = -1; /* negative: the --max == 0 check below never fires, so all occurrences are replaced */ + while (end != StringUtils.INDEX_NOT_FOUND) { + _buffer.append(input, start, end).append(substitute); + start = end + replLength; + if (--max == 0) { + break; + } + end = StringUtils.indexOf(input, searchString, start); + } + _buffer.append(input, start, input.length()); + return _buffer.toString(); + } +} +diff --git 
a/pinot-common/src/test/java/org/apache/pinot/common/function/scalar/StringFunctionsTest.java b/pinot-common/src/test/java/org/apache/pinot/common/function/scalar/StringFunctionsTest.java index 3a61e414dff2..68c71e4afbf1 100644 --- a/pinot-common/src/test/java/org/apache/pinot/common/function/scalar/StringFunctionsTest.java +++ b/pinot-common/src/test/java/org/apache/pinot/common/function/scalar/StringFunctionsTest.java @@ -18,6 +18,7 @@ */ package org.apache.pinot.common.function.scalar; +import org.apache.pinot.common.function.scalar.string.NgramFunctions; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -135,8 +136,8 @@ public void testPrefixAndSuffix(String input, int length, String[] expectedPrefi @Test(dataProvider = "ngramTestCases") public void testNGram(String input, int minGram, int maxGram, String[] expectedExactNGram, String[] expectedNGram) { - assertEquals(StringFunctions.uniqueNgrams(input, maxGram), expectedExactNGram); - assertEquals(StringFunctions.uniqueNgrams(input, minGram, maxGram), expectedNGram); + assertEquals(new NgramFunctions().uniqueNgrams(input, maxGram), expectedExactNGram); + assertEquals(new NgramFunctions().uniqueNgrams(input, minGram, maxGram), expectedNGram); } @Test diff --git a/pinot-common/src/test/java/org/apache/pinot/common/function/scalar/regexp/RegexpExtractConstFunctionsTest.java b/pinot-common/src/test/java/org/apache/pinot/common/function/scalar/regexp/RegexpExtractConstFunctionsTest.java new file mode 100644 index 000000000000..8924146ba479 --- /dev/null +++ b/pinot-common/src/test/java/org/apache/pinot/common/function/scalar/regexp/RegexpExtractConstFunctionsTest.java @@ -0,0 +1,48 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.common.function.scalar.regexp; + +import org.testng.annotations.Test; + +import static org.testng.Assert.assertEquals; + + +public class RegexpExtractConstFunctionsTest { + + @Test + public void test() { + RegexpExtractConstFunctions f = new RegexpExtractConstFunctions(); + + assertEquals(f.regexpExtract("val abe eee", "(a[bcd]e)"), "abe"); + assertEquals(f.regexpExtract("val ade eee", "(a[bcd]e)"), "ade"); + assertEquals(f.regexpExtract("val age eee", "(a[bcd]e)"), ""); + // f caches first pattern and ignores second argument + assertEquals(f.regexpExtract("val abe ace", "(a[bcd]e) (a[bcd]e)", 2), ""); + + f = new RegexpExtractConstFunctions(); + assertEquals(f.regexpExtract("val abe ace", "(a[bcd]e) (a[bcd]e)", 2), "ace"); + + f = new RegexpExtractConstFunctions(); + assertEquals(f.regexpExtract("abe ace ade", "(a[bcd]e) (a[bcd]e) (a[bcd]e)", 3), "ade"); + + f = new RegexpExtractConstFunctions(); + assertEquals(f.regexpExtract("abe ace ade", "(a[bcd]e)", 5, "wrong"), "wrong"); + assertEquals(f.regexpExtract("aa bb cc", "(a[bcd]e)", 1, "wrong"), "wrong"); + } +} diff --git a/pinot-common/src/test/java/org/apache/pinot/common/function/scalar/regexp/RegexpExtractVarFunctionsTest.java b/pinot-common/src/test/java/org/apache/pinot/common/function/scalar/regexp/RegexpExtractVarFunctionsTest.java new file mode 100644 index 000000000000..b7124f2d87b0 --- 
/dev/null +++ b/pinot-common/src/test/java/org/apache/pinot/common/function/scalar/regexp/RegexpExtractVarFunctionsTest.java @@ -0,0 +1,41 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.common.function.scalar.regexp; + +import org.testng.annotations.Test; + +import static org.apache.pinot.common.function.scalar.regexp.RegexpExtractVarFunctions.regexpExtractVar; +import static org.testng.Assert.assertEquals; + + +public class RegexpExtractVarFunctionsTest { + + @Test + public void test() { + assertEquals(RegexpExtractVarFunctions.regexpExtractVar("val abe eee", "(a[bcd]e)"), "abe"); + assertEquals(RegexpExtractVarFunctions.regexpExtractVar("val ade eee", "(a[bcd]e)"), "ade"); + assertEquals(RegexpExtractVarFunctions.regexpExtractVar("val age eee", "(a[bcd]e)"), ""); + + assertEquals(RegexpExtractVarFunctions.regexpExtractVar("val abe ace", "(a[bcd]e) (a[bcd]e)", 2), "ace"); + assertEquals(RegexpExtractVarFunctions.regexpExtractVar("abe ace ade", "(a[bcd]e) (a[bcd]e) (a[bcd]e)", 3), "ade"); + + assertEquals(regexpExtractVar("abe ace ade", "(a[bcd]e)", 5, "wrong"), "wrong"); + assertEquals(regexpExtractVar("aa bb cc", "(a[bcd]e)", 1, "wrong"), "wrong"); + } +} diff --git 
a/pinot-common/src/test/java/org/apache/pinot/common/function/scalar/regexp/RegexpLikeConstFunctionsTest.java b/pinot-common/src/test/java/org/apache/pinot/common/function/scalar/regexp/RegexpLikeConstFunctionsTest.java new file mode 100644 index 000000000000..a201a99ba9e0 --- /dev/null +++ b/pinot-common/src/test/java/org/apache/pinot/common/function/scalar/regexp/RegexpLikeConstFunctionsTest.java @@ -0,0 +1,62 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.pinot.common.function.scalar.regexp; + +import org.testng.annotations.Test; + +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertTrue; + + +public class RegexpLikeConstFunctionsTest { + + @Test + public void testLike() { + RegexpLikeConstFunctions f = new RegexpLikeConstFunctions(); + + assertTrue(f.like("ab", "%ab%")); + assertTrue(f.like("aaba", "%ab%")); + assertTrue(f.like("$ab$", "%ab%")); + + assertFalse(f.like("", "%ab%")); + assertFalse(f.like("_", "%ab%")); + assertFalse(f.like("a", "%ab%")); + assertFalse(f.like("b", "%ab%")); + + //returns true because function matches against first pattern + assertTrue(f.like("aab", "abb")); + } + + @Test + public void testRegexpLike() { + RegexpLikeConstFunctions f = new RegexpLikeConstFunctions(); + + assertTrue(f.regexpLike("ab", ".*ab.*")); + assertTrue(f.regexpLike("aaba", ".*ab.*")); + assertTrue(f.regexpLike("$ab$", ".*ab.*")); + + assertFalse(f.regexpLike("", ".*ab.*")); + assertFalse(f.regexpLike("_", ".*ab.*")); + assertFalse(f.regexpLike("a", ".*ab.*")); + assertFalse(f.regexpLike("b", ".*ab.*")); + + //returns true because function matches against first pattern + assertTrue(f.regexpLike("aab", "ab")); + } +} diff --git a/pinot-common/src/test/java/org/apache/pinot/common/function/scalar/regexp/RegexpLikeVarFunctionsTest.java b/pinot-common/src/test/java/org/apache/pinot/common/function/scalar/regexp/RegexpLikeVarFunctionsTest.java new file mode 100644 index 000000000000..625fe5618b21 --- /dev/null +++ b/pinot-common/src/test/java/org/apache/pinot/common/function/scalar/regexp/RegexpLikeVarFunctionsTest.java @@ -0,0 +1,59 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.common.function.scalar.regexp; + +import org.testng.annotations.Test; + +import static org.apache.pinot.common.function.scalar.regexp.RegexpLikeVarFunctions.likeVar; +import static org.apache.pinot.common.function.scalar.regexp.RegexpLikeVarFunctions.regexpLikeVar; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertTrue; + + +public class RegexpLikeVarFunctionsTest { + + @Test + public void testLike() { + assertTrue(likeVar("ab", "%ab%")); + assertTrue(likeVar("aaba", "%ab%")); + assertTrue(likeVar("$ab$", "%ab%")); + + assertFalse(likeVar("", "%ab%")); + assertFalse(likeVar("_", "%ab%")); + assertFalse(likeVar("a", "%ab%")); + assertFalse(likeVar("b", "%ab%")); + + assertFalse(likeVar("aab", "ab")); + } + + @Test + public void testRegexpLike() { + assertTrue(regexpLikeVar("ab", ".*ab.*")); + assertTrue(regexpLikeVar("aaba", ".*ab.*")); + assertTrue(regexpLikeVar("$ab$", ".*ab.*")); + + assertFalse(regexpLikeVar("", ".*ab.*")); + assertFalse(regexpLikeVar("_", ".*ab.*")); + assertFalse(regexpLikeVar("a", ".*ab.*")); + assertFalse(regexpLikeVar("b", ".*ab.*")); + + // false: the pattern passed to this call (abb) is the one evaluated, and it does not match aab + assertFalse(regexpLikeVar("aab", "abb")); + } +} diff --git a/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/MultiStageEngineIntegrationTest.java 
b/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/MultiStageEngineIntegrationTest.java index 0b1f0abc8038..5e6d96daf2ae 100644 --- a/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/MultiStageEngineIntegrationTest.java +++ b/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/MultiStageEngineIntegrationTest.java @@ -458,6 +458,123 @@ public void testRegexpReplace() assertEquals(count1, count2); } + @Test + public void testRegexpReplaceVar() + throws Exception { + // Correctness tests of regexpReplaceVar. + + // Test replace all. + String sqlQuery = "SELECT regexpReplaceVar('CA', 'C', 'TEST')"; + JsonNode response = postQuery(sqlQuery); + String result = response.get("resultTable").get("rows").get(0).get(0).asText(); + assertEquals(result, "TESTA"); + + sqlQuery = "SELECT regexpReplaceVar('foobarbaz', 'b', 'X')"; + response = postQuery(sqlQuery); + result = response.get("resultTable").get("rows").get(0).get(0).asText(); + assertEquals(result, "fooXarXaz"); + + sqlQuery = "SELECT regexpReplaceVar('foobarbaz', 'b', 'XY')"; + response = postQuery(sqlQuery); + result = response.get("resultTable").get("rows").get(0).get(0).asText(); + assertEquals(result, "fooXYarXYaz"); + + sqlQuery = "SELECT regexpReplaceVar('Argentina', '(.)', '$1 ')"; + response = postQuery(sqlQuery); + result = response.get("resultTable").get("rows").get(0).get(0).asText(); + assertEquals(result, "A r g e n t i n a "); + + sqlQuery = "SELECT regexpReplaceVar('Pinot is blazing fast', '( ){2,}', ' ')"; + response = postQuery(sqlQuery); + result = response.get("resultTable").get("rows").get(0).get(0).asText(); + assertEquals(result, "Pinot is blazing fast"); + + sqlQuery = "SELECT regexpReplaceVar('healthy, wealthy, and wise','\\w+thy', 'something')"; + response = postQuery(sqlQuery); + result = response.get("resultTable").get("rows").get(0).get(0).asText(); + assertEquals(result, "something, something, and wise"); + + sqlQuery = 
"SELECT regexpReplaceVar('11234567898','(\\d)(\\d{3})(\\d{3})(\\d{4})', '$1-($2) $3-$4')"; + response = postQuery(sqlQuery); + result = response.get("resultTable").get("rows").get(0).get(0).asText(); + assertEquals(result, "1-(123) 456-7898"); + + // Test replace starting at index. + + sqlQuery = "SELECT regexpReplaceVar('healthy, wealthy, stealthy and wise','\\w+thy', 'something', 4)"; + response = postQuery(sqlQuery); + result = response.get("resultTable").get("rows").get(0).get(0).asText(); + assertEquals(result, "healthy, something, something and wise"); + + sqlQuery = "SELECT regexpReplaceVar('healthy, wealthy, stealthy and wise','\\w+thy', 'something', 1)"; + response = postQuery(sqlQuery); + result = response.get("resultTable").get("rows").get(0).get(0).asText(); + assertEquals(result, "hsomething, something, something and wise"); + + // Test occurence + sqlQuery = "SELECT regexpReplaceVar('healthy, wealthy, stealthy and wise','\\w+thy', 'something', 0, 2)"; + response = postQuery(sqlQuery); + result = response.get("resultTable").get("rows").get(0).get(0).asText(); + assertEquals(result, "healthy, wealthy, something and wise"); + + sqlQuery = "SELECT regexpReplaceVar('healthy, wealthy, stealthy and wise','\\w+thy', 'something', 0, 0)"; + response = postQuery(sqlQuery); + result = response.get("resultTable").get("rows").get(0).get(0).asText(); + assertEquals(result, "something, wealthy, stealthy and wise"); + + // Test flags + sqlQuery = "SELECT regexpReplaceVar('healthy, wealthy, stealthy and wise','\\w+tHy', 'something', 0, 0, 'i')"; + response = postQuery(sqlQuery); + result = response.get("resultTable").get("rows").get(0).get(0).asText(); + assertEquals(result, "something, wealthy, stealthy and wise"); + + // Negative test. Pattern match not found. 
+ sqlQuery = "SELECT regexpReplaceVar('healthy, wealthy, stealthy and wise','\\w+tHy', 'something')"; + response = postQuery(sqlQuery); + result = response.get("resultTable").get("rows").get(0).get(0).asText(); + assertEquals(result, "healthy, wealthy, stealthy and wise"); + + // Negative test. Pattern match not found. + sqlQuery = "SELECT regexpReplaceVar('healthy, wealthy, stealthy and wise','\\w+tHy', 'something', 3, 21, 'i')"; + response = postQuery(sqlQuery); + result = response.get("resultTable").get("rows").get(0).get(0).asText(); + assertEquals(result, "healthy, wealthy, stealthy and wise"); + + // Negative test - incorrect flag + sqlQuery = "SELECT regexpReplaceVar('healthy, wealthy, stealthy and wise','\\w+tHy', 'something', 3, 12, 'xyz')"; + response = postQuery(sqlQuery); + result = response.get("resultTable").get("rows").get(0).get(0).asText(); + assertEquals(result, "healthy, wealthy, stealthy and wise"); + + // Test in select clause with column values + sqlQuery = "SELECT regexpReplaceVar(DestCityName, ' ', '', 0, -1, 'i') from mytable where OriginState = 'CA'"; + response = postQuery(sqlQuery); + JsonNode rows = response.get("resultTable").get("rows"); + for (int i = 0; i < rows.size(); i++) { + JsonNode row = rows.get(i); + assertFalse(row.get(0).asText().contains(" ")); + } + + // Test in where clause + sqlQuery = "SELECT count(*) from mytable where regexpReplaceVar(OriginState, '[VC]A', 'TEST') = 'TEST'"; + response = postQuery(sqlQuery); + int count1 = response.get("resultTable").get("rows").get(0).get(0).asInt(); + sqlQuery = "SELECT count(*) from mytable where OriginState='CA' or OriginState='VA'"; + response = postQuery(sqlQuery); + int count2 = response.get("resultTable").get("rows").get(0).get(0).asInt(); + assertEquals(count1, count2); + + // Test nested transform + sqlQuery = + "SELECT count(*) from mytable where contains(regexpReplaceVar(OriginState, '(C)(A)', '$1TEST$2'), 'CTESTA')"; + response = postQuery(sqlQuery); + count1 = 
response.get("resultTable").get("rows").get(0).get(0).asInt(); + sqlQuery = "SELECT count(*) from mytable where OriginState='CA'"; + response = postQuery(sqlQuery); + count2 = response.get("resultTable").get("rows").get(0).get(0).asInt(); + assertEquals(count1, count2); + } + @Test public void testUrlFunc() throws Exception { diff --git a/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/OfflineClusterIntegrationTest.java b/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/OfflineClusterIntegrationTest.java index daa826de22f7..64b026600f65 100644 --- a/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/OfflineClusterIntegrationTest.java +++ b/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/OfflineClusterIntegrationTest.java @@ -875,6 +875,124 @@ public void testRegexpReplace(boolean useMultiStageQueryEngine) assertEquals(count1, count2); } + @Test(dataProvider = "useBothQueryEngines") + public void testRegexpReplaceVar(boolean useMultiStageQueryEngine) + throws Exception { + setUseMultiStageQueryEngine(useMultiStageQueryEngine); + // Correctness tests of regexpReplace. + + // Test replace all. 
+ String sqlQuery = "SELECT regexpReplaceVar('CA', 'C', 'TEST')"; + JsonNode response = postQuery(sqlQuery); + String result = response.get("resultTable").get("rows").get(0).get(0).asText(); + assertEquals(result, "TESTA"); + + sqlQuery = "SELECT regexpReplaceVar('foobarbaz', 'b', 'X')"; + response = postQuery(sqlQuery); + result = response.get("resultTable").get("rows").get(0).get(0).asText(); + assertEquals(result, "fooXarXaz"); + + sqlQuery = "SELECT regexpReplaceVar('foobarbaz', 'b', 'XY')"; + response = postQuery(sqlQuery); + result = response.get("resultTable").get("rows").get(0).get(0).asText(); + assertEquals(result, "fooXYarXYaz"); + + sqlQuery = "SELECT regexpReplaceVar('Argentina', '(.)', '$1 ')"; + response = postQuery(sqlQuery); + result = response.get("resultTable").get("rows").get(0).get(0).asText(); + assertEquals(result, "A r g e n t i n a "); + + sqlQuery = "SELECT regexpReplaceVar('Pinot is blazing fast', '( ){2,}', ' ')"; + response = postQuery(sqlQuery); + result = response.get("resultTable").get("rows").get(0).get(0).asText(); + assertEquals(result, "Pinot is blazing fast"); + + sqlQuery = "SELECT regexpReplaceVar('healthy, wealthy, and wise','\\w+thy', 'something')"; + response = postQuery(sqlQuery); + result = response.get("resultTable").get("rows").get(0).get(0).asText(); + assertEquals(result, "something, something, and wise"); + + sqlQuery = "SELECT regexpReplaceVar('11234567898','(\\d)(\\d{3})(\\d{3})(\\d{4})', '$1-($2) $3-$4')"; + response = postQuery(sqlQuery); + result = response.get("resultTable").get("rows").get(0).get(0).asText(); + assertEquals(result, "1-(123) 456-7898"); + + // Test replace starting at index. 
+ + sqlQuery = "SELECT regexpReplaceVar('healthy, wealthy, stealthy and wise','\\w+thy', 'something', 4)"; + response = postQuery(sqlQuery); + result = response.get("resultTable").get("rows").get(0).get(0).asText(); + assertEquals(result, "healthy, something, something and wise"); + + sqlQuery = "SELECT regexpReplaceVar('healthy, wealthy, stealthy and wise','\\w+thy', 'something', 1)"; + response = postQuery(sqlQuery); + result = response.get("resultTable").get("rows").get(0).get(0).asText(); + assertEquals(result, "hsomething, something, something and wise"); + + // Test occurence + sqlQuery = "SELECT regexpReplaceVar('healthy, wealthy, stealthy and wise','\\w+thy', 'something', 0, 2)"; + response = postQuery(sqlQuery); + result = response.get("resultTable").get("rows").get(0).get(0).asText(); + assertEquals(result, "healthy, wealthy, something and wise"); + + sqlQuery = "SELECT regexpReplaceVar('healthy, wealthy, stealthy and wise','\\w+thy', 'something', 0, 0)"; + response = postQuery(sqlQuery); + result = response.get("resultTable").get("rows").get(0).get(0).asText(); + assertEquals(result, "something, wealthy, stealthy and wise"); + + // Test flags + sqlQuery = "SELECT regexpReplaceVar('healthy, wealthy, stealthy and wise','\\w+tHy', 'something', 0, 0, 'i')"; + response = postQuery(sqlQuery); + result = response.get("resultTable").get("rows").get(0).get(0).asText(); + assertEquals(result, "something, wealthy, stealthy and wise"); + + // Negative test. Pattern match not found. + sqlQuery = "SELECT regexpReplaceVar('healthy, wealthy, stealthy and wise','\\w+tHy', 'something')"; + response = postQuery(sqlQuery); + result = response.get("resultTable").get("rows").get(0).get(0).asText(); + assertEquals(result, "healthy, wealthy, stealthy and wise"); + + // Negative test. Pattern match not found. 
+ sqlQuery = "SELECT regexpReplaceVar('healthy, wealthy, stealthy and wise','\\w+tHy', 'something', 3, 21, 'i')"; + response = postQuery(sqlQuery); + result = response.get("resultTable").get("rows").get(0).get(0).asText(); + assertEquals(result, "healthy, wealthy, stealthy and wise"); + + // Negative test - incorrect flag + sqlQuery = "SELECT regexpReplaceVar('healthy, wealthy, stealthy and wise','\\w+tHy', 'something', 3, 12, 'xyz')"; + response = postQuery(sqlQuery); + result = response.get("resultTable").get("rows").get(0).get(0).asText(); + assertEquals(result, "healthy, wealthy, stealthy and wise"); + + // Test in select clause with column values + sqlQuery = "SELECT regexpReplaceVar(DestCityName, ' ', '', 0, -1, 'i') from mytable where OriginState = 'CA'"; + response = postQuery(sqlQuery); + JsonNode rows = response.get("resultTable").get("rows"); + for (int i = 0; i < rows.size(); i++) { + JsonNode row = rows.get(i); + assertFalse(row.get(0).asText().contains(" ")); + } + + // Test in where clause + sqlQuery = "SELECT count(*) from mytable where regexpReplaceVar(OriginState, '[VC]A', 'TEST') = 'TEST'"; + response = postQuery(sqlQuery); + int count1 = response.get("resultTable").get("rows").get(0).get(0).asInt(); + sqlQuery = "SELECT count(*) from mytable where OriginState='CA' or OriginState='VA'"; + response = postQuery(sqlQuery); + int count2 = response.get("resultTable").get("rows").get(0).get(0).asInt(); + assertEquals(count1, count2); + + // Test nested transform + sqlQuery = + "SELECT count(*) from mytable where contains(regexpReplaceVar(OriginState, '(C)(A)', '$1TEST$2'), 'CTESTA')"; + response = postQuery(sqlQuery); + count1 = response.get("resultTable").get("rows").get(0).get(0).asInt(); + sqlQuery = "SELECT count(*) from mytable where OriginState='CA'"; + response = postQuery(sqlQuery); + count2 = response.get("resultTable").get("rows").get(0).get(0).asInt(); + assertEquals(count1, count2); + } + @Test public void testCastMV() throws Exception { 
diff --git a/pinot-perf/pom.xml b/pinot-perf/pom.xml index 9bb5fa66f3b5..88e8e8054bc6 100644 --- a/pinot-perf/pom.xml +++ b/pinot-perf/pom.xml @@ -85,6 +85,7 @@ org.testng testng + compile org.mockito diff --git a/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkQueries.java b/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkQueries.java index a249a176353a..0b0b4a4abe8a 100644 --- a/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkQueries.java +++ b/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkQueries.java @@ -104,7 +104,6 @@ public static void main(String[] args) new AggregationFunctionColumnPair(AggregationFunctionType.SUM, RAW_INT_COL_NAME).toColumnName()), null, Integer.MAX_VALUE))).build(); - //@formatter:off private static final Schema SCHEMA = new Schema.SchemaBuilder() .setSchemaName(TABLE_NAME) .addSingleValueDimension(SORTED_COL_NAME, FieldSpec.DataType.INT) @@ -116,7 +115,6 @@ public static void main(String[] args) .addSingleValueDimension(LOW_CARDINALITY_STRING_COL, FieldSpec.DataType.STRING) .addSingleValueDimension(TIMESTAMP_COL, FieldSpec.DataType.TIMESTAMP) .build(); - //@formatter:on public static final String FILTERED_QUERY = "SELECT SUM(INT_COL) FILTER(WHERE INT_COL > 123 AND INT_COL < 599999)," + "MAX(INT_COL) FILTER(WHERE INT_COL > 123 AND INT_COL < 599999) " @@ -196,6 +194,12 @@ public static void main(String[] args) + "FromDateTime(dateTimeConvert(TSTMP_COL, '1:MILLISECONDS:EPOCH', '1:DAYS:SIMPLE_DATE_FORMAT:yyyy-MM-dd HH:mm:ss" + ".SSSZ tz(CET)', '1:DAYS'), 'yyyy-MM-dd HH:mm:ss.SSSZ') = 120000000"; + public static final String REGEXP_REPLACE_QUERY = + " select regexp_replace(RAW_STRING_COL, '.*a.*', 'abc' ), count(*) \n" + + " from MyTable \n " + + " group by 1 " + + " limit 1000000\n"; + @Param({"1", "2", "10", "50"}) private int _numSegments; @Param("1500000") @@ -207,8 +211,8 @@ public static void main(String[] args) SUM_QUERY, NO_INDEX_LIKE_QUERY, MULTI_GROUP_BY_ORDER_BY, MULTI_GROUP_BY_ORDER_BY_LOW_HIGH, 
TIME_GROUP_BY, RAW_COLUMN_SUMMARY_STATS, COUNT_OVER_BITMAP_INDEX_IN, COUNT_OVER_BITMAP_INDEXES, COUNT_OVER_BITMAP_AND_SORTED_INDEXES, COUNT_OVER_BITMAP_INDEX_EQUALS, STARTREE_SUM_QUERY, STARTREE_FILTER_QUERY, - FILTERING_BITMAP_SCAN_QUERY, FILTERING_SCAN_QUERY, - FILTERING_ON_TIMESTAMP_WORKAROUND_QUERY, FILTERING_ON_TIMESTAMP_QUERY + FILTERING_BITMAP_SCAN_QUERY, FILTERING_SCAN_QUERY, FILTERING_ON_TIMESTAMP_WORKAROUND_QUERY, + FILTERING_ON_TIMESTAMP_QUERY, REGEXP_REPLACE_QUERY }) String _query; private IndexSegment _indexSegment; @@ -241,13 +245,14 @@ public void tearDown() { EXECUTOR_SERVICE.shutdownNow(); } - private LazyDataGenerator createTestData(int numRows) { + static LazyDataGenerator createTestData(int numRows, Distribution.DataSupplier supplier) { //create data lazily to prevent OOM and speed up setup return new LazyDataGenerator() { private final Map _strings = new HashMap<>(); private final String[] _lowCardinalityValues = IntStream.range(0, 10).mapToObj(i -> "value" + i).toArray(String[]::new); + private Distribution.DataSupplier _supplier = supplier; @Override public int size() { @@ -279,7 +284,7 @@ public void rewind() { private void buildSegment(String segmentName) throws Exception { - LazyDataGenerator rows = createTestData(_numRows); + LazyDataGenerator rows = createTestData(_numRows, _supplier); SegmentGeneratorConfig config = new SegmentGeneratorConfig(TABLE_CONFIG, SCHEMA); config.setOutDir(INDEX_DIR.getPath()); config.setTableName(TABLE_NAME); diff --git a/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkQueriesMSQE.java b/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkQueriesMSQE.java new file mode 100644 index 000000000000..c70c25bd2fda --- /dev/null +++ b/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkQueriesMSQE.java @@ -0,0 +1,221 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.perf; + +import com.fasterxml.jackson.databind.JsonNode; +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.concurrent.TimeUnit; +import org.apache.commons.io.FileUtils; +import org.apache.pinot.common.utils.TarCompressionUtils; +import org.apache.pinot.integration.tests.BaseClusterIntegrationTest; +import org.apache.pinot.integration.tests.ClusterIntegrationTestUtils; +import org.apache.pinot.segment.local.segment.creator.impl.SegmentIndexCreationDriverImpl; +import org.apache.pinot.segment.spi.AggregationFunctionType; +import org.apache.pinot.segment.spi.creator.SegmentGeneratorConfig; +import org.apache.pinot.segment.spi.index.startree.AggregationFunctionColumnPair; +import org.apache.pinot.spi.config.table.FieldConfig; +import org.apache.pinot.spi.config.table.StarTreeIndexConfig; +import org.apache.pinot.spi.config.table.TableConfig; +import org.apache.pinot.spi.config.table.TableType; +import org.apache.pinot.spi.data.FieldSpec; +import org.apache.pinot.spi.data.Schema; +import org.apache.pinot.spi.data.readers.RecordReader; +import org.apache.pinot.spi.utils.builder.TableConfigBuilder; +import org.apache.pinot.util.TestUtils; +import 
org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.TearDown; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.runner.Runner; +import org.openjdk.jmh.runner.options.ChainedOptionsBuilder; +import org.openjdk.jmh.runner.options.OptionsBuilder; + + +/** + * Benchmark similar to BenchmarkQueries, but using multi-stage query engine. + */ +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.MILLISECONDS) +@Fork(1) +@Warmup(iterations = 5, time = 1) +@Measurement(iterations = 5, time = 1) +@State(Scope.Benchmark) +public class BenchmarkQueriesMSQE extends BaseClusterIntegrationTest { + + public static void main(String[] args) + throws Exception { + ChainedOptionsBuilder opt = new OptionsBuilder().include(BenchmarkQueriesMSQE.class.getSimpleName()); + new Runner(opt.build()).run(); + } + + private static final String TABLE_NAME = "MyTable"; + private static final String FIRST_SEGMENT_NAME = "firstTestSegment"; + private static final String SECOND_SEGMENT_NAME = "secondTestSegment"; + private static final String INT_COL_NAME = "INT_COL"; + private static final String SORTED_COL_NAME = "SORTED_COL"; + private static final String RAW_INT_COL_NAME = "RAW_INT_COL"; + private static final String RAW_STRING_COL_NAME = "RAW_STRING_COL"; + private static final String NO_INDEX_INT_COL_NAME = "NO_INDEX_INT_COL"; + private static final String NO_INDEX_STRING_COL = "NO_INDEX_STRING_COL"; + private static final String LOW_CARDINALITY_STRING_COL = "LOW_CARDINALITY_STRING_COL"; + private static final String TIMESTAMP_COL = "TSTMP_COL"; + 
private static final List FIELD_CONFIGS = new ArrayList<>(); + + private static final TableConfig TABLE_CONFIG = new TableConfigBuilder(TableType.OFFLINE) + .setTableName(TABLE_NAME) + .setInvertedIndexColumns(List.of(INT_COL_NAME, LOW_CARDINALITY_STRING_COL)) + .setFieldConfigList(FIELD_CONFIGS) + .setNoDictionaryColumns(List.of(RAW_INT_COL_NAME, RAW_STRING_COL_NAME, TIMESTAMP_COL)) + .setSortedColumn(SORTED_COL_NAME) + .setRangeIndexColumns(List.of(INT_COL_NAME, LOW_CARDINALITY_STRING_COL)) + .setStarTreeIndexConfigs( + Collections.singletonList( + new StarTreeIndexConfig(List.of(SORTED_COL_NAME, INT_COL_NAME), null, + Collections.singletonList( + new AggregationFunctionColumnPair(AggregationFunctionType.SUM, RAW_INT_COL_NAME).toColumnName()), + null, Integer.MAX_VALUE))).build(); + + private static final Schema SCHEMA = new Schema.SchemaBuilder() + .setSchemaName(TABLE_NAME) + .addSingleValueDimension(SORTED_COL_NAME, FieldSpec.DataType.INT) + .addSingleValueDimension(NO_INDEX_INT_COL_NAME, FieldSpec.DataType.INT) + .addSingleValueDimension(RAW_INT_COL_NAME, FieldSpec.DataType.INT) + .addSingleValueDimension(INT_COL_NAME, FieldSpec.DataType.INT) + .addSingleValueDimension(RAW_STRING_COL_NAME, FieldSpec.DataType.STRING) + .addSingleValueDimension(NO_INDEX_STRING_COL, FieldSpec.DataType.STRING) + .addSingleValueDimension(LOW_CARDINALITY_STRING_COL, FieldSpec.DataType.STRING) + .addSingleValueDimension(TIMESTAMP_COL, FieldSpec.DataType.TIMESTAMP) + .build(); + + public static final String REGEXP_LIKE_CONST_QUERY = "select * from \n" + + "(\n" + + " select RAW_STRING_COL\n" + + " from MyTable \n" + + " limit 100000\n" + + ") \n" + + "where regexp_like_const('.*a.*', RAW_STRING_COL )"; + + public static final String REGEXP_LIKE_VAR_QUERY = "select * from \n" + + "(\n" + + " select RAW_STRING_COL\n" + + " from MyTable \n" + + " limit 100000\n" + + ") \n" + + "where regexp_like('.*a.*', RAW_STRING_COL )"; + + private Distribution.DataSupplier _supplier; + + 
@Param("1500000") + private int _numRows; + + @Param({"EXP(0.001)", "EXP(0.5)", "EXP(0.999)"}) + String _scenario; + + @Param({ + REGEXP_LIKE_CONST_QUERY, REGEXP_LIKE_VAR_QUERY + }) + String _query; + + @Setup + public void setUp() + throws Exception { + TestUtils.ensureDirectoriesExistAndEmpty(_tempDir, _segmentDir, _tarDir); + _supplier = Distribution.createSupplier(42, _scenario); + + // Start the Pinot cluster + startZk(); + startController(); + startBroker(); + startServer(); + + // upload test data + addSchema(SCHEMA); + addTableConfig(TABLE_CONFIG); + + buildSegment(FIRST_SEGMENT_NAME); + buildSegment(SECOND_SEGMENT_NAME); + + uploadSegments(TABLE_NAME, _tarDir); + + //check for data to arrive + waitForAllDocsLoaded(60000); + } + + @Override + protected long getCountStarResult() { + return _numRows * 2; + } + + @TearDown + public void tearDown() + throws IOException { + stopServer(); + stopBroker(); + stopController(); + stopZk(); + FileUtils.deleteQuietly(_tempDir); + } + + @Benchmark + public JsonNode query() + throws Exception { + JsonNode result = + postQuery(_query, ClusterIntegrationTestUtils.getBrokerQueryApiUrl(getBrokerBaseApiUrl(), true), null, + getExtraQueryProperties()); + JsonNode exceptions = result.get("exceptions").get(0); + if (exceptions != null) { + throw new RuntimeException(exceptions.get("message").asText()); + } + return result.get("resultTable").get("rows"); + } + + private void buildSegment(String segmentName) + throws Exception { + LazyDataGenerator rows = BenchmarkQueries.createTestData(_numRows, _supplier); + SegmentGeneratorConfig config = new SegmentGeneratorConfig(TABLE_CONFIG, SCHEMA); + config.setOutDir(_segmentDir.getPath()); + config.setTableName(TABLE_NAME); + config.setSegmentName(segmentName); + + SegmentIndexCreationDriverImpl driver = new SegmentIndexCreationDriverImpl(); + try (RecordReader recordReader = new GeneratedDataRecordReader(rows)) { + driver.init(config, recordReader); + driver.build(); + } + //save 
generator state so that other segments are not identical to this one + _supplier.snapshot(); + + // Tar the segment + File indexDir = new File(_segmentDir, segmentName); + File segmentTarFile = new File(_tarDir, segmentName + TarCompressionUtils.TAR_GZ_FILE_EXTENSION); + TarCompressionUtils.createCompressedTarFile(indexDir, segmentTarFile); + } +} diff --git a/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkRegexpReplace.java b/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkRegexpReplace.java new file mode 100644 index 000000000000..3ef2bd00b440 --- /dev/null +++ b/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkRegexpReplace.java @@ -0,0 +1,142 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.pinot.perf; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.Objects; +import java.util.concurrent.TimeUnit; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import org.apache.pinot.common.function.scalar.regexp.RegexpReplaceConstFunctions; +import org.apache.pinot.spi.annotations.ScalarFunction; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; +import org.openjdk.jmh.runner.Runner; +import org.openjdk.jmh.runner.options.ChainedOptionsBuilder; +import org.openjdk.jmh.runner.options.OptionsBuilder; + + +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.MICROSECONDS) +@Fork(1) +@Warmup(iterations = 3, time = 1) +@Measurement(iterations = 3, time = 1) +@State(Scope.Benchmark) +public class BenchmarkRegexpReplace { + + private ArrayList _input = new ArrayList<>(); + + @Param({"q.[aeiou]c.*", ".*a", "b.*", ".*", ".*ated", ".*ba.*"}) + public String _regex; + + @Setup + public void setUp() + throws IOException { + try (BufferedReader bufferedReader = new BufferedReader(new InputStreamReader( + Objects.requireNonNull(getClass().getClassLoader().getResourceAsStream("data/words.txt"))))) { + String currentWord; + while ((currentWord = bufferedReader.readLine()) != null) { + _input.add(currentWord); + } + } + } + + public static void main(String[] args) + throws Exception { + ChainedOptionsBuilder opt = new 
OptionsBuilder().include(BenchmarkRegexpReplace.class.getSimpleName()); + new Runner(opt.build()).run(); + } + + @Benchmark + public void testRegexpReplaceOld(Blackhole blackhole) { + for (int i = 0, n = _input.size(); i < n; i++) { + blackhole.consume(regexpReplaceOld(_input.get(i), _regex, "")); + } + } + + @Benchmark + public void testRegexpReplace(Blackhole blackhole) { + RegexpReplaceConstFunctions function = new RegexpReplaceConstFunctions(); + for (int i = 0, n = _input.size(); i < n; i++) { + blackhole.consume(function.regexpReplace(_input.get(i), _regex, "")); + } + } + + //old regexp_replace implementation + public static String regexpReplaceOld(String inputStr, String matchStr, String replaceStr, int matchStartPos, + int occurence, String flag) { + Integer patternFlag; + + switch (flag) { + case "i": + patternFlag = Pattern.CASE_INSENSITIVE; + break; + default: + patternFlag = null; + break; + } + + Pattern p; + if (patternFlag != null) { + p = Pattern.compile(matchStr, patternFlag); + } else { + p = Pattern.compile(matchStr); + } + + Matcher matcher = p.matcher(inputStr).region(matchStartPos, inputStr.length()); + StringBuffer sb; + + if (occurence >= 0) { + sb = new StringBuffer(inputStr); + while (occurence >= 0 && matcher.find()) { + if (occurence == 0) { + sb.replace(matcher.start(), matcher.end(), replaceStr); + break; + } + occurence--; + } + } else { + sb = new StringBuffer(); + while (matcher.find()) { + matcher.appendReplacement(sb, replaceStr); + } + matcher.appendTail(sb); + } + + return sb.toString(); + } + + @ScalarFunction + public static String regexpReplaceOld(String inputStr, String matchStr, String replaceStr) { + return regexpReplaceOld(inputStr, matchStr, replaceStr, 0, -1, ""); + } +} diff --git a/pinot-query-runtime/src/test/java/org/apache/pinot/query/QueryTestSet.java b/pinot-query-runtime/src/test/java/org/apache/pinot/query/QueryTestSet.java index fb86ab53f7e5..a0752eabe2cb 100644 --- 
a/pinot-query-runtime/src/test/java/org/apache/pinot/query/QueryTestSet.java +++ b/pinot-query-runtime/src/test/java/org/apache/pinot/query/QueryTestSet.java @@ -223,8 +223,10 @@ public Object[][] provideTestSql() { // TODO split these SQL functions into separate test files to share between planner and runtime // LIKE function new Object[]{"SELECT col1 FROM a WHERE col2 LIKE '%o%'"}, - new Object[]{"SELECT a.col1, b.col1 FROM a JOIN b ON a.col3 = b.col3 WHERE a.col2 LIKE b.col1"}, - new Object[]{"SELECT a.col1 LIKE b.col1 FROM a JOIN b ON a.col3 = b.col3"}, + new Object[]{"SELECT a.col1 LIKE '%o%' FROM a JOIN b ON a.col3 = b.col3"}, + // since PR #14833 LIKE assumes pattern is constant, so passing column to it produces wrong results + //new Object[]{"SELECT a.col1, b.col1 FROM a JOIN b ON a.col3 = b.col3 WHERE a.col2 LIKE b.col1"}, + //new Object[]{"SELECT a.col1 LIKE b.col1 FROM a JOIN b ON a.col3 = b.col3"}, // COALESCE function new Object[]{"SELECT a.col1, COALESCE(b.col3, 0) FROM a LEFT JOIN b ON a.col1 = b.col2"}, diff --git a/pinot-query-runtime/src/test/java/org/apache/pinot/query/runtime/queries/QueryRunnerTest.java b/pinot-query-runtime/src/test/java/org/apache/pinot/query/runtime/queries/QueryRunnerTest.java index b70b39a8f5da..e19ea2fb928b 100644 --- a/pinot-query-runtime/src/test/java/org/apache/pinot/query/runtime/queries/QueryRunnerTest.java +++ b/pinot-query-runtime/src/test/java/org/apache/pinot/query/runtime/queries/QueryRunnerTest.java @@ -247,10 +247,10 @@ protected Object[][] provideTestSqlAndRowCount() { // ScalarFunction // test function can be used in predicate/leaf/intermediate stage (using regexpLike) - new Object[]{"SELECT a.col1, b.col1 FROM a JOIN b ON a.col3 = b.col3 WHERE regexpLike(a.col2, b.col1)", 9}, - new Object[]{"SELECT a.col1, b.col1 FROM a JOIN b ON a.col3 = b.col3 WHERE regexp_like(a.col2, b.col1)", 9}, - new Object[]{"SELECT regexpLike(a.col1, b.col1) FROM a JOIN b ON a.col3 = b.col3", 39}, - new Object[]{"SELECT 
regexp_like(a.col1, b.col1) FROM a JOIN b ON a.col3 = b.col3", 39}, + new Object[]{"SELECT a.col1, b.col1 FROM a JOIN b ON a.col3 = b.col3 WHERE regexpLikeVar(a.col2, b.col1)", 9}, + new Object[]{"SELECT a.col1, b.col1 FROM a JOIN b ON a.col3 = b.col3 WHERE regexp_like_var(a.col2, b.col1)", 9}, + new Object[]{"SELECT regexpLikeVar(a.col1, b.col1) FROM a JOIN b ON a.col3 = b.col3", 39}, + new Object[]{"SELECT regexp_like_var(a.col1, b.col1) FROM a JOIN b ON a.col3 = b.col3", 39}, // test function with @ScalarFunction annotation and alias works (using round_decimal) new Object[]{"SELECT roundDecimal(col3) FROM a", 15}, diff --git a/pinot-query-runtime/src/test/resources/queries/TypeCasting.json b/pinot-query-runtime/src/test/resources/queries/TypeCasting.json index 299ceda84d59..ad6643ad233e 100644 --- a/pinot-query-runtime/src/test/resources/queries/TypeCasting.json +++ b/pinot-query-runtime/src/test/resources/queries/TypeCasting.json @@ -128,7 +128,7 @@ }, { "description": "test Pinot function variances above, but also mixed in with intermediate stage transfer", - "sql": "SELECT md5(a.bytesCol), substr(b.stringCol, 5), regexpExtract(a.stringCol, '([\\w]+).*') FROM {tbl} AS a JOIN {tbl} AS b ON a.intCol = b.intCol WHERE regexpLike(a.stringCol, b.stringCol)", + "sql": "SELECT md5(a.bytesCol), substr(b.stringCol, 5), regexpExtract(a.stringCol, '([\\w]+).*') FROM {tbl} AS a JOIN {tbl} AS b ON a.intCol = b.intCol WHERE regexpLikeVar(a.stringCol, b.stringCol)", "outputs": [ ["2f249230a8e7c2bf6005ccd2679259ec", "", "lyons"], ["a85a5fd494d9a538e22b696159931c1b", "", "onan"],