Skip to content

Commit

Permalink
Added regex support as per issue jqlang#164.
Browse files Browse the repository at this point in the history
jq now depends on oniguruma for regex support.
Modified configure.ac accordingly.

Added valgrind suppression file for oniguruma to prevent one-time and bounded
leaks from causing tests to fail.

Signed-off-by: Nicolas Williams <[email protected]>
  • Loading branch information
wtlangford authored and nicowilliams committed Jun 19, 2014
1 parent 5d9d1b1 commit 8ff935c
Show file tree
Hide file tree
Showing 7 changed files with 420 additions and 2 deletions.
217 changes: 217 additions & 0 deletions builtin.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include <assert.h>
#include <limits.h>
#include <math.h>
#include <oniguruma.h>
#include <stdlib.h>
#include <string.h>
#include "builtin.h"
Expand Down Expand Up @@ -514,6 +515,215 @@ static jv f_group_by_impl(jv input, jv keys) {
}
}

/*
 * Callback handed to onig_foreach_name() by f_match.
 *
 * `arg` points at the jv array of capture objects built for the current
 * match.  For every group number listed in `groups` (1-based), the capture
 * object stored at index `group - 1` gets its "name" field set to the group
 * name given by the byte range [name, name_end).  Entries that are not
 * objects (e.g. an out-of-range index) are left untouched.
 *
 * The possibly reallocated array is written back through `arg`.
 * Always returns 0.
 */
static int f_match_name_iter(const UChar* name, const UChar *name_end, int ngroups,
    int *groups, regex_t *reg, void *arg) {
  jv *slot = (jv*)arg;
  jv list = *slot;
  int g = 0;

  while (g < ngroups) {
    const int pos = groups[g] - 1;
    jv entry = jv_array_get(jv_copy(list), pos);

    if (jv_get_kind(entry) != JV_KIND_OBJECT) {
      /* Nothing to annotate; just drop our reference. */
      jv_free(entry);
    } else {
      entry = jv_object_set(entry, jv_string("name"),
                            jv_string_sized((const char*)name, name_end-name));
      list = jv_array_set(list, pos, entry);
    }
    g++;
  }

  *slot = list;
  return 0;
}


/*
 * Count the UTF-8 codepoints encoded in the byte range [from, to).
 * `from` must point at the first byte of a codepoint.
 */
static unsigned long f_match_codepoints(const char *from, const char *to) {
  unsigned long count = 0;
  while (from < to) {
    from += jvp_utf8_decode_length(*from);
    count++;
  }
  return count;
}

/*
 * Implementation of the `_match_impl` builtin.
 *
 * input     - the string to search; any other kind yields a type error.
 * regex     - the pattern, as a string; any other kind yields a type error.
 * modifiers - null, or a string of flag characters:
 *               g  find all matches instead of only the first
 *               i  ONIG_OPTION_IGNORECASE
 *               x  ONIG_OPTION_EXTEND
 *               m  ONIG_OPTION_MULTILINE
 *               s  ONIG_OPTION_SINGLELINE
 *               p  both MULTILINE and SINGLELINE
 *               l  ONIG_OPTION_FIND_LONGEST
 *               n  ONIG_OPTION_FIND_NOT_EMPTY
 *             Any other character, or any other kind, yields an error.
 * testmode  - when `true`, only report whether the regex matches.
 *
 * Returns, in test mode, `true` or `false`.  Otherwise returns an array of
 * match objects with the fields "offset" and "length" (both counted in
 * codepoints), "string" and "captures".  Each capture object has "offset",
 * "length", "string" and "name"; a group that did not participate has
 * offset -1 and a null string.  Zero-width matches are reported with an
 * empty "captures" array.  On a regex compile or search error an invalid jv
 * carrying a "Regex failure: ..." message is returned.
 *
 * All four arguments are released before returning.
 */
static jv f_match(jv input, jv regex, jv modifiers, jv testmode) {
  /* Inspect the kind instead of calling jv_equal(): that way testmode is
   * released exactly once, by the jv_free() below, whatever it holds. */
  int test = jv_get_kind(testmode) == JV_KIND_TRUE;
  jv result;
  int onigret;
  int global = 0;
  regex_t *reg;
  OnigErrorInfo einfo;
  OnigRegion *region;

  jv_free(testmode);
  if (jv_get_kind(input) != JV_KIND_STRING) {
    jv_free(regex);
    jv_free(modifiers);
    return type_error(input, "cannot be matched, as it is not a string");
  }

  if (jv_get_kind(regex) != JV_KIND_STRING) {
    jv_free(input);
    jv_free(modifiers);
    return type_error(regex, "is not a string");
  }

  OnigOptionType options = ONIG_OPTION_CAPTURE_GROUP;

  if (jv_get_kind(modifiers) == JV_KIND_STRING) {
    jv modarray = jv_string_explode(jv_copy(modifiers));
    jv_array_foreach(modarray, i, mod) {
      switch ((int)jv_number_value(mod)) {
        case 'g':
          global = 1;
          break;
        case 'i':
          options |= ONIG_OPTION_IGNORECASE;
          break;
        case 'x':
          options |= ONIG_OPTION_EXTEND;
          break;
        case 'm':
          options |= ONIG_OPTION_MULTILINE;
          break;
        case 's':
          options |= ONIG_OPTION_SINGLELINE;
          break;
        case 'p':
          options |= ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE;
          break;
        case 'l':
          options |= ONIG_OPTION_FIND_LONGEST;
          break;
        case 'n':
          options |= ONIG_OPTION_FIND_NOT_EMPTY;
          break;
        default:
          jv_free(input);
          jv_free(regex);
          jv_free(modarray);
          return jv_invalid_with_msg(jv_string_concat(modifiers,
                jv_string(" is not a valid modifier string")));
      }
    }
    jv_free(modarray);
  } else if (jv_get_kind(modifiers) != JV_KIND_NULL) {
    // If it isn't a string or null, then it is the wrong type...
    jv_free(input);
    jv_free(regex);
    return type_error(modifiers, "is not a string");
  }

  jv_free(modifiers);

  onigret = onig_new(&reg, (const UChar*)jv_string_value(regex),
      (const UChar*)(jv_string_value(regex) + jv_string_length_bytes(jv_copy(regex))),
      options, ONIG_ENCODING_UTF8, ONIG_SYNTAX_PERL_NG, &einfo);
  if (onigret != ONIG_NORMAL) {
    UChar ebuf[ONIG_MAX_ERROR_MESSAGE_LEN];
    /* onig_error_code_to_str() is variadic and expects an OnigErrorInfo*. */
    onig_error_code_to_str(ebuf, onigret, &einfo);
    jv_free(input);
    jv_free(regex);
    return jv_invalid_with_msg(jv_string_concat(jv_string("Regex failure: "),
          jv_string((char*)ebuf)));
  }

  /* Always start from a valid value so the error path can free it, also in
   * test mode.  In test mode this doubles as the "no match" answer. */
  result = test ? jv_false() : jv_array();

  const char *input_string = jv_string_value(input);
  const char *input_end = input_string + jv_string_length_bytes(jv_copy(input));
  const UChar *start = (const UChar*)input_string;
  const UChar *end = (const UChar*)input_end;
  region = onig_region_new();
  do {
    onigret = onig_search(reg,
        (const UChar*)input_string, end, /* string boundaries */
        start, end,                      /* search boundaries */
        region, ONIG_OPTION_NONE);
    if (onigret >= 0) {
      if (test) {
        result = jv_true();
        break;
      }

      const char *match_beg = input_string + region->beg[0];
      const char *match_end = input_string + region->end[0];

      // Zero-width match
      if (match_end == match_beg) {
        unsigned long idx = f_match_codepoints(input_string, match_beg);
        jv match = jv_object_set(jv_object(), jv_string("offset"), jv_number(idx));
        match = jv_object_set(match, jv_string("length"), jv_number(0));
        match = jv_object_set(match, jv_string("string"), jv_string(""));
        match = jv_object_set(match, jv_string("captures"), jv_array());
        result = jv_array_append(result, match);

        /* Resume one whole codepoint after the match position (not after
         * the previous search start), so the same empty match is not
         * reported again, the search never restarts inside a multi-byte
         * sequence, and `start` never moves past `end`. */
        long remaining = (long)(input_end - match_beg);
        long step = remaining > 0 ? (long)jvp_utf8_decode_length(*match_beg) : 0;
        if (remaining > 0 && step < 1)
          step = 1;
        if (step > remaining)
          step = remaining;
        start = (const UChar*)(match_beg + step);
        continue;
      }

      unsigned long idx = f_match_codepoints(input_string, match_beg);
      unsigned long len = f_match_codepoints(match_beg, match_end);

      jv match = jv_object_set(jv_object(), jv_string("offset"), jv_number(idx));
      match = jv_object_set(match, jv_string("length"), jv_number(len));
      match = jv_object_set(match, jv_string("string"),
                            jv_string_sized(match_beg, (int)(match_end - match_beg)));

      jv captures = jv_array();
      for (int i = 1; i < region->num_regs; ++i) {
        jv cap;
        if (region->beg[i] == region->end[i]) {
          // Empty capture.
          if (region->beg[i] == -1) {
            // Didn't match.
            cap = jv_object_set(jv_object(), jv_string("offset"), jv_number(-1));
            cap = jv_object_set(cap, jv_string("string"), jv_null());
          } else {
            idx = f_match_codepoints(input_string, input_string + region->beg[i]);
            cap = jv_object_set(jv_object(), jv_string("offset"), jv_number(idx));
            cap = jv_object_set(cap, jv_string("string"), jv_string(""));
          }
          cap = jv_object_set(cap, jv_string("length"), jv_number(0));
        } else {
          const char *cap_beg = input_string + region->beg[i];
          const char *cap_end = input_string + region->end[i];
          idx = f_match_codepoints(input_string, cap_beg);
          len = f_match_codepoints(cap_beg, cap_end);
          cap = jv_object_set(jv_object(), jv_string("offset"), jv_number(idx));
          cap = jv_object_set(cap, jv_string("length"), jv_number(len));
          cap = jv_object_set(cap, jv_string("string"),
                              jv_string_sized(cap_beg, (int)(cap_end - cap_beg)));
        }
        /* Names are filled in afterwards by f_match_name_iter. */
        cap = jv_object_set(cap, jv_string("name"), jv_null());
        captures = jv_array_append(captures, cap);
      }
      onig_foreach_name(reg, f_match_name_iter, &captures);
      match = jv_object_set(match, jv_string("captures"), captures);
      result = jv_array_append(result, match);
      start = (const UChar*)match_end;
      onig_region_free(region, 0);
    } else if (onigret == ONIG_MISMATCH) {
      /* In test mode result is already false; otherwise keep what we have. */
      break;
    } else { /* Error */
      UChar ebuf[ONIG_MAX_ERROR_MESSAGE_LEN];
      onig_error_code_to_str(ebuf, onigret, &einfo);
      jv_free(result);
      result = jv_invalid_with_msg(jv_string_concat(jv_string("Regex failure: "),
            jv_string((char*)ebuf)));
      break;
    }
  } while (global && start != end);

  onig_region_free(region, 1);
  onig_free(reg);
  jv_free(input);
  jv_free(regex);
  return result;
}

static jv minmax_by(jv values, jv keys, int is_min) {
if (jv_get_kind(values) != JV_KIND_ARRAY)
return type_error2(values, keys, "cannot be iterated over");
Expand Down Expand Up @@ -642,6 +852,7 @@ static const struct cfunction function_list[] = {
{(cfunction_ptr)f_error, "error", 2},
{(cfunction_ptr)f_format, "format", 2},
{(cfunction_ptr)f_env, "env", 1},
{(cfunction_ptr)f_match, "_match_impl", 4},
};
#undef LIBM_DD

Expand Down Expand Up @@ -737,6 +948,12 @@ static const char* const jq_builtins[] = {
"def flatten: reduce .[] as $i ([]; if $i | type == \"array\" then . + ($i | flatten) else . + [$i] end);",
"def flatten(x): reduce .[] as $i ([]; if $i | type == \"array\" and x > 0 then . + ($i | flatten(x-1)) else . + [$i] end);",
"def range(x): range(0;x);",
"def match(re; mode): _match_impl(re; mode; false)|.[];",
"def match(val): if val | type == \"string\" then match(val; null) elif val | type == \"array\" and (val | length) > 1 then match(val[0]; val[1]) elif val | type == \"array\" and (val | length > 0) then match(val[0]; null) else error((val | type) + \" not a string or array\") end;",
"def test(re; mode): _match_impl(re; mode; true);",
"def test(val): if val |type == \"string\" then test(val; null) elif val | type == \"array\" and (val | length) > 1 then test(val[0]; val[1]) elif val | type == \"array\" and (val | length > 0) then test(val[0]; null) else error((val | type) + \" not a string or array\") end;",
// "def test(re): _match(re; null; 1);",

};
#undef LIBM_DD

Expand Down
57 changes: 57 additions & 0 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,63 @@ if test "x$LEX" != xflex; then
fi


##########################################################################
# check for ONIGURUMA library
##########################################################################

AC_ARG_WITH([oniguruma],
    [AS_HELP_STRING([--with-oniguruma=prefix],
        [try this for a non-standard install prefix of the oniguruma library])],
    [ONIGURUMAPATHSET=1],
    [ONIGURUMAPATHSET=0])

# Use the portable = operator here: == is a bash extension and is rejected
# by strict POSIX shells running configure.
if test "x$ONIGURUMAPATHSET" = "x1"; then
  CFLAGS="$CFLAGS -I${with_oniguruma}/include"
  LDFLAGS="$LDFLAGS -L${with_oniguruma}/lib"
fi

# store current *FLAGS and merge with AM_*FLAGS for compilation and linker check
OLD_CFLAGS=$CFLAGS;
OLD_LDFLAGS=$LDFLAGS;
CFLAGS="$AM_CFLAGS $CFLAGS"
LDFLAGS="$AM_LDFLAGS $LDFLAGS"

# ensure the library to check for is covered by the LIBS variable
OLD_LIBS=$LIBS
LIBS="$LIBS -lonig"

# check for ONIGURUMA library headers
AC_MSG_CHECKING([for oniguruma.h])
# try to compile a file that includes a header of the library oniguruma
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([ #include <oniguruma.h> ])],
    [AC_MSG_RESULT([yes])
        # try to link the function 'onig_free' out of library oniguruma
        AC_MSG_CHECKING([for oniguruma usability])
        AC_LINK_IFELSE(
            [AC_LANG_PROGRAM([[#include <oniguruma.h>]],
                [[onig_free(0);]])],
            [AC_MSG_RESULT([yes])
                FOUND_ONIGURUMA=1;],
            [AC_MSG_RESULT([no])
                LIBS=$OLD_LIBS; dnl reset to old value since oniguruma was not found
                FOUND_ONIGURUMA=0;])],
    [AC_MSG_RESULT([not found])
        FOUND_ONIGURUMA=0;])

# reset original *FLAGS
CFLAGS=$OLD_CFLAGS
LDFLAGS=$OLD_LDFLAGS

# handle check results
if test "x$FOUND_ONIGURUMA" != "x1"; then
    AC_MSG_NOTICE([Oniguruma was not found.])
    AC_MSG_NOTICE([ Try setting the location using '--with-oniguruma=PREFIX' ])
    AC_MSG_ERROR([ oniguruma is required to build jq.])
fi




dnl Check for valgrind
AC_CHECK_PROGS(valgrind_cmd, valgrind)
if test "x$valgrind_cmd" = "x" ; then
Expand Down
72 changes: 72 additions & 0 deletions docs/content/3.manual/manual.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1087,6 +1087,78 @@ sections:
input: '["foobar", "barfoo"]'
output: ['[false, true, true, false, false]']

- title: "`match(val)`, `match(regex; modifiers)`"
body: |
The filter `match(val)` matches its input against a regular expression, using the Oniguruma library with Perl-compatible syntax.
`val` can be either a string or an array. If it is an array,
the first element is the regex specifier and the optional
second element is the modifier flags.
The accepted modifier flags are:
* `g` - Global search (find all matches, not just the first)
* `i` - Case insensitive search
* `x` - Extended regex format (ignore whitespaces)
* `m` - Multi line mode ('.' will match newlines)
* `s` - Single line mode ('^' -> '\A', '$' -> '\Z')
* `p` - Both s and m modes are enabled
* `l` - Find longest possible matches
* `n` - Ignore empty matches
The filter outputs an object for each match it finds. Matches have
the following fields:
* `offset` - offset in UTF-8 codepoints from the beginning of the input
* `length` - length in UTF-8 codepoints of the match
* `string` - the string that it matched
* `captures` - an array of objects representing capturing groups.
Capturing group objects have the following fields:
* `offset` - offset in UTF-8 codepoints from the beginning of the input
* `length` - length in UTF-8 codepoints of this capturing group
* `string` - the string that was captured
* `name` - the name of the capturing group (or `null` if it was unnamed)
Capturing groups that did not match anything return an offset of -1
examples:
- program: 'match("(abc)+"; "g")'
input: '"abc abc"'
output:
- '{"offset": 0, "length": 3, "string": "abc", "captures": [{"offset": 0, "length": 3, "string": "abc", "name": null}]}'
- '{"offset": 4, "length": 3, "string": "abc", "captures": [{"offset": 4, "length": 3, "string": "abc", "name": null}]}'
- program: 'match("foo")'
input: '"foo bar foo"'
output: ['{"offset": 0, "length": 3, "string": "foo", "captures": []}']
- program: 'match(["foo", "ig"])'
input: '"foo bar FOO"'
output:
- '{"offset": 0, "length": 3, "string": "foo", "captures": []}'
- '{"offset": 8, "length": 3, "string": "FOO", "captures": []}'
- program: 'match("foo (?<bar123>bar)? foo"; "ig")'
input: '"foo bar foo foo  foo"'
output:
- '{"offset": 0, "length": 11, "string": "foo bar foo", "captures": [{"offset": 4, "length": 3, "string": "bar", "name": "bar123"}]}'
- '{"offset": 12, "length": 8, "string": "foo  foo", "captures": [{"offset": -1, "length": 0, "string": null, "name": "bar123"}]}'


- title: "`test(val)`, `test(regex; modifiers)`"
body: |
Like `match`, but does not return match objects, only `true` or `false`
for whether or not the regex matches the input.
examples:
- program: 'test("foo")'
input: '"foo"'
output: ['true']
- program: 'test("foo"; "i")'
input: '"Foo"'
output: ['true']
- program: 'test("foo")'
input: '"bar"'
output: ['false']
- title: "`ltrimstr(str)`"
body: |
Expand Down
Loading

0 comments on commit 8ff935c

Please sign in to comment.