27
27
class Lexer :
28
28
"""JSONPath expression lexical scanner."""
29
29
30
- __slots__ = ("filter_depth" , "paren_stack" , "tokens" , "start" , "pos" , "query" )
30
+ __slots__ = (
31
+ "filter_depth" ,
32
+ "func_call_stack" ,
33
+ "bracket_stack" ,
34
+ "tokens" ,
35
+ "start" ,
36
+ "pos" ,
37
+ "query" ,
38
+ )
31
39
32
40
def __init__ (self , query : str ) -> None :
33
41
self .filter_depth = 0
34
42
"""Filter nesting level."""
35
43
36
- self .paren_stack : List [int ] = []
44
+ self .func_call_stack : List [int ] = []
37
45
"""A running count of parentheses for each, possibly nested, function call.
38
46
39
47
If the stack is empty, we are not in a function call. Remember that
40
48
function arguments can be arbitrarily nested in parentheses.
41
49
"""
42
50
51
+ self .bracket_stack : list [tuple [str , int ]] = []
52
+ """A stack of (opening parenthesis or bracket, index) pairs."""
53
+
43
54
self .tokens : List [Token ] = []
44
55
"""Tokens resulting from scanning a JSONPath expression."""
45
56
@@ -133,7 +144,7 @@ def ignore_whitespace(self) -> bool:
133
144
134
145
def error (self , msg : str ) -> None :
135
146
"""Emit an error token."""
136
- # better error messages.
147
+ # TODO: better error messages.
137
148
self .tokens .append (
138
149
Token (
139
150
TokenType .ERROR ,
@@ -179,6 +190,7 @@ def lex_segment(l: Lexer) -> Optional[StateFn]: # noqa: D103, PLR0911
179
190
180
191
if c == "[" :
181
192
l .emit (TokenType .LBRACKET )
193
+ l .bracket_stack .append ((c , l .pos - 1 ))
182
194
return lex_inside_bracketed_segment
183
195
184
196
if l .filter_depth :
@@ -202,6 +214,7 @@ def lex_descendant_segment(l: Lexer) -> Optional[StateFn]: # noqa: D103
202
214
203
215
if c == "[" :
204
216
l .emit (TokenType .LBRACKET )
217
+ l .bracket_stack .append ((c , l .pos - 1 ))
205
218
return lex_inside_bracketed_segment
206
219
207
220
l .backup ()
@@ -244,11 +257,17 @@ def lex_inside_bracketed_segment(l: Lexer) -> Optional[StateFn]: # noqa: PLR091
244
257
c = l .next ()
245
258
246
259
if c == "]" :
260
+ if not l .bracket_stack or l .bracket_stack [- 1 ][0 ] != "[" :
261
+ l .backup ()
262
+ l .error ("unbalanced brackets" )
263
+ return None
264
+
265
+ l .bracket_stack .pop ()
247
266
l .emit (TokenType .RBRACKET )
248
267
return lex_segment
249
268
250
269
if c == "" :
251
- l .error ("unclosed bracketed selection " )
270
+ l .error ("unbalanced brackets " )
252
271
return None
253
272
254
273
if c == "*" :
@@ -299,18 +318,14 @@ def lex_inside_filter(l: Lexer) -> Optional[StateFn]: # noqa: D103, PLR0915, PL
299
318
300
319
if c == "]" :
301
320
l .filter_depth -= 1
302
- if len (l .paren_stack ) == 1 :
303
- l .error ("unbalanced parentheses" )
304
- return None
305
-
306
321
l .backup ()
307
322
return lex_inside_bracketed_segment
308
323
309
324
if c == "," :
310
325
l .emit (TokenType .COMMA )
311
326
# If we have unbalanced parens, we are inside a function call and a
312
327
# comma separates arguments. Otherwise a comma separates selectors.
313
- if l .paren_stack :
328
+ if l .func_call_stack :
314
329
continue
315
330
l .filter_depth -= 1
316
331
return lex_inside_bracketed_segment
@@ -323,19 +338,26 @@ def lex_inside_filter(l: Lexer) -> Optional[StateFn]: # noqa: D103, PLR0915, PL
323
338
324
339
if c == "(" :
325
340
l .emit (TokenType .LPAREN )
341
+ l .bracket_stack .append ((c , l .pos - 1 ))
326
342
# Are we in a function call? If so, a function argument contains parens.
327
- if l .paren_stack :
328
- l .paren_stack [- 1 ] += 1
343
+ if l .func_call_stack :
344
+ l .func_call_stack [- 1 ] += 1
329
345
continue
330
346
331
347
if c == ")" :
348
+ if not l .bracket_stack or l .bracket_stack [- 1 ][0 ] != "(" :
349
+ l .backup ()
350
+ l .error ("unbalanced parentheses" )
351
+ return None
352
+
353
+ l .bracket_stack .pop ()
332
354
l .emit (TokenType .RPAREN )
333
355
# Are we closing a function call or a parenthesized expression?
334
- if l .paren_stack :
335
- if l .paren_stack [- 1 ] == 1 :
336
- l .paren_stack .pop ()
356
+ if l .func_call_stack :
357
+ if l .func_call_stack [- 1 ] == 1 :
358
+ l .func_call_stack .pop ()
337
359
else :
338
- l .paren_stack [- 1 ] -= 1
360
+ l .func_call_stack [- 1 ] -= 1
339
361
continue
340
362
341
363
if c == "$" :
@@ -402,8 +424,9 @@ def lex_inside_filter(l: Lexer) -> Optional[StateFn]: # noqa: D103, PLR0915, PL
402
424
l .emit (TokenType .INT )
403
425
elif l .accept_match (RE_FUNCTION_NAME ) and l .peek () == "(" :
404
426
# Keep track of parentheses for this function call.
405
- l .paren_stack .append (1 )
427
+ l .func_call_stack .append (1 )
406
428
l .emit (TokenType .FUNCTION )
429
+ l .bracket_stack .append (("(" , l .pos ))
407
430
l .next ()
408
431
l .ignore () # ignore LPAREN
409
432
else :
@@ -486,6 +509,21 @@ def tokenize(query: str) -> List[Token]:
486
509
lexer , tokens = lex (query )
487
510
lexer .run ()
488
511
512
+ # Check for remaining opening brackets that have not been closed.
513
+ if lexer .bracket_stack :
514
+ ch , index = lexer .bracket_stack [0 ]
515
+ msg = f"unbalanced { 'brackets' if ch == '[' else 'parentheses' } "
516
+ raise JSONPathSyntaxError (
517
+ msg ,
518
+ token = Token (
519
+ TokenType .ERROR ,
520
+ lexer .query [index ],
521
+ index ,
522
+ lexer .query ,
523
+ msg ,
524
+ ),
525
+ )
526
+
489
527
if tokens and tokens [- 1 ].type_ == TokenType .ERROR :
490
528
raise JSONPathSyntaxError (tokens [- 1 ].message , token = tokens [- 1 ])
491
529
0 commit comments