Skip to content

Commit c20cd8f

Browse files
committed
refactor: Separate lex_token
Taking some of PRQL#4055
1 parent 19b4121 commit c20cd8f

File tree

1 file changed

+76
-23
lines changed

1 file changed

+76
-23
lines changed

prqlc/prqlc-parser/src/lexer.rs

+76-23
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,8 @@ pub enum Token {
1616
Param(String),
1717

1818
Range {
19+
/// Whether the left side of the range is bound by the previous token
20+
/// (but it's not contained in this token)
1921
bind_left: bool,
2022
bind_right: bool,
2123
},
@@ -39,12 +41,16 @@ pub enum Token {
3941
Annotate, // @
4042
}
4143

44+
/// Lex chars to tokens until the end of the input
4245
pub fn lexer() -> impl Parser<char, Vec<TokenSpan>, Error = Cheap<char>> {
43-
let whitespace = filter(|x: &char| x.is_inline_whitespace())
46+
lex_token()
4447
.repeated()
45-
.at_least(1)
46-
.ignored();
48+
.then_ignore(ignored())
49+
.then_ignore(end())
50+
}
4751

52+
/// Lex chars to a single token
53+
pub fn lex_token() -> impl Parser<char, TokenSpan, Error = Cheap<char>> {
4854
let control_multi = choice((
4955
just("->").to(Token::ArrowThin),
5056
just("=>").to(Token::ArrowFat),
@@ -109,41 +115,55 @@ pub fn lexer() -> impl Parser<char, Vec<TokenSpan>, Error = Cheap<char>> {
109115
))
110116
.recover_with(skip_then_retry_until([]).skip_start());
111117

112-
let comment = just('#')
113-
.then(newline.not().repeated())
114-
.separated_by(newline.then(whitespace.or_not()))
115-
.at_least(1)
116-
.ignored();
117-
118-
let range = (whitespace.or_not())
118+
let range = (whitespace().or_not())
119119
.then_ignore(just(".."))
120-
.then(whitespace.or_not())
120+
.then(whitespace().or_not())
121121
.map(|(left, right)| Token::Range {
122+
// If there was no whitespace before (after), then we mark the range
123+
// as bound on the left (right).
122124
bind_left: left.is_none(),
123125
bind_right: right.is_none(),
124126
})
125127
.map_with_span(TokenSpan);
126128

127-
let line_wrap = newline
129+
choice((range, ignored().ignore_then(token.map_with_span(TokenSpan))))
130+
}
131+
132+
fn ignored() -> impl Parser<char, (), Error = Cheap<char>> {
133+
choice((comment(), whitespace(), line_wrap()))
134+
.repeated()
135+
.ignored()
136+
}
137+
138+
fn whitespace() -> impl Parser<char, (), Error = Cheap<char>> {
139+
filter(|x: &char| x.is_inline_whitespace())
140+
.repeated()
141+
.at_least(1)
142+
.ignored()
143+
}
144+
145+
fn line_wrap() -> impl Parser<char, (), Error = Cheap<char>> {
146+
newline()
128147
.then(
129148
// We can optionally have an empty line, or a line with a comment,
130149
// between the initial line and the continued line
131-
whitespace
150+
whitespace()
132151
.or_not()
133-
.then(comment.or_not())
134-
.then(newline)
152+
.then(comment().or_not())
153+
.then(newline())
135154
.repeated(),
136155
)
137-
.then(whitespace.repeated())
156+
.then(whitespace().repeated())
138157
.then(just('\\'))
139-
.ignored();
140-
141-
let ignored = choice((comment, whitespace, line_wrap)).repeated();
158+
.ignored()
159+
}
142160

143-
choice((range, ignored.ignore_then(token.map_with_span(TokenSpan))))
144-
.repeated()
145-
.then_ignore(ignored)
146-
.then_ignore(end())
161+
fn comment() -> impl Parser<char, (), Error = Cheap<char>> {
162+
just('#')
163+
.then(newline().not().repeated())
164+
.separated_by(newline().then(whitespace().or_not()))
165+
.at_least(1)
166+
.ignored()
147167
}
148168

149169
pub fn ident_part() -> impl Parser<char, String, Error = Cheap<char>> + Clone {
@@ -625,4 +645,37 @@ mod test {
625645
// Unicode escape
626646
assert_snapshot!(quoted_string(true).parse(r"'\u{01f422}'").unwrap(), @"🐢");
627647
}
648+
649+
#[test]
650+
fn range() {
651+
assert_debug_snapshot!(TokenVec(lexer().parse("1..2").unwrap()), @r###"
652+
TokenVec (
653+
0..1: Literal(Integer(1)),
654+
1..3: Range { bind_left: true, bind_right: true },
655+
3..4: Literal(Integer(2)),
656+
)
657+
"###);
658+
659+
assert_debug_snapshot!(TokenVec(lexer().parse("..2").unwrap()), @r###"
660+
TokenVec (
661+
0..2: Range { bind_left: true, bind_right: true },
662+
2..3: Literal(Integer(2)),
663+
)
664+
"###);
665+
666+
assert_debug_snapshot!(TokenVec(lexer().parse("1..").unwrap()), @r###"
667+
TokenVec (
668+
0..1: Literal(Integer(1)),
669+
1..3: Range { bind_left: true, bind_right: true },
670+
)
671+
"###);
672+
673+
assert_debug_snapshot!(TokenVec(lexer().parse("in ..5").unwrap()), @r###"
674+
TokenVec (
675+
0..2: Ident("in"),
676+
2..5: Range { bind_left: false, bind_right: true },
677+
5..6: Literal(Integer(5)),
678+
)
679+
"###);
680+
}
628681
}

0 commit comments

Comments
 (0)