Skip to content

Commit 6773918

Browse files
committed
Regex Improvements
- Inline the `take_while_ascii_pred` function
- Change flag handling to stack allocation by using a fixed-size array
- Parse regex literals at an earlier opportunity when inspecting expressions
1 parent 69d4c02 commit 6773918

File tree

4 files changed

+61
-19
lines changed

4 files changed

+61
-19
lines changed

core/parser/src/lexer/cursor.rs

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -162,16 +162,19 @@ impl<R: ReadChar> Cursor<R> {
162162
/// It also stops when the next character is not an ascii or there is no next character.
163163
///
164164
/// Note that all characters up until the stop character are added to the buffer, including the character right before.
165-
pub(super) fn take_while_ascii_pred<F>(&mut self, buf: &mut Vec<u8>, pred: &F) -> io::Result<()>
165+
#[allow(clippy::cast_possible_truncation)]
166+
#[inline]
167+
pub(super) fn take_while_ascii_pred<F>(&mut self, buf: &mut [u8], pred: &F) -> io::Result<()>
166168
where
167169
F: Fn(char) -> bool,
168170
{
171+
let mut count = 0;
169172
loop {
170173
if !self.next_is_ascii_pred(pred)? {
171174
return Ok(());
172175
} else if let Some(byte) = self.next_char()? {
173-
#[allow(clippy::cast_possible_truncation)]
174-
buf.push(byte as u8);
176+
buf[count] = byte as u8;
177+
count += 1;
175178
} else {
176179
// next_is_pred will return false if the next value is None so the None case should already be handled.
177180
unreachable!();

core/parser/src/lexer/regex.rs

Lines changed: 43 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@
33
use crate::lexer::{Cursor, Error, Token, TokenKind, Tokenizer};
44
use crate::source::ReadChar;
55
use bitflags::bitflags;
6-
use boa_ast::{Position, PositionGroup};
7-
use boa_interner::{Interner, Sym};
6+
use boa_ast::PositionGroup;
7+
use boa_interner::Interner;
88
use regress::{Flags, Regex};
99
use std::fmt::{Display, Write};
1010
use std::str::{self, FromStr};
@@ -114,13 +114,16 @@ impl<R> Tokenizer<R> for RegexLiteral {
114114
}
115115
}
116116

117-
let mut flags = Vec::new();
117+
let mut flags: [u8; 8] = [0; 8];
118118
let flags_start = cursor.pos();
119119
cursor.take_while_ascii_pred(&mut flags, &char::is_alphabetic)?;
120120

121-
// SAFETY: We have already checked that the bytes are valid UTF-8.
122-
let flags_str = unsafe { str::from_utf8_unchecked(flags.as_slice()) };
121+
let flags_string = match RegExpFlags::from_bytes(flags) {
122+
Err(message) => return Err(Error::Syntax(message.into(), flags_start)),
123+
Ok(regex_flags) => regex_flags.to_string(),
124+
};
123125

126+
let flags_str = flags_string.as_str();
124127
let mut body_utf16 = Vec::new();
125128

126129
// We convert the body to UTF-16 since it may contain code points that are not valid UTF-8.
@@ -149,7 +152,7 @@ impl<R> Tokenizer<R> for RegexLiteral {
149152
Ok(Token::new_by_position_group(
150153
TokenKind::regular_expression_literal(
151154
interner.get_or_intern(body_utf16.as_slice()),
152-
parse_regex_flags(flags_str, flags_start, interner)?,
155+
interner.get_or_intern(flags_str.to_string().as_str()),
153156
),
154157
start_pos,
155158
cursor.pos_group(),
@@ -189,6 +192,40 @@ bitflags! {
189192
}
190193
}
191194

195+
impl RegExpFlags {
196+
fn from_bytes(bytes: [u8; 8]) -> Result<Self, String> {
197+
let mut flags = Self::default();
198+
for c in bytes {
199+
let new_flag = match c {
200+
b'g' => Self::GLOBAL,
201+
b'i' => Self::IGNORE_CASE,
202+
b'm' => Self::MULTILINE,
203+
b's' => Self::DOT_ALL,
204+
b'u' => Self::UNICODE,
205+
b'y' => Self::STICKY,
206+
b'd' => Self::HAS_INDICES,
207+
b'v' => Self::UNICODE_SETS,
208+
0x00 => continue,
209+
_ => return Err(format!("invalid regular expression flag {}", char::from(c))),
210+
};
211+
212+
if flags.contains(new_flag) {
213+
return Err(format!(
214+
"repeated regular expression flag {}",
215+
char::from(c)
216+
));
217+
}
218+
flags.insert(new_flag);
219+
}
220+
221+
if flags.contains(Self::UNICODE) && flags.contains(Self::UNICODE_SETS) {
222+
return Err("cannot use both 'u' and 'v' flags".into());
223+
}
224+
225+
Ok(flags)
226+
}
227+
}
228+
192229
impl FromStr for RegExpFlags {
193230
type Err = String;
194231

@@ -224,13 +261,6 @@ impl FromStr for RegExpFlags {
224261
}
225262
}
226263

227-
fn parse_regex_flags(s: &str, start: Position, interner: &mut Interner) -> Result<Sym, Error> {
228-
match RegExpFlags::from_str(s) {
229-
Err(message) => Err(Error::Syntax(message.into(), start)),
230-
Ok(flags) => Ok(interner.get_or_intern(flags.to_string().as_str())),
231-
}
232-
}
233-
234264
impl Display for RegExpFlags {
235265
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
236266
if self.contains(Self::HAS_INDICES) {

core/parser/src/parser/expression/assignment/mod.rs

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,10 @@ use crate::{
2929
Error,
3030
};
3131
use boa_ast::{
32-
expression::operator::assign::{Assign, AssignOp, AssignTarget},
32+
expression::{
33+
operator::assign::{Assign, AssignOp, AssignTarget},
34+
RegExpLiteral,
35+
},
3336
operations::{bound_names, contains, lexically_declared_names, ContainsSymbol},
3437
Expression, Keyword, Punctuator, Span,
3538
};
@@ -147,6 +150,13 @@ where
147150
.into());
148151
}
149152
}
153+
TokenKind::RegularExpressionLiteral(body, flags) => {
154+
let node =
155+
RegExpLiteral::new(*body, *flags, cursor.peek(0, interner).or_abrupt()?.span())
156+
.into();
157+
cursor.advance(interner);
158+
return Ok(node);
159+
}
150160
_ => {}
151161
}
152162

core/parser/src/parser/expression/primary/mod.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,14 +42,13 @@ use crate::{
4242
source::ReadChar,
4343
Error,
4444
};
45-
use ast::expression::RegExpLiteral as AstRegExp;
4645
use boa_ast::{
4746
self as ast,
4847
declaration::Variable,
4948
expression::{
5049
literal::{self, Literal, LiteralKind, TemplateElement},
5150
operator::{assign::AssignTarget, binary::BinaryOp},
52-
Identifier, Parenthesized, This,
51+
Identifier, Parenthesized, RegExpLiteral as AstRegExp, This,
5352
},
5453
function::{FormalParameter, FormalParameterList},
5554
operations::{contains, ContainsSymbol},

0 commit comments

Comments
 (0)