Skip to content

Commit

Permalink
Fix handling of current scanner mode; Provide an iterator adapter `WithPositions`
Browse files Browse the repository at this point in the history
  • Loading branch information
jsinger67 committed Sep 19, 2024
1 parent e23574b commit f04cae7
Show file tree
Hide file tree
Showing 7 changed files with 255 additions and 151 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ let scanner = ScannerBuilder::new().add_scanner_modes(&*MODES).build().unwrap();
let find_iter = scanner.find_iter(INPUT).with_positions();
let matches: Vec<MatchExt> = find_iter.collect();
```
- Fixed handling of the current scanner mode. Previously, switching the scanner mode from the
outside had no effect on cloned `ScannerImpl` instances. The current mode is now kept in a
field that is shared between all clones.

## 0.3.2 - 2024-09-09

Expand Down
9 changes: 8 additions & 1 deletion src/find_matches.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use crate::{FindMatchesImpl, Match, Position, PositionProvider, ScannerImpl};
use crate::{ FindMatchesImpl, Match, Position, PositionProvider, ScannerImpl };

/// The result of a peek operation.
#[derive(Debug, PartialEq)]
Expand Down Expand Up @@ -89,6 +89,13 @@ impl<'h> FindMatches<'h> {
pub fn advance_to(&mut self, position: usize) -> usize {
self.inner.advance_to(position)
}

/// Returns the current scanner mode. Used for tests and debugging purposes.
///
/// This is a thin delegation to `self.inner.current_mode()`; the value is passed through
/// unchanged as the `usize` mode index reported by the wrapped matcher.
// NOTE(review): `dead_code` is presumably allowed because nothing outside of tests calls
// this method — confirm before removing the attribute.
#[allow(dead_code)]
#[inline]
pub(crate) fn current_mode(&self) -> usize {
self.inner.current_mode()
}
}

impl Iterator for FindMatches<'_> {
Expand Down
7 changes: 7 additions & 0 deletions src/internal/find_matches_impl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,13 @@ impl<'h> FindMatchesImpl<'h> {
}
self.last_char = c;
}

/// Returns the current scanner mode. Used for tests and debugging purposes.
///
/// This is a thin delegation to `self.scanner_impl.current_mode()`; the value is passed
/// through unchanged as the `usize` mode index reported by the scanner implementation.
// NOTE(review): `dead_code` is presumably allowed because nothing outside of tests calls
// this method — confirm before removing the attribute.
#[allow(dead_code)]
#[inline]
pub(crate) fn current_mode(&self) -> usize {
self.scanner_impl.current_mode()
}
}

impl std::fmt::Debug for FindMatchesImpl<'_> {
Expand Down
176 changes: 85 additions & 91 deletions src/internal/scanner_impl.rs
Original file line number Diff line number Diff line change
@@ -1,53 +1,59 @@
use std::sync::Arc;
use std::sync::{ Arc, Mutex };

use log::{debug, trace};
use log::{ debug, trace };

use crate::{FindMatches, Match, Result, ScannerMode, ScnrError};
use crate::{ FindMatches, Match, Result, ScannerMode, ScnrError };

use super::{CharClassID, CharacterClassRegistry, CompiledScannerMode, MatchFunction};
use super::{ CharClassID, CharacterClassRegistry, CompiledScannerMode, MatchFunction };

#[derive(Clone)]
pub(crate) struct ScannerImpl {
pub(crate) character_classes: CharacterClassRegistry,
pub(crate) scanner_modes: Vec<CompiledScannerMode>,
current_mode: usize,
// The function used to match characters to character classes.
pub(crate) match_char_class: Arc<dyn Fn(CharClassID, char) -> bool + 'static + Send + Sync>,
pub(crate) match_char_class: Arc<dyn (Fn(CharClassID, char) -> bool) + 'static + Send + Sync>,
// The current mode is deliberately private, which makes it impossible to construct
// ScannerImpl instances freely.
// ScannerImpl instances are only ever created by the Scanner::try_new method and by the
// clone method. Cloning copies the Arc handle rather than the value, so the current mode is
// always shared between all ScannerImpl instances of the same Scanner instance.
current_mode: Arc<Mutex<usize>>,
}

impl ScannerImpl {
/// Returns an iterator over all non-overlapping matches in `input`.
/// The given scanner implementation is moved into the returned [`FindMatches`] iterator.
/// The iterator yields a [`Match`] value until no more matches can be found.
pub(crate) fn find_iter(scanner: Self, input: &str) -> FindMatches<'_> {
FindMatches::new(scanner, input)
pub(crate) fn find_iter(scanner_impl: Self, input: &str) -> FindMatches<'_> {
FindMatches::new(scanner_impl, input)
}

pub(crate) fn create_match_char_class(
&self,
) -> Result<Box<dyn Fn(CharClassID, char) -> bool + 'static + Send + Sync>> {
let match_functions =
self.character_classes
.iter()
.try_fold(Vec::new(), |mut acc, cc| {
trace!("Create match function for char class {:?}", cc);
let match_function: MatchFunction = cc.ast().try_into()?;
acc.push(match_function);
Ok::<Vec<MatchFunction>, ScnrError>(acc)
})?;
Ok(Box::new(move |char_class, c| {
let res = match_functions[char_class.as_usize()].call(c);
if res {
trace!("Match char class: {:?} {:?} -> {:?}", char_class, c, res);
}
res
}))
&self
) -> Result<Box<dyn (Fn(CharClassID, char) -> bool) + 'static + Send + Sync>> {
let match_functions = self.character_classes.iter().try_fold(Vec::new(), |mut acc, cc| {
trace!("Create match function for char class {:?}", cc);
let match_function: MatchFunction = cc.ast().try_into()?;
acc.push(match_function);
Ok::<Vec<MatchFunction>, ScnrError>(acc)
})?;
Ok(
Box::new(move |char_class, c| {
let res = match_functions[char_class.as_usize()].call(c);
if res {
trace!("Match char class: {:?} {:?} -> {:?}", char_class, c, res);
}
res
})
)
}

/// Executes a leftmost search and returns the first match that is found, if one exists.
/// It starts the search at the position of the given CharIndices iterator.
/// During the search, all DFAs are advanced in parallel by one character at a time.
pub(crate) fn find_from(&mut self, char_indices: std::str::CharIndices) -> Option<Match> {
let patterns = &mut self.scanner_modes[self.current_mode].patterns;
let patterns = &mut self.scanner_modes[*self.current_mode.lock().unwrap()].patterns;
for (dfa, _) in patterns.iter_mut() {
dfa.reset();
}
Expand All @@ -57,15 +63,14 @@ impl ScannerImpl {

for (i, c) in char_indices {
for dfa_index in &active_dfas {
// trace!(
// "Advance DFA #{} with char {:?} and token type {}",
// dfa_index,
// c,
// patterns[*dfa_index].1
// );
patterns[*dfa_index]
.0
.advance(i, c, &*self.match_char_class);
trace!(
"Advance DFA #{} of mode {} with char {:?} and token type {}",
dfa_index,
*self.current_mode.lock().unwrap(),
c,
patterns[*dfa_index].1
);
patterns[*dfa_index].0.advance(i, c, &*self.match_char_class);
}

// trace!("Clear active DFAs");
Expand All @@ -88,7 +93,7 @@ impl ScannerImpl {

let current_match = self.find_first_longest_match();
if let Some(m) = current_match.as_ref() {
self.execute_possible_mode_switch(m)
self.execute_possible_mode_switch(m);
}
current_match
}
Expand All @@ -104,7 +109,7 @@ impl ScannerImpl {
/// It is called by the `peek_n` method of the `FindMatches` iterator on a copy of the
/// `CharIndices` iterator. Thus, the original `CharIndices` iterator is not advanced.
pub(crate) fn peek_from(&mut self, char_indices: std::str::CharIndices) -> Option<Match> {
let patterns = &mut self.scanner_modes[self.current_mode].patterns;
let patterns = &mut self.scanner_modes[*self.current_mode.lock().unwrap()].patterns;
for (dfa, _) in patterns.iter_mut() {
dfa.reset();
}
Expand All @@ -114,9 +119,7 @@ impl ScannerImpl {

for (i, c) in char_indices {
for dfa_index in &active_dfas {
patterns[*dfa_index]
.0
.advance(i, c, &*self.match_char_class);
patterns[*dfa_index].0.advance(i, c, &*self.match_char_class);
}

// We remove all DFAs from `active_dfas` that finished or did not find a match so far.
Expand All @@ -137,13 +140,14 @@ impl ScannerImpl {
fn find_first_longest_match(&mut self) -> Option<Match> {
let mut current_match: Option<Match> = None;
{
let patterns = &self.scanner_modes[self.current_mode].patterns;
let patterns = &self.scanner_modes[*self.current_mode.lock().unwrap()].patterns;
for (dfa, tok_type) in patterns.iter() {
if let Some(dfa_match) = dfa.current_match() {
if current_match.is_none()
|| dfa_match.start < current_match.unwrap().start()
|| dfa_match.start == current_match.unwrap().start()
&& dfa_match.len() > current_match.unwrap().span().len()
if
current_match.is_none() ||
dfa_match.start < current_match.unwrap().start() ||
(dfa_match.start == current_match.unwrap().start() &&
dfa_match.len() > current_match.unwrap().span().len())
{
// We have a match and we continue to look for a longer match.
current_match = Some(Match::new(tok_type.as_usize(), dfa_match));
Expand All @@ -157,17 +161,17 @@ impl ScannerImpl {
/// Executes a possible mode switch if a transition is defined for the token type found.
#[inline]
fn execute_possible_mode_switch(&mut self, current_match: &Match) {
let current_mode = &self.scanner_modes[self.current_mode];
let current_mode = &self.scanner_modes[*self.current_mode.lock().unwrap()];
// We perform a scanner mode switch if a transition is defined for the token type found.
if let Some(next_mode) = current_mode.has_transition(current_match.token_type()) {
self.current_mode = next_mode;
*self.current_mode.lock().unwrap() = next_mode;
}
}

/// Returns the number of the next scanner mode if a transition is defined for the token type.
/// If no transition is defined, `None` is returned.
pub(crate) fn has_transition(&self, token_type: usize) -> Option<usize> {
self.scanner_modes[self.current_mode].has_transition(token_type)
self.scanner_modes[*self.current_mode.lock().unwrap()].has_transition(token_type)
}

/// Returns the name of the scanner mode with the given index.
Expand All @@ -176,20 +180,11 @@ impl ScannerImpl {
self.scanner_modes.get(index).map(|mode| mode.name.as_str())
}

/// Sets the current scanner mode.
///
/// A parser can explicitly set the scanner mode to switch to a different set of DFAs.
/// Usually, the scanner mode is changed by the scanner itself based on the transitions defined
/// in the scanner mode.
pub(crate) fn set_mode(&mut self, mode: usize) {
trace!("Set scanner mode to {}", mode);
self.current_mode = mode;
}

/// Returns the current scanner mode.
/// Returns the current scanner mode. Used for tests and debugging purposes.
#[allow(dead_code)]
#[inline]
pub(crate) fn current_mode(&self) -> usize {
self.current_mode
*self.current_mode.lock().unwrap()
}

/// Traces the compiled DFAs as dot format.
Expand All @@ -210,7 +205,7 @@ impl ScannerImpl {
dfa,
&title,
&self.character_classes,
&mut cursor,
&mut cursor
);
let mut dot_format = String::new();
cursor.set_position(0);
Expand All @@ -228,10 +223,9 @@ impl ScannerImpl {
pub(crate) fn generate_compiled_dfas_as_dot<T>(
&self,
modes: &[ScannerMode],
target_folder: T,
target_folder: T
) -> Result<()>
where
T: AsRef<std::path::Path>,
where T: AsRef<std::path::Path>
{
use std::fs::File;
for (i, scanner_mode) in self.scanner_modes.iter().enumerate() {
Expand All @@ -258,29 +252,29 @@ impl ScannerImpl {
/// Resets the scanner to the initial state.
#[inline]
pub(crate) fn reset(&mut self) {
self.current_mode = 0;
*self.current_mode.lock().unwrap() = 0;
}

/// Replaces this instance's mode handle with the given `current_mode`.
///
/// After the call, this instance reads and writes its scanner mode through the passed
/// `Arc<Mutex<usize>>`, so it observes the same mode value as every other holder of a clone
/// of that `Arc`. The handle this instance held before is dropped.
pub(crate) fn common_mode(&mut self, current_mode: Arc<Mutex<usize>>) {
self.current_mode = current_mode;
}
}

impl TryFrom<Vec<ScannerMode>> for ScannerImpl {
type Error = crate::ScnrError;
fn try_from(scanner_modes: Vec<ScannerMode>) -> Result<Self> {
let mut character_classes = CharacterClassRegistry::new();
let scanner_modes =
scanner_modes
.into_iter()
.try_fold(Vec::new(), |mut acc, scanner_mode| {
acc.push(CompiledScannerMode::try_from_scanner_mode(
scanner_mode,
&mut character_classes,
)?);
Ok::<Vec<CompiledScannerMode>, ScnrError>(acc)
})?;
let scanner_modes = scanner_modes.into_iter().try_fold(Vec::new(), |mut acc, scanner_mode| {
acc.push(
CompiledScannerMode::try_from_scanner_mode(scanner_mode, &mut character_classes)?
);
Ok::<Vec<CompiledScannerMode>, ScnrError>(acc)
})?;

let mut me = Self {
character_classes,
scanner_modes,
current_mode: 0,
current_mode: Arc::new(Mutex::new(0)),
match_char_class: Arc::new(|_, _| false),
};
me.match_char_class = Arc::new(Self::create_match_char_class(&me)?);
Expand All @@ -301,13 +295,13 @@ impl std::fmt::Debug for ScannerImpl {
mod tests {
use super::*;
use crate::ScannerMode;
use std::{convert::TryInto, fs};
use std::{ convert::TryInto, fs };

#[test]
fn test_try_from() {
let scanner_modes = vec![
ScannerMode::new("mode1", vec![("a".to_string(), 0)], vec![]),
ScannerMode::new("mode2", vec![("b".to_string(), 1)], vec![]),
ScannerMode::new("mode2", vec![("b".to_string(), 1)], vec![])
];
let scanner_impl: ScannerImpl = scanner_modes.try_into().unwrap();
assert_eq!(scanner_impl.character_classes.len(), 2);
Expand All @@ -318,24 +312,25 @@ mod tests {
fn test_match_char_class() {
let scanner_modes = vec![
ScannerMode::new("mode1", vec![("a".to_string(), 0)], vec![]),
ScannerMode::new("mode2", vec![("b".to_string(), 1)], vec![]),
ScannerMode::new("mode2", vec![("b".to_string(), 1)], vec![])
];
let scanner_impl: ScannerImpl = scanner_modes.try_into().unwrap();
let match_char_class = scanner_impl.create_match_char_class().unwrap();
assert!(match_char_class(0.into(), 'a'));
assert!(!match_char_class(0.into(), 'b'));
assert!(!match_char_class(0.into(), 'c'));
assert!(!match_char_class(1.into(), 'a'));
assert!(match_char_class(1.into(), 'b'));
assert!(!match_char_class(1.into(), 'c'));
assert!(match_char_class((0).into(), 'a'));
assert!(!match_char_class((0).into(), 'b'));
assert!(!match_char_class((0).into(), 'c'));
assert!(!match_char_class((1).into(), 'a'));
assert!(match_char_class((1).into(), 'b'));
assert!(!match_char_class((1).into(), 'c'));
}

#[test]
fn test_generate_dot_files() {
let path = concat!(env!("CARGO_MANIFEST_DIR"), "/tests/data/string.json");
let file = fs::File::open(path).unwrap();

let scanner_modes: Vec<ScannerMode> = serde_json::from_reader(file)
let scanner_modes: Vec<ScannerMode> = serde_json
::from_reader(file)
.unwrap_or_else(|e| panic!("**** Failed to read json file {path}: {e}"));

let scanner_impl: ScannerImpl = scanner_modes.clone().try_into().unwrap();
Expand All @@ -347,12 +342,11 @@ mod tests {
fs::create_dir_all(target_folder).unwrap();

// Generate the compiled DFAs as dot files.
scanner_impl
.generate_compiled_dfas_as_dot(&scanner_modes, target_folder)
.unwrap();
scanner_impl.generate_compiled_dfas_as_dot(&scanner_modes, target_folder).unwrap();

// Check if the dot files are generated.
let dot_files: Vec<_> = fs::read_dir(target_folder)
let dot_files: Vec<_> = fs
::read_dir(target_folder)
.unwrap()
.map(|entry| entry.unwrap().path())
.collect();
Expand Down
Loading

0 comments on commit f04cae7

Please sign in to comment.