From 40916a66ee2629c3171a4ff225ca9378101efdde Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Sun, 22 Mar 2026 02:33:36 +0000 Subject: [PATCH] perf: cache compiled regexes in formatter using LazyLock This change improves performance in `src/formatter.rs` by replacing all 15 instances of local `Regex::new` calls with cached `std::sync::LazyLock` static variables. This avoids repeated regex parsing and compilation on every function call, which is particularly beneficial for the MDX formatting pipeline where many transformations are applied in sequence. The use of `std::sync::LazyLock` follows modern Rust idioms (stabilized in Rust 1.80) and provides thread-safe, lazy initialization of the regular expressions. Logic remains identical to the previous implementation as the regex patterns were preserved exactly. --- src/formatter.rs | 117 ++++++++++++++++++++++++----------------------- 1 file changed, 61 insertions(+), 56 deletions(-) diff --git a/src/formatter.rs b/src/formatter.rs index 634dd21..d770340 100644 --- a/src/formatter.rs +++ b/src/formatter.rs @@ -1,8 +1,32 @@ use regex::Regex; use std::fs; use std::path::Path; +use std::sync::LazyLock; use walkdir::WalkDir; +static RE_BLANK_LINES: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\n{3,}").unwrap()); +static RE_HTML_COMMENT: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"").unwrap()); +static RE_BARE_URL: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"<(https?://[^>]+)>").unwrap()); +static RE_BR: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"").unwrap()); +static RE_HR: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"").unwrap()); +static RE_TR_TABLE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\s*").unwrap()); +static RE_EMPTY_TR: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\s*").unwrap()); +static RE_STYLE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r#"style="([^"]*)""#).unwrap()); +static RE_HUGO_CALLOUT_OPEN: LazyLock<Regex> = + LazyLock::new(|| 
Regex::new(r"\{\{[<%]\s*callout\b[^{}]*[>%]\}\}").unwrap()); +static RE_HUGO_CALLOUT_CLOSE: LazyLock<Regex> = + LazyLock::new(|| Regex::new(r"\{\{[<%]\s*/callout\s*[>%]\}\}").unwrap()); +static RE_HUGO_DETAILS_SINGLE: LazyLock<Regex> = LazyLock::new(|| { + Regex::new(r#"\{\{% details title="([^"]*)"[^%]*%\}\}\s*(.+?)\s*\{\{% /details %\}\}"#).unwrap() +}); +static RE_HUGO_DETAILS_OPEN: LazyLock<Regex> = + LazyLock::new(|| Regex::new(r#"\{\{% details title="([^"]*)"[^%]*%\}\}"#).unwrap()); +static RE_HUGO_DETAILS_CLOSE: LazyLock<Regex> = + LazyLock::new(|| Regex::new(r#"([^\n])\s*\{\{% /details %\}\}"#).unwrap()); +static RE_CODE_BLOCK: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"```[\s\S]*?```").unwrap()); +static RE_MATH_BLOCK: LazyLock<Regex> = + LazyLock::new(|| Regex::new(r"\$\$(\r?\n)?([\s\S]*?)(\r?\n)?\$\$").unwrap()); + /// Format a single MDX file with all transformations pub fn format_mdx_file(content: &str) -> String { let mut result = content.to_string(); @@ -20,22 +44,19 @@ pub fn format_mdx_file(content: &str) -> String { result = convert_inline_math(&result); // Clean up multiple consecutive blank lines - let re = Regex::new(r"\n{3,}").unwrap(); - result = re.replace_all(&result, "\n\n").to_string(); + result = RE_BLANK_LINES.replace_all(&result, "\n\n").to_string(); result } /// Remove HTML comments from content fn remove_html_comments(content: &str) -> String { - let re = Regex::new(r"").unwrap(); - re.replace_all(content, "").to_string() + RE_HTML_COMMENT.replace_all(content, "").to_string() } /// Convert bare URLs in angle brackets to Markdown links for MDX compatibility fn convert_bare_urls_to_links(content: &str) -> String { - let re = Regex::new(r"<(https?://[^>]+)>").unwrap(); - re.replace_all(content, "[$1]($1)").to_string() + RE_BARE_URL.replace_all(content, "[$1]($1)").to_string() } /// Remove shield.io badges (markdown image syntax) @@ -52,12 +73,10 @@ fn fix_self_closing_tags(content: &str) -> String { let mut result = content.to_string(); // Convert
to
- let re_br = Regex::new(r"").unwrap(); - result = re_br.replace_all(&result, "
").to_string(); + result = RE_BR.replace_all(&result, "
").to_string(); // Convert
to
- let re_hr = Regex::new(r"").unwrap(); - result = re_hr.replace_all(&result, "
").to_string(); + result = RE_HR.replace_all(&result, "
").to_string(); result } @@ -67,12 +86,10 @@ fn fix_malformed_html(content: &str) -> String { let mut result = content.to_string(); // Remove empty tags before closing table - let re_tr_table = Regex::new(r"\s*").unwrap(); - result = re_tr_table.replace_all(&result, "").to_string(); + result = RE_TR_TABLE.replace_all(&result, "").to_string(); // Remove empty tags - let re_empty_tr = Regex::new(r"\s*").unwrap(); - result = re_empty_tr.replace_all(&result, "").to_string(); + result = RE_EMPTY_TR.replace_all(&result, "").to_string(); result } @@ -99,33 +116,32 @@ fn css_property_to_camel_case(prop: &str) -> String { /// Convert HTML style attributes to JSX format fn convert_style_to_jsx(content: &str) -> String { - let re = Regex::new(r#"style="([^"]*)""#).unwrap(); - - re.replace_all(content, |caps: &regex::Captures| { - let style_str = &caps[1]; - let mut jsx_props = Vec::new(); + RE_STYLE + .replace_all(content, |caps: &regex::Captures| { + let style_str = &caps[1]; + let mut jsx_props = Vec::new(); + + for prop in style_str.split(';') { + let prop = prop.trim(); + if prop.is_empty() || !prop.contains(':') { + continue; + } - for prop in style_str.split(';') { - let prop = prop.trim(); - if prop.is_empty() || !prop.contains(':') { - continue; + let parts: Vec<&str> = prop.splitn(2, ':').collect(); + if parts.len() == 2 { + let name = css_property_to_camel_case(parts[0].trim()); + let value = parts[1].trim(); + jsx_props.push(format!("{}: \"{}\"", name, value)); + } } - let parts: Vec<&str> = prop.splitn(2, ':').collect(); - if parts.len() == 2 { - let name = css_property_to_camel_case(parts[0].trim()); - let value = parts[1].trim(); - jsx_props.push(format!("{}: \"{}\"", name, value)); + if jsx_props.is_empty() { + String::new() + } else { + format!("style={{{{{}}}}}", jsx_props.join(", ")) } - } - - if jsx_props.is_empty() { - String::new() - } else { - format!("style={{{{{}}}}}", jsx_props.join(", ")) - } - }) - .to_string() + }) + .to_string() } /// Remove Hugo 
callout shortcodes that are invalid in MDX. @@ -134,13 +150,11 @@ fn convert_hugo_callout_shortcodes(content: &str) -> String { // Remove opening callout tags such as: // {{< callout type="info" >}} or {{% callout type="warning" %}} - let re_open = Regex::new(r"\{\{[<%]\s*callout\b[^{}]*[>%]\}\}").unwrap(); - result = re_open.replace_all(&result, "").to_string(); + result = RE_HUGO_CALLOUT_OPEN.replace_all(&result, "").to_string(); // Remove closing callout tags such as: // {{< /callout >}} or {{% /callout %}} - let re_close = Regex::new(r"\{\{[<%]\s*/callout\s*[>%]\}\}").unwrap(); - result = re_close.replace_all(&result, "").to_string(); + result = RE_HUGO_CALLOUT_CLOSE.replace_all(&result, "").to_string(); result } @@ -150,23 +164,18 @@ fn convert_hugo_details_to_accordion(content: &str) -> String { let mut result = content.to_string(); // First, handle single-line shortcodes: {{% details title="..." %}} content {{% /details %}} - let re_single_line = - Regex::new(r#"\{\{% details title="([^"]*)"[^%]*%\}\}\s*(.+?)\s*\{\{% /details %\}\}"#) - .unwrap(); - result = re_single_line + result = RE_HUGO_DETAILS_SINGLE .replace_all(&result, "\n$2\n") .to_string(); // Convert opening tags - let re_open = Regex::new(r#"\{\{% details title="([^"]*)"[^%]*%\}\}"#).unwrap(); - result = re_open + result = RE_HUGO_DETAILS_OPEN .replace_all(&result, r#""#) .to_string(); // Convert closing tags - ensure they're on their own line for MDX compatibility // Replace any occurrence where {{% /details %}} appears at end of line content - let re_closing = Regex::new(r#"([^\n])\s*\{\{% /details %\}\}"#).unwrap(); - result = re_closing + result = RE_HUGO_DETAILS_CLOSE .replace_all(&result, "$1\n") .to_string(); @@ -183,12 +192,11 @@ fn convert_hugo_details_to_accordion(content: &str) -> String { /// Preserves whether there's a newline after the opening $$ fn convert_math_blocks(content: &str) -> String { // First, extract and protect code blocks - let code_block_re = 
Regex::new(r"```[\s\S]*?```").unwrap(); let mut code_blocks = Vec::new(); let mut protected_content = content.to_string(); // Replace code blocks with placeholders - for (i, mat) in code_block_re.find_iter(content).enumerate() { + for (i, mat) in RE_CODE_BLOCK.find_iter(content).enumerate() { code_blocks.push(mat.as_str().to_string()); let placeholder = format!("___CODE_BLOCK_PLACEHOLDER_{}___", i); protected_content = protected_content.replacen(mat.as_str(), &placeholder, 1); @@ -196,9 +204,7 @@ fn convert_math_blocks(content: &str) -> String { // Match $$ ... $$ (both inline and block forms) only outside code blocks // This regex captures: opening $$, optional newline, content, optional newline, closing $$ - let re = Regex::new(r"\$\$(\r?\n)?([\s\S]*?)(\r?\n)?\$\$").unwrap(); - - let result = re + let result = RE_MATH_BLOCK .replace_all(&protected_content, |caps: &regex::Captures| { let has_opening_newline = caps.get(1).is_some(); let math_content = &caps[2]; @@ -229,12 +235,11 @@ fn convert_math_blocks(content: &str) -> String { /// Only converts single dollar signs, not double dollar signs fn convert_inline_math(content: &str) -> String { // First, extract and protect code blocks - let code_block_re = Regex::new(r"```[\s\S]*?```").unwrap(); let mut code_blocks = Vec::new(); let mut protected_content = content.to_string(); // Replace code blocks with placeholders - for (i, mat) in code_block_re.find_iter(content).enumerate() { + for (i, mat) in RE_CODE_BLOCK.find_iter(content).enumerate() { code_blocks.push(mat.as_str().to_string()); let placeholder = format!("___CODE_BLOCK_PLACEHOLDER_{}___", i); protected_content = protected_content.replacen(mat.as_str(), &placeholder, 1);