Don't normalize strings in the CLI (#127)

DavisVaughan · web-flow · commit f41a27e86da3 · 2025-01-07T15:16:34.000-05:00
* Normalize multiline strings in the formatter so we don't have to normalize in the CLI

Allowing us to actually take advantage of the `Unchanged` optimization with CRLF endings, and correctly handle `--check` too

* Add a parser snapshot test for multiline strings with CRLF line endings

To prove we can parse these line endings, and to prove that the CRLF ends up in the `RStringValue`

* Add CHANGELOG bullets

* Mention why no `line_ending` crate usage

* Don't use `Cell` after all, since it's more mental overhead than it's worth
diff --git a/.gitattributes b/.gitattributes
@@ -1,4 +1,6 @@
 * text=auto eol=lf
 
 # Windows specific test files where we need CRLF endings
+crates/air/tests/fixtures/crlf/*.R text eol=crlf
 crates/air_r_formatter/tests/specs/r/crlf/*.R text eol=crlf
+crates/air_r_parser/tests/snapshots/ok/crlf/*.R text eol=crlf
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,10 @@
 
 # Development version
 
+- `air format` is now faster on Windows when nothing changes (#90).
+
+- `air format --check` now works correctly with Windows line endings (#123).
+
 - Magic line breaks are now supported in left assignment (#118).
 
 
diff --git a/crates/air/src/commands/format.rs b/crates/air/src/commands/format.rs
@@ -179,24 +179,11 @@ fn format_file(path: PathBuf, mode: FormatMode) -> Result<FormatFileAction, Form
     };
     let options = RFormatOptions::default().with_line_ending(line_ending);
 
-    let source = line_ending::normalize(source);
     let formatted = match format_source(source.as_str(), options) {
         Ok(formatted) => formatted,
         Err(err) => return Err(FormatCommandError::Format(path.clone(), err)),
     };
 
-    // TODO: We rarely ever take advantage of this optimization on Windows right
-    // now. We always normalize on entry but we apply the requested line ending
-    // on exit (so on Windows we often infer CRLF on entry and normalize to
-    // LF, but apply CRLF on exit so `source` and `new` always have different
-    // line endings). We probably need to compare pre-normalized against
-    // post-formatted output?
-    //
-    // Ah, no, the right thing to do is for the cli to not care about normalizing
-    // line endings. This is mostly useful in the LSP for all the document manipulation
-    // we are going to do there. In the cli, we want to format a whole bunch of files
-    // so we really want this optimization, and since the formatter and parser can handle
-    // windows line endings just fine, we should not normalize here.
     match formatted {
         FormattedSource::Formatted(new) => {
             match mode {
@@ -231,8 +218,6 @@ pub(crate) enum FormatSourceError {
 }
 
 /// Formats a vector of `source` code
-///
-/// Safety: `source` should already be normalized to Unix line endings
 pub(crate) fn format_source(
     source: &str,
     options: RFormatOptions,
diff --git a/crates/air/tests/fixtures/crlf/multiline_string_value.R b/crates/air/tests/fixtures/crlf/multiline_string_value.R
@@ -0,0 +1,2 @@
+"multiline
+string"
diff --git a/crates/air/tests/format.rs b/crates/air/tests/format.rs
@@ -1,3 +1,6 @@
+use std::path::Path;
+use std::path::PathBuf;
+
 use air::args::Args;
 use air::run;
 use air::status::ExitStatus;
@@ -21,3 +24,22 @@ fn default_options() -> anyhow::Result<()> {
     assert_eq!(err, ExitStatus::Success);
     Ok(())
 }
+
+#[test]
+fn test_check_returns_cleanly_for_multiline_strings_with_crlf_line_endings() -> anyhow::Result<()> {
+    let fixtures = path_fixtures();
+    let path = fixtures.join("crlf").join("multiline_string_value.R");
+    let path = path.to_str().unwrap();
+
+    let args = Args::parse_from(["", "format", path, "--check"]);
+    let err = run(args)?;
+
+    assert_eq!(err, ExitStatus::Success);
+    Ok(())
+}
+
+fn path_fixtures() -> PathBuf {
+    Path::new(env!("CARGO_MANIFEST_DIR"))
+        .join("tests")
+        .join("fixtures")
+}
diff --git a/crates/air_formatter_test/src/spec.rs b/crates/air_formatter_test/src/spec.rs
@@ -28,9 +28,6 @@ impl<'a> SpecTestFile<'a> {
 
         let input_code = std::fs::read_to_string(input_file).unwrap();
 
-        // Normalize to Unix line endings
-        let input_code = line_ending::normalize(input_code);
-
         // For the whole file, not a specific range right now
         let range_start_index = None;
         let range_end_index = None;
diff --git a/crates/air_r_formatter/src/lib.rs b/crates/air_r_formatter/src/lib.rs
@@ -25,6 +25,7 @@ mod prelude;
 mod r;
 pub(crate) mod separated;
 mod statement_body;
+mod string_literal;
 
 #[rustfmt::skip]
 mod generated;
diff --git a/crates/air_r_formatter/src/r/auxiliary/string_value.rs b/crates/air_r_formatter/src/r/auxiliary/string_value.rs
@@ -1,4 +1,5 @@
 use crate::prelude::*;
+use crate::string_literal::FormatStringLiteralToken;
 use air_r_syntax::RStringValue;
 use air_r_syntax::RStringValueFields;
 use biome_formatter::write;
@@ -8,6 +9,6 @@ pub(crate) struct FormatRStringValue;
 impl FormatNodeRule<RStringValue> for FormatRStringValue {
     fn fmt_fields(&self, node: &RStringValue, f: &mut RFormatter) -> FormatResult<()> {
         let RStringValueFields { value_token } = node.as_fields();
-        write![f, [value_token.format()]]
+        write![f, [FormatStringLiteralToken::new(&value_token?)]]
     }
 }
diff --git a/crates/air_r_formatter/src/string_literal.rs b/crates/air_r_formatter/src/string_literal.rs
@@ -0,0 +1,148 @@
+use air_r_syntax::RSyntaxKind::R_STRING_LITERAL;
+use air_r_syntax::RSyntaxToken;
+use biome_formatter::prelude::syntax_token_cow_slice;
+use biome_formatter::prelude::Formatter;
+use biome_formatter::trivia::format_replaced;
+use biome_formatter::Format;
+use biome_formatter::FormatResult;
+use std::borrow::Cow;
+
+use crate::context::RFormatContext;
+use crate::RFormatter;
+
+/// Helper utility for formatting a string literal token
+///
+/// The main job of this utility is to `normalize()` the string and handle the
+/// complicated way we have to call [format_replaced] with that normalized result.
+pub(crate) struct FormatStringLiteralToken<'token> {
+    /// The string literal token to format
+    token: &'token RSyntaxToken,
+}
+
+impl<'token> FormatStringLiteralToken<'token> {
+    pub(crate) fn new(token: &'token RSyntaxToken) -> Self {
+        Self { token }
+    }
+
+    fn normalize(&self) -> FormatNormalizedStringLiteralToken {
+        let token = self.token;
+
+        debug_assert!(
+            matches!(token.kind(), R_STRING_LITERAL),
+            "Found kind {:?}",
+            token.kind()
+        );
+
+        let text = token.text_trimmed();
+        let text = normalize_string(text);
+
+        FormatNormalizedStringLiteralToken { token, text }
+    }
+}
+
+impl Format<RFormatContext> for FormatStringLiteralToken<'_> {
+    fn fmt(&self, f: &mut RFormatter) -> FormatResult<()> {
+        self.normalize().fmt(f)
+    }
+}
+
+struct FormatNormalizedStringLiteralToken<'token> {
+    /// The original string literal token before normalization
+    token: &'token RSyntaxToken,
+
+    /// The normalized text
+    text: Cow<'token, str>,
+}
+
+impl Format<RFormatContext> for FormatNormalizedStringLiteralToken<'_> {
+    fn fmt(&self, f: &mut Formatter<RFormatContext>) -> FormatResult<()> {
+        format_replaced(
+            self.token,
+            &syntax_token_cow_slice(
+                // Cloning the `Cow<str>` is cheap since 99% of the time it will be the
+                // `Borrowed` variant. Only with multiline strings on Windows will it
+                // ever actually clone the underlying string.
+                self.text.clone(),
+                self.token,
+                self.token.text_trimmed_range().start(),
+            ),
+        )
+        .fmt(f)
+    }
+}
+
+/// Normalize a string, returning a [`Cow::Borrowed`] if the input was already normalized
+///
+/// This function:
+/// - Normalizes all line endings to `\n`
+///
+/// We may perform more normalization in the future. We don't use utilities from the
+/// `line_ending` crate because we don't own the string.
+///
+/// This function is particularly useful for multiline strings, which capture the existing
+/// line ending inside the string token itself. We must normalize those line endings to
+/// `\n` before the formatter -> printer stage, because the printer can't handle other
+/// line endings and will panic on them. At the printer -> string stage at the very end,
+/// the printer will replace all `\n` with the `LineEnding` requested by the user.
+/// https://github.com/biomejs/biome/blob/a658a294087c143b83350cbeb6b44f7a2e9afdd1/crates/biome_formatter/src/printer/mod.rs#L714-L718
+fn normalize_string(input: &str) -> Cow<str> {
+    // The normalized string if `input` is not yet normalized.
+    // `output` must remain empty if `input` is already normalized.
+    let mut output = String::new();
+
+    // Tracks the last index of `input` that has been written to `output`.
+    // If `last_loc` is `0` at the end, then the input is already normalized and can be returned as is.
+    let mut last_loc = 0;
+
+    let mut iter = input.char_indices().peekable();
+
+    while let Some((loc, char)) = iter.next() {
+        if char == '\r' {
+            output.push_str(&input[last_loc..loc]);
+
+            if iter.peek().is_some_and(|(_, next)| next == &'\n') {
+                // CRLF support - skip over the '\r' character, keep the `\n`
+                iter.next();
+            } else {
+                // CR support - Replace the `\r` with a `\n`
+                output.push('\n');
+            }
+
+            last_loc = loc + '\r'.len_utf8();
+        }
+    }
+
+    if last_loc == 0 {
+        Cow::Borrowed(input)
+    } else {
+        output.push_str(&input[last_loc..]);
+        Cow::Owned(output)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::string_literal::normalize_string;
+    use std::borrow::Cow;
+
+    #[test]
+    fn normalize_empty() {
+        let x = "";
+        assert_eq!(normalize_string(x), Cow::Borrowed(x));
+    }
+
+    #[test]
+    fn normalize_newlines() {
+        let x = "abcd";
+        assert_eq!(normalize_string(x), Cow::Borrowed(x));
+
+        let x = "a\nb\nc\nd\n";
+        assert_eq!(normalize_string(x), Cow::Borrowed(x));
+
+        let x = "a\nb\rc\r\nd\n";
+        assert_eq!(
+            normalize_string(x),
+            Cow::Owned::<str>(String::from("a\nb\nc\nd\n"))
+        );
+    }
+}
diff --git a/crates/air_r_parser/tests/snapshots/ok/crlf/multiline_string_value.R b/crates/air_r_parser/tests/snapshots/ok/crlf/multiline_string_value.R
@@ -0,0 +1,2 @@
+"multiline
+string"
diff --git a/crates/air_r_parser/tests/snapshots/ok/crlf/multiline_string_value.R.snap b/crates/air_r_parser/tests/snapshots/ok/crlf/multiline_string_value.R.snap
@@ -0,0 +1,38 @@
+---
+source: crates/air_r_parser/tests/spec_test.rs
+expression: snapshot
+---
+## Input
+
+```R
+"multiline
+string"
+
+```
+
+
+## AST
+
+```
+RRoot {
+    bom_token: missing (optional),
+    expressions: RExpressionList [
+        RStringValue {
+            value_token: R_STRING_LITERAL@0..19 "\"multiline\r\nstring\"" [] [],
+        },
+    ],
+    eof_token: EOF@19..21 "" [Newline("\r\n")] [],
+}
+```
+
+## CST
+
+```
+0: R_ROOT@0..21
+  0: (empty)
+  1: R_EXPRESSION_LIST@0..19
+    0: R_STRING_VALUE@0..19
+      0: R_STRING_LITERAL@0..19 "\"multiline\r\nstring\"" [] []
+  2: EOF@19..21 "" [Newline("\r\n")] []
+
+```
diff --git a/crates/air_r_parser/tests/spec_test.rs b/crates/air_r_parser/tests/spec_test.rs
@@ -52,9 +52,6 @@ pub fn run(test_case: &str, _snapshot_name: &str, test_directory: &str, outcome_
     let content = fs::read_to_string(test_case_path)
         .expect("Expected test path to be a readable file in UTF8 encoding");
 
-    // Normalize to Unix line endings
-    let content = line_ending::normalize(content);
-
     let options = RParserOptions::default();
     let parsed = parse(&content, options);
     let root = RRoot::unwrap_cast(parsed.syntax());

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,5 @@`
`1`	`1`	`use crate::prelude::*;`
	`2`	`+use crate::string_literal::FormatStringLiteralToken;`
`2`	`3`	`use air_r_syntax::RStringValue;`
`3`	`4`	`use air_r_syntax::RStringValueFields;`
`4`	`5`	`use biome_formatter::write;`
`@@ -8,6 +9,6 @@ pub(crate) struct FormatRStringValue;`
`8`	`9`	`impl FormatNodeRule<RStringValue> for FormatRStringValue {`
`9`	`10`	`fn fmt_fields(&self, node: &RStringValue, f: &mut RFormatter) -> FormatResult<()> {`
`10`	`11`	`let RStringValueFields { value_token } = node.as_fields();`
`11`		`- write![f, [value_token.format()]]`
	`12`	`+ write![f, [FormatStringLiteralToken::new(&value_token?)]]`
`12`	`13`	`}`
`13`	`14`	`}`