diff --git a/Cargo.lock b/Cargo.lock index 6a293972..4980c6f7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -370,17 +370,6 @@ dependencies = [ "serde", ] -[[package]] -name = "biome_lsp_converters" -version = "0.1.0" -source = "git+https://github.com/biomejs/biome?rev=2648fa4201be4afd26f44eca1a4e77aac0a67272#2648fa4201be4afd26f44eca1a4e77aac0a67272" -dependencies = [ - "anyhow", - "biome_rowan", - "rustc-hash", - "tower-lsp 0.20.0 (registry+https://github.com/rust-lang/crates.io-index)", -] - [[package]] name = "biome_markup" version = "0.5.7" @@ -651,28 +640,6 @@ dependencies = [ "cfg-if", ] -[[package]] -name = "crossbeam" -version = "0.8.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1137cd7e7fc0fb5d3c5a8678be38ec56e819125d8d7907411fe24ccb943faca8" -dependencies = [ - "crossbeam-channel", - "crossbeam-deque", - "crossbeam-epoch", - "crossbeam-queue", - "crossbeam-utils", -] - -[[package]] -name = "crossbeam-channel" -version = "0.5.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33480d6946193aa8033910124896ca395333cae7e2d1113d1fef6c3272217df2" -dependencies = [ - "crossbeam-utils", -] - [[package]] name = "crossbeam-deque" version = "0.8.5" @@ -692,15 +659,6 @@ dependencies = [ "crossbeam-utils", ] -[[package]] -name = "crossbeam-queue" -version = "0.3.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df0346b5d5e76ac2fe4e327c5fd1118d6be7c51dfb18f9b7922923f287471e35" -dependencies = [ - "crossbeam-utils", -] - [[package]] name = "crossbeam-utils" version = "0.8.20" @@ -1360,13 +1318,10 @@ dependencies = [ "anyhow", "assert_matches", "biome_formatter", - "biome_lsp_converters", - "biome_parser", "biome_rowan", "biome_text_size", "bytes", "crates", - "crossbeam", "dissimilar", "futures", "futures-util", @@ -1379,6 +1334,7 @@ dependencies = [ "serde", "serde_json", "settings", + "source_file", "struct-field-names-as-array", "strum", "tempfile", @@ -1386,12 +1342,11 @@ dependencies = [ "time", "tokio", "tokio-util", - "tower-lsp 0.20.0 (git+https://github.com/lionel-/tower-lsp?branch=bugfix%2Fpatches)", + "tower-lsp", "tracing", "tracing-subscriber", "tree-sitter", "tree-sitter-r", - "triomphe", "url", "uuid", "workspace", @@ -1423,7 +1378,7 @@ dependencies = [ "serde_json", "tokio", "tokio-util", - "tower-lsp 0.20.0 (git+https://github.com/lionel-/tower-lsp?branch=bugfix%2Fpatches)", + "tower-lsp", "tracing", "url", ] @@ -1995,6 +1950,13 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "source_file" +version = "0.0.0" +dependencies = [ + "biome_text_size", +] + [[package]] name = "spdx" version = "0.10.8" @@ -2337,29 +2299,6 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" -[[package]] -name = "tower-lsp" -version = "0.20.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4ba052b54a6627628d9b3c34c176e7eda8359b7da9acd497b9f20998d118508" -dependencies = [ - "async-trait", - "auto_impl", - "bytes", - "dashmap 5.5.3", - "futures", - "httparse", - "lsp-types", - "memchr", - "serde", - "serde_json", - "tokio", - "tokio-util", - "tower", - "tower-lsp-macros 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)", - "tracing", -] - [[package]] name = "tower-lsp" version = "0.20.0" @@ -2378,21 +2317,10 @@ dependencies = [ "tokio", "tokio-util", "tower", - "tower-lsp-macros 0.9.0 
(git+https://github.com/lionel-/tower-lsp?branch=bugfix%2Fpatches)", + "tower-lsp-macros", "tracing", ] -[[package]] -name = "tower-lsp-macros" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84fd902d4e0b9a4b27f2f440108dc034e1758628a9b702f8ec61ad66355422fa" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.90", -] - [[package]] name = "tower-lsp-macros" version = "0.9.0" @@ -2494,16 +2422,6 @@ dependencies = [ "tree-sitter-language", ] -[[package]] -name = "triomphe" -version = "0.1.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef8f7726da4807b58ea5c96fdc122f80702030edc33b35aff9190a51148ccc85" -dependencies = [ - "serde", - "stable_deref_trait", -] - [[package]] name = "unicode-bom" version = "2.0.3" diff --git a/Cargo.toml b/Cargo.toml index ce5ea5b4..e71000ed 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,6 +34,7 @@ line_ending = { path = "./crates/line_ending" } lsp = { path = "./crates/lsp" } lsp_test = { path = "./crates/lsp_test" } settings = { path = "./crates/settings" } +source_file = { path = "./crates/source_file" } tests_macros = { path = "./crates/tests_macros" } workspace = { path = "./crates/workspace" } @@ -42,7 +43,6 @@ assert_matches = "1.5.0" biome_console = { git = "https://github.com/biomejs/biome", rev = "2648fa4201be4afd26f44eca1a4e77aac0a67272" } biome_diagnostics = { git = "https://github.com/biomejs/biome", rev = "2648fa4201be4afd26f44eca1a4e77aac0a67272" } biome_formatter = { git = "https://github.com/biomejs/biome", rev = "2648fa4201be4afd26f44eca1a4e77aac0a67272" } -biome_lsp_converters = { git = "https://github.com/biomejs/biome", rev = "2648fa4201be4afd26f44eca1a4e77aac0a67272" } biome_parser = { git = "https://github.com/biomejs/biome", rev = "2648fa4201be4afd26f44eca1a4e77aac0a67272" } biome_rowan = { git = "https://github.com/biomejs/biome", rev = "2648fa4201be4afd26f44eca1a4e77aac0a67272" } biome_string_case = { git = "https://github.com/biomejs/biome", rev = "2648fa4201be4afd26f44eca1a4e77aac0a67272" } @@ -52,7 +52,6 @@ bytes = "1.8.0" cargo_metadata = "0.19.1" clap = { version = "4.5.20", features = ["derive"] } colored = "3.0.0" -crossbeam = "0.8.4" dissimilar = "1.0.9" futures = "0.3.31" futures-util = "0.3.31" @@ -82,7 +81,6 @@ tracing = { version = "0.1.40", default-features = false, features = ["std"] } tracing-subscriber = "0.3.19" tree-sitter = "0.23.0" tree-sitter-r = { git = "https://github.com/r-lib/tree-sitter-r", rev = "a0d3e3307489c3ca54da8c7b5b4e0c5f5fd6953a" } -triomphe = "0.1.14" url = "2.5.3" uuid = { version = "1.11.0", features = ["v4"] } diff --git a/crates/crates/src/snapshots/crates__tests__crate_names.snap b/crates/crates/src/snapshots/crates__tests__crate_names.snap index 3914755c..1e56d317 100644 --- a/crates/crates/src/snapshots/crates__tests__crate_names.snap +++ b/crates/crates/src/snapshots/crates__tests__crate_names.snap @@ -17,6 +17,7 @@ expression: AIR_CRATE_NAMES "lsp", "lsp_test", "settings", + "source_file", "tests_macros", "workspace", "xtask", diff --git a/crates/lsp/Cargo.toml b/crates/lsp/Cargo.toml index 32685961..b2063445 100644 --- a/crates/lsp/Cargo.toml +++ b/crates/lsp/Cargo.toml @@ -18,20 +18,17 @@ air_r_parser.workspace = true air_r_syntax.workspace = true anyhow.workspace = true biome_formatter.workspace = true -biome_lsp_converters.workspace = true -biome_parser.workspace = true biome_rowan.workspace = true biome_text_size.workspace = true crates.workspace = true -crossbeam.workspace = true dissimilar.workspace = 
true
 futures.workspace = true
 itertools.workspace = true
 line_ending.workspace = true
-memchr.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 settings.workspace = true
+source_file.workspace = true
 struct-field-names-as-array.workspace = true
 strum = { workspace = true, features = ["derive"] }
 time = { workspace = true }
@@ -41,7 +38,6 @@ tracing.workspace = true
 tracing-subscriber = { workspace = true, features = ["ansi", "local-time"] }
 tree-sitter.workspace = true
 tree-sitter-r.workspace = true
-triomphe.workspace = true
 url.workspace = true
 uuid = { workspace = true, features = ["v4"] }
 workspace = { workspace = true }
diff --git a/crates/lsp/src/rust_analyzer/diff.rs b/crates/lsp/src/diff.rs
similarity index 93%
rename from crates/lsp/src/rust_analyzer/diff.rs
rename to crates/lsp/src/diff.rs
index 6587def8..00c5505f 100644
--- a/crates/lsp/src/rust_analyzer/diff.rs
+++ b/crates/lsp/src/diff.rs
@@ -4,9 +4,10 @@
 // origin = "https://github.com/rust-lang/rust-analyzer/blob/8d5e91c9/crates/rust-analyzer/src/handlers/request.rs#L2483"
 // ---
 
-use biome_text_size::{TextRange, TextSize};
+use biome_text_size::TextRange;
+use biome_text_size::TextSize;
 
-use super::text_edit::TextEdit;
+use crate::text_edit::TextEdit;
 
 pub(crate) fn diff(left: &str, right: &str) -> TextEdit {
     use dissimilar::Chunk;
diff --git a/crates/lsp/src/documents.rs b/crates/lsp/src/documents.rs
index 3d6160a2..f99a9f23 100644
--- a/crates/lsp/src/documents.rs
+++ b/crates/lsp/src/documents.rs
@@ -5,26 +5,28 @@
 //
 //
 
-use biome_lsp_converters::{line_index, PositionEncoding};
 use settings::LineEnding;
+use source_file::LineOffsetEncoding;
+use source_file::SourceFile;
 use tower_lsp::lsp_types;
 
-use crate::rust_analyzer::line_index::LineIndex;
-use crate::rust_analyzer::utils::apply_document_changes;
 use crate::settings::DocumentSettings;
 
 #[derive(Clone)]
 pub struct Document {
     /// The normalized current contents of the document. UTF-8 Rust string with
     /// Unix line endings.
-    pub contents: String,
+    pub source_file: SourceFile,
 
-    /// Map of new lines in `contents`. Also contains line endings type in the
-    /// original document (we only store Unix lines) and the position encoding
-    /// type of the session. This provides all that is needed to send data back
-    /// to the client with positions in the correct coordinate space and
-    /// correctly formatted text.
-    pub line_index: LineIndex,
+    /// The encoding negotiated with the client for this document, used when
+    /// converting line offsets to and from the client, i.e. for
+    /// [lsp_types::Position] <-> [biome_text_size::TextSize] conversions.
+    pub encoding: LineOffsetEncoding,
+
+    /// The line endings used in the [SourceFile]. Always [LineEnding::Lf] due to
+    /// up-front normalization, but other helpers take it as an argument, so we
+    /// keep it around.
+    pub endings: LineEnding,
 
     /// We store the syntax tree in the document for now.
     /// We will think about laziness and incrementality in the future.
@@ -47,11 +49,7 @@ impl std::fmt::Debug for Document {
 }
 
 impl Document {
-    pub fn new(
-        contents: String,
-        version: Option<i32>,
-        position_encoding: PositionEncoding,
-    ) -> Self {
+    pub fn new(contents: String, version: Option<i32>, encoding: LineOffsetEncoding) -> Self {
         // Detect existing endings
         let endings = line_ending::infer(&contents);
 
@@ -61,23 +59,18 @@ impl Document {
             LineEnding::Crlf => line_ending::normalize(contents),
         };
 
-        // TODO: Handle user requested line ending preference here
-        // by potentially overwriting `endings` if the user didn't
-        // select `LineEndings::Auto`, and then pass that to `LineIndex`.
+        // Always Unix line endings
+        let endings = LineEnding::Lf;
 
-        // Create line index to keep track of newline offsets
-        let line_index = LineIndex {
-            index: triomphe::Arc::new(line_index::LineIndex::new(&contents)),
-            endings,
-            encoding: position_encoding,
-        };
+        let source_file = SourceFile::new(contents);
 
         // Parse document immediately for now
-        let parse = air_r_parser::parse(&contents, Default::default());
+        let parse = air_r_parser::parse(source_file.contents(), Default::default());
 
         Self {
-            contents,
-            line_index,
+            source_file,
+            encoding,
+            endings,
             parse,
             version,
             settings: Default::default(),
@@ -86,19 +79,21 @@ impl Document {
 
     /// For unit tests
     pub fn doodle(contents: &str) -> Self {
-        Self::new(contents.into(), None, PositionEncoding::Utf8)
+        Self::new(contents.into(), None, LineOffsetEncoding::UTF8)
     }
 
     #[cfg(test)]
     pub fn doodle_and_range(contents: &str) -> (Self, biome_text_size::TextRange) {
         let (contents, range) = crate::test::extract_marked_range(contents);
-        let doc = Self::new(contents, None, PositionEncoding::Utf8);
+        let doc = Self::new(contents, None, LineOffsetEncoding::UTF8);
         (doc, range)
     }
 
-    pub fn on_did_change(&mut self, mut params: lsp_types::DidChangeTextDocumentParams) {
-        let new_version = params.text_document.version;
-
+    pub fn apply_changes(
+        &mut self,
+        mut changes: Vec<lsp_types::TextDocumentContentChangeEvent>,
+        new_version: i32,
+    ) {
         // Check for out-of-order change notifications
         if let Some(old_version) = self.version {
             // According to the spec, versions might not be consecutive but they must be monotonically
@@ -116,23 +111,35 @@ impl Document {
         // replaced text can't invalidate the text change events, even those
         // applied subsequently, since those changes are specified with [line,
        // col] coordinates.
-        for event in &mut params.content_changes {
+        for event in &mut changes {
             let text = std::mem::take(&mut event.text);
             event.text = line_ending::normalize(text);
         }
 
-        let contents = apply_document_changes(
-            self.line_index.encoding,
-            &self.contents,
-            params.content_changes,
-        );
-
-        // No incrementality for now
-        let parse = air_r_parser::parse(&contents, Default::default());
+        if let [lsp_types::TextDocumentContentChangeEvent { range: None, .. }] = changes.as_slice()
+        {
+            tracing::trace!("Fast path - replacing entire document");
+            // Unwrap: If-let ensures there is exactly 1 change event
+            let change = changes.pop().unwrap();
+            self.source_file = SourceFile::new(change.text);
+        } else {
+            // Handle all changes individually
+            for lsp_types::TextDocumentContentChangeEvent { range, text, .. 
} in changes { + if let Some(range) = range { + // Replace a range and reanalyze the line starts + let range = + crate::from_proto::text_range(range, &self.source_file, self.encoding); + self.source_file + .replace_range(usize::from(range.start())..usize::from(range.end()), &text); + } else { + // Replace the whole file + self.source_file = SourceFile::new(text); + } + } + } - self.parse = parse; - self.contents = contents; - self.line_index.index = triomphe::Arc::new(line_index::LineIndex::new(&self.contents)); + // Update other fields + self.parse = air_r_parser::parse(self.source_file.contents(), Default::default()); self.version = Some(new_version); } @@ -147,18 +154,11 @@ mod tests { use air_r_syntax::RSyntaxNode; use biome_text_size::{TextRange, TextSize}; - use crate::rust_analyzer::text_edit::TextEdit; + use crate::text_edit::TextEdit; use crate::to_proto; use super::*; - fn dummy_versioned_doc() -> lsp_types::VersionedTextDocumentIdentifier { - lsp_types::VersionedTextDocumentIdentifier { - uri: url::Url::parse("file:///foo").unwrap(), - version: 1, - } - } - #[test] fn test_document_starts_at_0_with_leading_whitespace() { let document = Document::doodle("\n\n# hi there"); @@ -180,13 +180,8 @@ mod tests { TextRange::new(TextSize::from(4_u32), TextSize::from(7)), String::from("1 + 2"), ); - let edits = to_proto::doc_edit_vec(&doc.line_index, edit).unwrap(); - - let params = lsp_types::DidChangeTextDocumentParams { - text_document: dummy_versioned_doc(), - content_changes: edits, - }; - doc.on_did_change(params); + let edits = to_proto::doc_change_vec(edit, &doc.source_file, doc.encoding, doc.endings); + doc.apply_changes(edits, 1); let updated_syntax: RSyntaxNode = doc.parse.syntax(); insta::assert_debug_snapshot!(updated_syntax); @@ -218,33 +213,23 @@ mod tests { }, }; - let mut utf8_replace_params = lsp_types::DidChangeTextDocumentParams { - text_document: dummy_versioned_doc(), - content_changes: vec![], - }; - let mut utf16_replace_params = utf8_replace_params.clone(); - - utf8_replace_params.content_changes = vec![lsp_types::TextDocumentContentChangeEvent { + let utf8_content_changes = vec![lsp_types::TextDocumentContentChangeEvent { range: Some(utf8_range), range_length: None, text: String::from("bar"), }]; - utf16_replace_params.content_changes = vec![lsp_types::TextDocumentContentChangeEvent { + let utf16_content_changes = vec![lsp_types::TextDocumentContentChangeEvent { range: Some(utf16_range), range_length: None, text: String::from("bar"), }]; - let mut document = Document::new("a𐐀b".into(), None, PositionEncoding::Utf8); - document.on_did_change(utf8_replace_params); - assert_eq!(document.contents, "a𐐀bar"); + let mut document = Document::new("a𐐀b".into(), None, LineOffsetEncoding::UTF8); + document.apply_changes(utf8_content_changes, 1); + assert_eq!(document.source_file.contents(), "a𐐀bar"); - let mut document = Document::new( - "a𐐀b".into(), - None, - PositionEncoding::Wide(biome_lsp_converters::WideEncoding::Utf16), - ); - document.on_did_change(utf16_replace_params); - assert_eq!(document.contents, "a𐐀bar"); + let mut document = Document::new("a𐐀b".into(), None, LineOffsetEncoding::UTF16); + document.apply_changes(utf16_content_changes, 1); + assert_eq!(document.source_file.contents(), "a𐐀bar"); } } diff --git a/crates/lsp/src/encoding.rs b/crates/lsp/src/encoding.rs deleted file mode 100644 index be8062a6..00000000 --- a/crates/lsp/src/encoding.rs +++ /dev/null @@ -1,61 +0,0 @@ -// -// encoding.rs -// -// Copyright (C) 2024 Posit Software, PBC. 
All rights reserved. -// -// - -/// Converts a character offset into a particular line from UTF-16 to UTF-8 -fn convert_character_from_utf16_to_utf8(x: &str, character: usize) -> usize { - if x.is_ascii() { - // Fast pass - return character; - } - - // Initial check, since loop would skip this case - if character == 0 { - return character; - } - - let mut n = 0; - - // For each `u32` sized `char`, figure out the equivalent size in UTF-16 - // world of that `char`. Once we hit the requested number of `character`s, - // that means we have indexed into `x` to the correct position, at which - // point we can take the current bytes based `pos` that marks the start of - // this `char`, and add on its UTF-8 based size to return an adjusted column - // offset. We use `==` because I'm fairly certain they should always align - // exactly, and it would be good to log if that isn't the case. - for (pos, char) in x.char_indices() { - n += char.len_utf16(); - - if n == character { - return pos + char.len_utf8(); - } - } - - tracing::error!("Failed to locate UTF-16 offset of {character}. Line: '{x}'."); - 0 -} - -/// Converts a character offset into a particular line from UTF-8 to UTF-16 -fn convert_character_from_utf8_to_utf16(x: &str, character: usize) -> usize { - if x.is_ascii() { - // Fast pass - return character; - } - - // The UTF-8 -> UTF-16 case is slightly simpler. We just slice into `x` - // using our existing UTF-8 offset, reencode the slice as a UTF-16 based - // iterator, and count up the pieces. - match x.get(..character) { - Some(x) => x.encode_utf16().count(), - None => { - let n = x.len(); - tracing::error!( - "Tried to take UTF-8 character {character}, but only {n} characters exist. Line: '{x}'." - ); - 0 - } - } -} diff --git a/crates/lsp/src/from_proto.rs b/crates/lsp/src/from_proto.rs index a3329d30..62584ceb 100644 --- a/crates/lsp/src/from_proto.rs +++ b/crates/lsp/src/from_proto.rs @@ -1,37 +1,31 @@ -pub(crate) use biome_lsp_converters::from_proto::offset; -pub(crate) use biome_lsp_converters::from_proto::text_range; - +use biome_text_size::TextRange; +use biome_text_size::TextSize; +use source_file::LineNumber; +use source_file::LineOffset; +use source_file::LineOffsetEncoding; +use source_file::SourceFile; +use source_file::SourceLocation; use tower_lsp::lsp_types; -use crate::documents::Document; - -pub fn apply_text_edits( - doc: &Document, - mut edits: Vec, -) -> anyhow::Result { - let mut text = doc.contents.clone(); - - // Apply edits from bottom to top to avoid inserted newlines to invalidate - // positions in earlier parts of the doc (they are sent in reading order - // accorder to the LSP protocol) - edits.reverse(); - - for edit in edits { - let start: usize = offset( - &doc.line_index.index, - edit.range.start, - doc.line_index.encoding, - )? - .into(); - let end: usize = offset( - &doc.line_index.index, - edit.range.end, - doc.line_index.encoding, - )? 
- .into(); - - text.replace_range(start..end, &edit.new_text); - } +pub fn offset( + position: lsp_types::Position, + source: &SourceFile, + encoding: LineOffsetEncoding, +) -> TextSize { + let source_location = SourceLocation::new( + LineNumber::from(position.line), + LineOffset::new(position.character, encoding), + ); + source.offset(source_location) +} - Ok(text) +pub fn text_range( + range: lsp_types::Range, + source: &SourceFile, + encoding: LineOffsetEncoding, +) -> TextRange { + TextRange::new( + self::offset(range.start, source, encoding), + self::offset(range.end, source, encoding), + ) } diff --git a/crates/lsp/src/handlers_ext.rs b/crates/lsp/src/handlers_ext.rs index 2d3b4f59..58fb0c35 100644 --- a/crates/lsp/src/handlers_ext.rs +++ b/crates/lsp/src/handlers_ext.rs @@ -32,7 +32,7 @@ pub(crate) fn view_file(params: ViewFileParams, state: &WorldState) -> anyhow::R .set_language(&tree_sitter_r::LANGUAGE.into()) .unwrap(); - let ast = parser.parse(&doc.contents, None).unwrap(); + let ast = parser.parse(doc.source_file.contents(), None).unwrap(); if ast.root_node().has_error() { return Ok(String::from("*Parse error*")); diff --git a/crates/lsp/src/handlers_format.rs b/crates/lsp/src/handlers_format.rs index 91e455d2..43f6152a 100644 --- a/crates/lsp/src/handlers_format.rs +++ b/crates/lsp/src/handlers_format.rs @@ -51,14 +51,16 @@ pub(crate) fn document_formatting( return Ok(None); } - let format_options = workspace_settings.to_format_options(&doc.contents, &doc.settings); + let format_options = + workspace_settings.to_format_options(doc.source_file.contents(), &doc.settings); - match format_source_with_parse(&doc.contents, &doc.parse, format_options)? { + match format_source_with_parse(doc.source_file.contents(), &doc.parse, format_options)? 
{ FormattedSource::Changed(formatted) => Ok(Some(to_proto::replace_all_edit( - &doc.line_index, - &doc.contents, &formatted, - )?)), + &doc.source_file, + doc.encoding, + doc.endings, + ))), FormattedSource::Unchanged => Ok(None), } } @@ -97,8 +99,7 @@ pub(crate) fn document_range_formatting( return Ok(None); } - let range = - from_proto::text_range(&doc.line_index.index, params.range, doc.line_index.encoding)?; + let range = from_proto::text_range(params.range, &doc.source_file, doc.encoding); let logical_lines = find_deepest_enclosing_logical_lines(doc.parse.syntax(), range); if logical_lines.is_empty() { @@ -134,7 +135,8 @@ pub(crate) fn document_range_formatting( let eof = air_r_syntax::RSyntaxToken::new_detached(RSyntaxKind::EOF, "", vec![], vec![]); let root = air_r_factory::r_root(list, eof).build(); - let format_options = workspace_settings.to_format_options(&doc.contents, &doc.settings); + let format_options = + workspace_settings.to_format_options(doc.source_file.contents(), &doc.settings); let format_info = biome_formatter::format_sub_tree( root.syntax(), @@ -150,7 +152,13 @@ pub(crate) fn document_range_formatting( // Remove last hard break line from our artifical expression list format_text.pop(); - let edits = to_proto::replace_range_edit(&doc.line_index, format_range, format_text)?; + let edits = to_proto::replace_range_edit( + format_range, + format_text, + &doc.source_file, + doc.encoding, + doc.endings, + ); Ok(Some(edits)) } @@ -279,7 +287,7 @@ mod tests { use crate::test::new_test_client; use crate::test::FileName; use crate::test::TestClientExt; - use biome_lsp_converters::PositionEncoding; + use source_file::LineOffsetEncoding; use std::path::Path; use tower_lsp::lsp_types::DidChangeWorkspaceFoldersParams; use tower_lsp::lsp_types::WorkspaceFolder; @@ -685,7 +693,7 @@ mod tests { let output = "1 + 1\n"; let url = as_file_url(tempdir.join("test.R").as_path()); let filename = FileName::Url(url); - let doc = Document::new(input.to_string(), Some(0), PositionEncoding::Utf8); + let doc = Document::new(input.to_string(), Some(0), LineOffsetEncoding::UTF8); let result = client.format_document(&doc, filename).await; assert_eq!(result, output); @@ -716,7 +724,7 @@ default-exclude = false let input = "1+1"; let url = as_file_url(tempdir.join("test.R").as_path()); let filename = FileName::Url(url); - let doc = Document::new(input.to_string(), Some(0), PositionEncoding::Utf8); + let doc = Document::new(input.to_string(), Some(0), LineOffsetEncoding::UTF8); let result = client.format_document(&doc, filename).await; assert_eq!(result, input); @@ -725,7 +733,7 @@ default-exclude = false let output = "1 + 1\n"; let url = as_file_url(tempdir.join("cpp11.R").as_path()); let filename = FileName::Url(url); - let doc = Document::new(input.to_string(), Some(0), PositionEncoding::Utf8); + let doc = Document::new(input.to_string(), Some(0), LineOffsetEncoding::UTF8); let result = client.format_document(&doc, filename).await; assert_eq!(result, output); } @@ -753,7 +761,7 @@ igraph::graph_from_literal(Alice + --+Jerry) .trim_start(); let url = as_file_url(tempdir.join("test.R").as_path()); let filename = FileName::Url(url); - let doc = Document::new(input.to_string(), Some(0), PositionEncoding::Utf8); + let doc = Document::new(input.to_string(), Some(0), LineOffsetEncoding::UTF8); let result = client.format_document(&doc, filename).await; assert_eq!(result, output); @@ -792,7 +800,7 @@ igraph::graph_from_literal(Alice +--+ Jerry) .trim_start(); let url = 
as_file_url(tempdir.join("test.R").as_path()); let filename = FileName::Url(url); - let doc = Document::new(input.to_string(), Some(0), PositionEncoding::Utf8); + let doc = Document::new(input.to_string(), Some(0), LineOffsetEncoding::UTF8); let result = client.format_document(&doc, filename).await; assert_eq!(result, output); } @@ -844,7 +852,7 @@ indent-width = 8 let output = "list(\n 1\n)\n"; let url = as_file_url(directory.join("test.R").as_path()); let filename = FileName::Url(url); - let doc = Document::new(input.to_string(), Some(0), PositionEncoding::Utf8); + let doc = Document::new(input.to_string(), Some(0), LineOffsetEncoding::UTF8); let result = client.format_document(&doc, filename).await; assert_eq!(result, output); @@ -853,7 +861,7 @@ indent-width = 8 let output = "list(\n 1\n)\n"; let url = as_file_url(workspace.join("test.R").as_path()); let filename = FileName::Url(url); - let doc = Document::new(input.to_string(), Some(0), PositionEncoding::Utf8); + let doc = Document::new(input.to_string(), Some(0), LineOffsetEncoding::UTF8); let result = client.format_document(&doc, filename).await; assert_eq!(result, output); } diff --git a/crates/lsp/src/handlers_state.rs b/crates/lsp/src/handlers_state.rs index 37cdd47c..cfc49f17 100644 --- a/crates/lsp/src/handlers_state.rs +++ b/crates/lsp/src/handlers_state.rs @@ -9,9 +9,8 @@ use std::array::IntoIter; use anyhow::anyhow; use anyhow::Context; -use biome_lsp_converters::PositionEncoding; -use biome_lsp_converters::WideEncoding; use serde_json::Value; +use source_file::LineOffsetEncoding; use struct_field_names_as_array::FieldNamesAsArray; use tower_lsp::lsp_types; use tower_lsp::lsp_types::ConfigurationItem; @@ -106,10 +105,10 @@ pub(crate) fn initialize( .position_encodings .contains(&lsp_types::PositionEncodingKind::UTF8) { - lsp_state.position_encoding = PositionEncoding::Utf8; + lsp_state.encoding = LineOffsetEncoding::UTF8; Some(lsp_types::PositionEncodingKind::UTF8) } else { - lsp_state.position_encoding = PositionEncoding::Wide(WideEncoding::Utf16); + lsp_state.encoding = LineOffsetEncoding::UTF16; Some(lsp_types::PositionEncodingKind::UTF16) }; @@ -147,7 +146,7 @@ pub(crate) async fn did_open( let uri = params.text_document.uri; let version = params.text_document.version; - let document = Document::new(contents, Some(version), lsp_state.position_encoding); + let document = Document::new(contents, Some(version), lsp_state.encoding); state.documents.insert(uri.clone(), document); // Propagate client settings to Air @@ -168,8 +167,7 @@ pub(crate) fn did_change( ) -> anyhow::Result<()> { let uri = ¶ms.text_document.uri; let doc = state.get_document_mut_or_error(uri)?; - doc.on_did_change(params); - + doc.apply_changes(params.content_changes, params.text_document.version); Ok(()) } diff --git a/crates/lsp/src/lib.rs b/crates/lsp/src/lib.rs index bec31d85..47192e2a 100644 --- a/crates/lsp/src/lib.rs +++ b/crates/lsp/src/lib.rs @@ -1,11 +1,8 @@ -// TODO: Remove this -#![allow(dead_code)] - pub use tower_lsp::start_lsp; pub mod capabilities; +pub mod diff; pub mod documents; -pub mod encoding; pub mod file_patterns; pub mod from_proto; pub mod handlers; @@ -15,10 +12,10 @@ pub mod handlers_state; pub mod logging; pub mod main_loop; pub mod notifications; -pub mod rust_analyzer; pub mod settings; pub mod settings_vsc; pub mod state; +pub mod text_edit; pub mod to_proto; pub mod tower_lsp; pub mod workspaces; diff --git a/crates/lsp/src/main_loop.rs b/crates/lsp/src/main_loop.rs index e8fc0c88..075eb72c 100644 --- 
a/crates/lsp/src/main_loop.rs +++ b/crates/lsp/src/main_loop.rs @@ -5,14 +5,12 @@ // // -use std::collections::HashMap; use std::future; use std::pin::Pin; use anyhow::anyhow; -use biome_lsp_converters::PositionEncoding; -use biome_lsp_converters::WideEncoding; use futures::StreamExt; +use source_file::LineOffsetEncoding; use tokio::sync::mpsc::unbounded_channel as tokio_unbounded_channel; use tokio::task::JoinHandle; use tower_lsp::lsp_types::Diagnostic; @@ -57,16 +55,19 @@ type TaskList = futures::stream::FuturesUnordered, Option), SpawnedTask(JoinHandle>>), } @@ -156,13 +157,10 @@ pub(crate) struct LspState { /// Resolver to look up [`Settings`] given a document [`Url`] pub(crate) workspace_settings_resolver: WorkspaceSettingsResolver, - /// The negociated encoding for document positions. Note that documents are + /// The negotiated encoding for document line offsets. Note that documents are /// always stored as UTF-8 in Rust Strings. This encoding is only used to - /// translate UTF-16 positions sent by the client to UTF-8 ones. - pub(crate) position_encoding: PositionEncoding, - - /// The set of tree-sitter document parsers managed by the `GlobalState`. - pub(crate) parsers: HashMap, + /// translate UTF-16 positions sent by the client to UTF-8 byte offsets. + pub(crate) encoding: LineOffsetEncoding, /// List of client capabilities that we care about pub(crate) capabilities: AirClientCapabilities, @@ -180,8 +178,7 @@ impl LspState { client, workspace_settings_resolver: Default::default(), // All servers and clients have to support UTF-16 so that's the default - position_encoding: PositionEncoding::Wide(WideEncoding::Utf16), - parsers: Default::default(), + encoding: LineOffsetEncoding::UTF16, capabilities: Default::default(), log_state: Default::default(), settings: Default::default(), diff --git a/crates/lsp/src/rust_analyzer/line_index.rs b/crates/lsp/src/rust_analyzer/line_index.rs deleted file mode 100644 index e46e03f1..00000000 --- a/crates/lsp/src/rust_analyzer/line_index.rs +++ /dev/null @@ -1,19 +0,0 @@ -// --- source -// authors = ["rust-analyzer team"] -// license = "MIT OR Apache-2.0" -// origin = "https://github.com/rust-lang/rust-analyzer/blob/master/crates/rust-analyzer/src/line_index.rs" -// --- - -//! Enhances `ide::LineIndex` with additional info required to convert offsets -//! into lsp positions. - -use biome_lsp_converters::line_index; -use settings::LineEnding; -use triomphe::Arc; - -#[derive(Debug, Clone)] -pub struct LineIndex { - pub index: Arc, - pub endings: LineEnding, - pub encoding: biome_lsp_converters::PositionEncoding, -} diff --git a/crates/lsp/src/rust_analyzer/mod.rs b/crates/lsp/src/rust_analyzer/mod.rs deleted file mode 100644 index bfb231cc..00000000 --- a/crates/lsp/src/rust_analyzer/mod.rs +++ /dev/null @@ -1,5 +0,0 @@ -pub mod diff; -pub mod line_index; -pub mod text_edit; -pub mod to_proto; -pub mod utils; diff --git a/crates/lsp/src/rust_analyzer/to_proto.rs b/crates/lsp/src/rust_analyzer/to_proto.rs deleted file mode 100644 index d3ec37b0..00000000 --- a/crates/lsp/src/rust_analyzer/to_proto.rs +++ /dev/null @@ -1,60 +0,0 @@ -// --- source -// authors = ["rust-analyzer team"] -// license = "MIT OR Apache-2.0" -// origin = "https://github.com/rust-lang/rust-analyzer/blob/master/crates/rust-analyzer/src/lsp/to_proto.rs" -// --- - -//! Conversion of rust-analyzer specific types to lsp_types equivalents. 
- -use super::{ - line_index::LineIndex, - text_edit::{Indel, TextEdit}, -}; -use settings::LineEnding; -use tower_lsp::lsp_types; - -pub(crate) fn text_edit( - line_index: &LineIndex, - indel: Indel, -) -> anyhow::Result { - let range = biome_lsp_converters::to_proto::range( - &line_index.index, - indel.delete, - line_index.encoding, - )?; - let new_text = match line_index.endings { - LineEnding::Lf => indel.insert, - LineEnding::Crlf => indel.insert.replace('\n', "\r\n"), - }; - Ok(lsp_types::TextEdit { range, new_text }) -} - -pub(crate) fn completion_text_edit( - line_index: &LineIndex, - insert_replace_support: Option, - indel: Indel, -) -> anyhow::Result { - let text_edit = text_edit(line_index, indel)?; - Ok(match insert_replace_support { - Some(cursor_pos) => lsp_types::InsertReplaceEdit { - new_text: text_edit.new_text, - insert: lsp_types::Range { - start: text_edit.range.start, - end: cursor_pos, - }, - replace: text_edit.range, - } - .into(), - None => text_edit.into(), - }) -} - -pub(crate) fn text_edit_vec( - line_index: &LineIndex, - text_edit: TextEdit, -) -> anyhow::Result> { - text_edit - .into_iter() - .map(|indel| self::text_edit(line_index, indel)) - .collect() -} diff --git a/crates/lsp/src/rust_analyzer/utils.rs b/crates/lsp/src/rust_analyzer/utils.rs deleted file mode 100644 index e8602a99..00000000 --- a/crates/lsp/src/rust_analyzer/utils.rs +++ /dev/null @@ -1,56 +0,0 @@ -// --- source -// authors = ["rust-analyzer team"] -// license = "MIT OR Apache-2.0" -// origin = "https://github.com/rust-lang/rust-analyzer/blob/master/crates/rust-analyzer/src/lsp/utils.rs" -// --- - -use std::ops::Range; - -use biome_lsp_converters::line_index; -use tower_lsp::lsp_types; - -use crate::from_proto; - -pub(crate) fn apply_document_changes( - encoding: biome_lsp_converters::PositionEncoding, - file_contents: &str, - mut content_changes: Vec, -) -> String { - // If at least one of the changes is a full document change, use the last - // of them as the starting point and ignore all previous changes. - let (mut text, content_changes) = match content_changes - .iter() - .rposition(|change| change.range.is_none()) - { - Some(idx) => { - let text = std::mem::take(&mut content_changes[idx].text); - (text, &content_changes[idx + 1..]) - } - None => (file_contents.to_owned(), &content_changes[..]), - }; - if content_changes.is_empty() { - return text; - } - - let mut line_index = line_index::LineIndex::new(&text); - - // The changes we got must be applied sequentially, but can cross lines so we - // have to keep our line index updated. - // Some clients (e.g. Code) sort the ranges in reverse. As an optimization, we - // remember the last valid line in the index and only rebuild it if needed. - // The VFS will normalize the end of lines to `\n`. 
- let mut index_valid = !0u32; - for change in content_changes { - // The None case can't happen as we have handled it above already - if let Some(range) = change.range { - if index_valid <= range.end.line { - line_index = line_index::LineIndex::new(&text); - } - index_valid = range.start.line; - if let Ok(range) = from_proto::text_range(&line_index, range, encoding) { - text.replace_range(Range::::from(range), &change.text); - } - } - } - text -} diff --git a/crates/lsp/src/settings_vsc.rs b/crates/lsp/src/settings_vsc.rs index fde1da5c..e8518fa8 100644 --- a/crates/lsp/src/settings_vsc.rs +++ b/crates/lsp/src/settings_vsc.rs @@ -142,6 +142,7 @@ pub(crate) fn indent_style_from_vsc(insert_spaces: bool) -> settings::IndentStyl } } +#[allow(dead_code)] pub(crate) fn line_width_from_vsc(rulers: &[usize]) -> Option { rulers.first().and_then(|w| (*w as u16).try_into().ok()) } diff --git a/crates/lsp/src/state.rs b/crates/lsp/src/state.rs index 741a1db7..af281a3e 100644 --- a/crates/lsp/src/state.rs +++ b/crates/lsp/src/state.rs @@ -34,9 +34,11 @@ pub(crate) struct WorldState { /// currently pushed to the LSP), and cache the symbols with Salsa. The /// performance is not currently an issue but this could change once we do /// more analysis of symbols in the search path. + #[allow(dead_code)] pub(crate) console_scopes: Vec>, /// Currently installed packages + #[allow(dead_code)] pub(crate) installed_packages: Vec, } @@ -66,11 +68,4 @@ impl WorldState { pub(crate) fn workspace_uris(&self) -> Vec { self.documents.iter().map(|elt| elt.0.clone()).collect() } - - pub(crate) fn workspace_paths(&self) -> Vec { - self.workspace_uris() - .into_iter() - .map(|uri| uri.to_string()) - .collect() - } } diff --git a/crates/lsp/src/test/client_ext.rs b/crates/lsp/src/test/client_ext.rs index 171daf9b..4e6130ac 100644 --- a/crates/lsp/src/test/client_ext.rs +++ b/crates/lsp/src/test/client_ext.rs @@ -2,7 +2,7 @@ use biome_text_size::TextRange; use lsp_test::lsp_client::TestClient; use tower_lsp::lsp_types; -use crate::{documents::Document, from_proto, to_proto}; +use crate::{documents::Document, from_proto}; pub(crate) trait TestClientExt { async fn open_document( @@ -53,7 +53,7 @@ impl TestClientExt for TestClient { uri, language_id: String::from("r"), version: 0, - text: doc.contents.clone(), + text: doc.source_file.contents().to_string(), }; let params = lsp_types::DidOpenTextDocumentParams { @@ -66,8 +66,8 @@ impl TestClientExt for TestClient { async fn format_document(&mut self, doc: &Document, filename: FileName) -> String { match self.format_document_edits(doc, filename).await { - Some(edits) => from_proto::apply_text_edits(doc, edits).unwrap(), - None => doc.contents.clone(), + Some(edits) => apply_text_edits(doc, edits).unwrap(), + None => doc.source_file.contents().to_string(), } } @@ -78,9 +78,9 @@ impl TestClientExt for TestClient { range: TextRange, ) -> String { let Some(edits) = self.format_document_range_edits(doc, filename, range).await else { - return doc.contents.clone(); + return doc.source_file.contents().to_string(); }; - from_proto::apply_text_edits(doc, edits).unwrap() + apply_text_edits(doc, edits).unwrap() } async fn format_document_edits( @@ -121,7 +121,7 @@ impl TestClientExt for TestClient { ) -> Option> { let lsp_doc = self.open_document(doc, filename).await; - let range = to_proto::range(&doc.line_index.index, range, doc.line_index.encoding).unwrap(); + let range = crate::to_proto::range(range, &doc.source_file, doc.encoding); 
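+        // Note (added for clarity): unlike the old `to_proto::range`, this
+        // conversion is infallible; out-of-range offsets are clamped by
+        // `SourceFile` instead of being surfaced as an `anyhow::Result` error.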
        self.range_formatting(lsp_types::DocumentRangeFormattingParams {
             text_document: lsp_types::TextDocumentIdentifier {
@@ -158,3 +158,21 @@ fn formatting_options(doc: &Document) -> lsp_types::FormattingOptions {
         ..Default::default()
     }
 }
+
+fn apply_text_edits(doc: &Document, mut edits: Vec<lsp_types::TextEdit>) -> anyhow::Result<String> {
+    let mut text = doc.source_file.contents().to_string();
+
+    // Apply edits from bottom to top so that inserted newlines can't invalidate
+    // positions in earlier parts of the doc (they are sent in reading order
+    // according to the LSP protocol)
+    edits.reverse();
+
+    for edit in edits {
+        let range = from_proto::text_range(edit.range, &doc.source_file, doc.encoding);
+        let start: usize = range.start().into();
+        let end: usize = range.end().into();
+        text.replace_range(start..end, &edit.new_text);
+    }
+
+    Ok(text)
+}
diff --git a/crates/lsp/src/rust_analyzer/text_edit.rs b/crates/lsp/src/text_edit.rs
similarity index 98%
rename from crates/lsp/src/rust_analyzer/text_edit.rs
rename to crates/lsp/src/text_edit.rs
index fa777e50..d91c836a 100644
--- a/crates/lsp/src/rust_analyzer/text_edit.rs
+++ b/crates/lsp/src/text_edit.rs
@@ -79,12 +79,6 @@ impl TextEdit {
         builder.finish()
     }
 
-    // --- Start Posit
-    pub fn diff(text: &str, replace_with: &str) -> TextEdit {
-        super::diff::diff(text, replace_with)
-    }
-    // --- End Posit
-
     pub fn len(&self) -> usize {
         self.indels.len()
     }
diff --git a/crates/lsp/src/to_proto.rs b/crates/lsp/src/to_proto.rs
index 4b4c5f5e..a72d6410 100644
--- a/crates/lsp/src/to_proto.rs
+++ b/crates/lsp/src/to_proto.rs
@@ -7,45 +7,101 @@
 
 // Utilites for converting internal types to LSP types
 
-pub(crate) use rust_analyzer::to_proto::text_edit_vec;
+use biome_rowan::TextSize;
+use settings::LineEnding;
+use source_file::LineOffsetEncoding;
+use source_file::SourceFile;
 
-#[cfg(test)]
-pub(crate) use biome_lsp_converters::to_proto::range;
-
-use crate::rust_analyzer::{self, line_index::LineIndex, text_edit::TextEdit};
+use crate::text_edit::Indel;
+use crate::text_edit::TextEdit;
 use biome_text_size::TextRange;
 use tower_lsp::lsp_types;
 
-pub(crate) fn doc_edit_vec(
-    line_index: &LineIndex,
+pub(crate) fn position(
+    offset: TextSize,
+    source_file: &SourceFile,
+    encoding: LineOffsetEncoding,
+) -> lsp_types::Position {
+    let source_location = source_file.source_location(offset, encoding);
+    lsp_types::Position {
+        line: source_location.line_number().into(),
+        character: source_location.line_offset().raw(),
+    }
+}
+
+pub(crate) fn range(
+    range: TextRange,
+    source_file: &SourceFile,
+    encoding: LineOffsetEncoding,
+) -> lsp_types::Range {
+    lsp_types::Range {
+        start: self::position(range.start(), source_file, encoding),
+        end: self::position(range.end(), source_file, encoding),
+    }
+}
+
+pub(crate) fn text_edit(
+    indel: Indel,
+    source_file: &SourceFile,
+    encoding: LineOffsetEncoding,
+    endings: LineEnding,
+) -> lsp_types::TextEdit {
+    let range = self::range(indel.delete, source_file, encoding);
+    let new_text = match endings {
+        LineEnding::Lf => indel.insert,
+        LineEnding::Crlf => indel.insert.replace('\n', "\r\n"),
+    };
+    lsp_types::TextEdit { range, new_text }
+}
+
+pub(crate) fn text_edit_vec(
+    text_edit: TextEdit,
+    source_file: &SourceFile,
+    encoding: LineOffsetEncoding,
+    endings: LineEnding,
+) -> Vec<lsp_types::TextEdit> {
+    text_edit
+        .into_iter()
+        .map(|indel| self::text_edit(indel, source_file, encoding, endings))
+        .collect()
+}
+
+#[cfg(test)]
+pub(crate) fn doc_change_vec(
     text_edit: TextEdit,
+    source_file: &SourceFile,
+    encoding: LineOffsetEncoding,
+    endings: LineEnding,
-) -> anyhow::Result<Vec<lsp_types::TextDocumentContentChangeEvent>> {
-    let edits = text_edit_vec(line_index, text_edit)?;
+) -> Vec<lsp_types::TextDocumentContentChangeEvent> {
+    let edits = self::text_edit_vec(text_edit, source_file, encoding, endings);
 
-    Ok(edits
+    edits
         .into_iter()
         .map(|edit| lsp_types::TextDocumentContentChangeEvent {
             range: Some(edit.range),
             range_length: None,
             text: edit.new_text,
         })
-        .collect())
+        .collect()
 }
 
 pub(crate) fn replace_range_edit(
-    line_index: &LineIndex,
     range: TextRange,
     replace_with: String,
-) -> anyhow::Result<Vec<lsp_types::TextEdit>> {
+    source_file: &SourceFile,
+    encoding: LineOffsetEncoding,
+    endings: LineEnding,
+) -> Vec<lsp_types::TextEdit> {
     let edit = TextEdit::replace(range, replace_with);
-    text_edit_vec(line_index, edit)
+    self::text_edit_vec(edit, source_file, encoding, endings)
 }
 
 pub(crate) fn replace_all_edit(
-    line_index: &LineIndex,
-    text: &str,
     replace_with: &str,
-) -> anyhow::Result<Vec<lsp_types::TextEdit>> {
-    let edit = TextEdit::diff(text, replace_with);
-    text_edit_vec(line_index, edit)
+    source_file: &SourceFile,
+    encoding: LineOffsetEncoding,
+    endings: LineEnding,
+) -> Vec<lsp_types::TextEdit> {
+    let edit = crate::diff::diff(source_file.contents(), replace_with);
+    self::text_edit_vec(edit, source_file, encoding, endings)
 }
diff --git a/crates/lsp/src/workspaces.rs b/crates/lsp/src/workspaces.rs
index 773ef9ac..ac9763e5 100644
--- a/crates/lsp/src/workspaces.rs
+++ b/crates/lsp/src/workspaces.rs
@@ -124,6 +124,7 @@ impl WorkspaceSettingsResolver {
         }
     }
 
+    #[allow(dead_code)]
     pub(crate) fn len(&self) -> usize {
         self.path_to_settings_resolver.len()
     }
diff --git a/crates/source_file/Cargo.toml b/crates/source_file/Cargo.toml
new file mode 100644
index 00000000..ec9bc5ea
--- /dev/null
+++ b/crates/source_file/Cargo.toml
@@ -0,0 +1,16 @@
+[package]
+name = "source_file"
+version = "0.0.0"
+publish = false
+authors = { workspace = true }
+edition = { workspace = true }
+rust-version = { workspace = true }
+homepage = { workspace = true }
+repository = { workspace = true }
+license = { workspace = true }
+
+[dependencies]
+biome_text_size = { workspace = true }
+
+[lints]
+workspace = true
diff --git a/crates/source_file/src/lib.rs b/crates/source_file/src/lib.rs
new file mode 100644
index 00000000..6a010e34
--- /dev/null
+++ b/crates/source_file/src/lib.rs
@@ -0,0 +1,17 @@
+//! Tools for managing a single source file
+//!
+//! In particular, [SourceFile] manages the conversions between UTF-8 byte offsets into a
+//! [String], and line number + line offset (also known as row/column or row/character)
+//! backed [SourceLocation]s, where the line offset is measured in UTF code units and is
+//! dependent on the [LineOffsetEncoding] used. [SourceLocation]s are meant to easily map
+//! to LSP `Position`s, and can handle the common `PositionEncodingKind`s of UTF-8,
+//! UTF-16, and UTF-32.
+
+pub use crate::source_file::SourceFile;
+pub use crate::source_location::LineNumber;
+pub use crate::source_location::LineOffset;
+pub use crate::source_location::LineOffsetEncoding;
+pub use crate::source_location::SourceLocation;
+
+mod source_file;
+mod source_location;
diff --git a/crates/source_file/src/source_file.rs b/crates/source_file/src/source_file.rs
new file mode 100644
index 00000000..a2d485f1
--- /dev/null
+++ b/crates/source_file/src/source_file.rs
@@ -0,0 +1,966 @@
+//! This top level documentation details the algorithms and terminology behind
+//! [SourceFile::offset] and [SourceFile::source_location]. The primary goal of
+//! these functions (and really this whole module) is to handle conversion between
+//! a [byte offset](TextSize) and a [line number + line offset](SourceLocation),
+//! 
including treatment of various UTF encodings. +//! +//! Both [TextSize] and [SourceLocation] are ways of pointing at a location within a +//! file: +//! +//! - A [TextSize] is a simple byte offset into a UTF-8 encoded [String]. +//! +//! - A [SourceLocation] contains a location encoded as `line_number` and `line_offset`, +//! where: +//! - `line_number` ([LineNumber]) represents the 0-indexed line number +//! - `line_offset` ([LineOffset]) represents the 0-indexed offset from the start of the +//! line represented by `line_number`. The offset itself is (critically) measured in a +//! UTF concept known as "code units", and is meaningless without the corresponding +//! [LineOffsetEncoding]. +//! +//! A [SourceLocation] is meant to map to an LSP `Position`, and a [LineOffsetEncoding] is +//! meant to map to an LSP `PositionEncodingKind`. We are typically handed an LSP +//! `Position`, must convert it to a [TextSize] (by going through [SourceLocation]), then +//! we use that [TextSize] to index into our [String] representing our document. On the +//! way out, we must convert from [TextSize] or [TextRange] back to LSP `Position` or LSP +//! `Range` by going back through [SourceLocation]. +//! +//! Now, for some definitions: +//! +//! - Code unit: The minimal bit combination that can represent a single character, +//! depending on the encoding used. +//! - UTF-8: +//! - 1 code unit = 1 byte = 8 bits +//! - UTF-16: +//! - 1 code unit = 2 bytes = 16 bits +//! - UTF-32: +//! - 1 code unit = 4 bytes = 32 bits +//! +//! - Character: A combination of code units that construct a single UTF element. +//! - UTF-8: +//! - 1 character = 1,2,3,4 code units = 1,2,3,4 bytes = 8,16,24,32 bits +//! - UTF-16: +//! - 1 character = 1,2 code units = 2,4 bytes = 16,32 bits +//! - UTF-32: +//! - 1 character = 1 code units = 4 bytes = 32 bits +//! +//! - Unicode Scalar Value: Any Unicode Code Point other than a Surrogate Code Point ( +//! which are only used by UTF-16). Technically, this means any value in the range of +//! [0 to 0x10FFFF] excluding the slice of [0xD800 to 0xDFFF]. The [char] type +//! represents these. +//! +//! - Unicode Code Point: Any value in the Unicode code space of [0 to 0x10FFFF]. This +//! means that something representing an arbitrary code point must be 4 bytes, implying +//! that something representing a Unicode Scalar Value must also be 4 bytes, and +//! practically a [char] has the same memory layout as a [u32] under the hood. +//! +//! - Rust [String] and [str] are in UTF-8, and all [byte offsets](TextSize) into them +//! assume the strings are encoded in UTF-8. +//! +//! One key thing to note is that `\n` (or `\r\n`) is the same in all encodings. This +//! means that finding the [LineNumber] you are on is easy, you are either given it +//! through [SourceLocation::line_number], or it can be easily computed from a UTF-8 [byte +//! offset](TextSize) by doing a binary search into an ordered vector of line start +//! locations. That isolates the "hard" details of encoding translation to the +//! [LineOffset], which is typically an extremely small slice of the overall file. +//! +//! # Implementing [SourceFile::offset] +//! +//! ## UTF-8 code units -> UTF-8 byte offset +//! +//! Easy! 1 UTF-8 code unit maps directly to 1 byte in a UTF-8 string, so counting the +//! code units is equivalent to finding the byte offset into the UTF-8 string. +//! +//! ## UTF-16 code units -> UTF-8 byte offset +//! +//! 1 UTF-16 code unit is always 2 bytes if the string is encoded in UTF-16. But if +//! 
the string is encoded in UTF-8 as ours is, we don't immediately know if the
+//! UTF-16 code unit would be represented by 1 or 2 bytes in a UTF-8 string.
+//!
+//! To do this, we iterate over the [str::chars()] of the string, which are Unicode Scalar
+//! Values, i.e. UTF-32 characters, the widest of them all. To figure out the correct
+//! amount of UTF-16 code units to count up, we compute the [char::len_utf16()] of each
+//! character, which returns the number of UTF-16 code units required if the [char]
+//! were instead encoded in UTF-16. Once we've reached the [LineOffset] offset, we've
+//! found all the [char]s we care about. To find the byte offset in UTF-8 encoded space,
+//! we sum up the [char::len_utf8()] of each of those [char]s.
+//!
+//! ## UTF-32 code units -> UTF-8 byte offset
+//!
+//! Very similar to UTF-16, except 1 UTF-32 code unit is always 4 bytes if the string
+//! itself is encoded in UTF-32.
+//!
+//! This is slightly easier than UTF-16. Because [str::chars()] already returns Unicode
+//! Scalar Values, also known as UTF-32 characters, and because 1 UTF-32 character is
+//! the same size as 1 UTF-32 code unit, we just iterate over the [str::chars()] up to
+//! the [LineOffset] value, summing the [char::len_utf8()] of each [char] along the way.
+//!
+//! # Implementing [SourceFile::source_location]
+//!
+//! ## UTF-8 byte offset -> UTF-8 code units
+//!
+//! Easy! Like with the other direction, UTF-8 byte offsets can be directly translated
+//! to UTF-8 code units, so there is nothing to do.
+//!
+//! ## UTF-8 byte offset -> UTF-16 code units
+//!
+//! This is actually pretty easy. All we do is slice the [String] from its start up to
+//! the UTF-8 byte offset in question, then call [str::encode_utf16()] and count the
+//! number of UTF-16 code units it returns.
+//!
+//! This would be expensive if we had to reencode as UTF-16 from the beginning of the
+//! file, but we actually just need to reencode as UTF-16 from the beginning of the
+//! line that the offset is on, up to the offset position itself, which is a very small
+//! slice. This works because the line number itself is not dependent on the encoding,
+//! only the line offset into that line is.
+//!
+//! ## UTF-8 byte offset -> UTF-32 code units
+//!
+//! Same as with UTF-16, but rather than [str::encode_utf16()], we can use [str::chars()]
+//! because it already returns Unicode Scalar Values, which are UTF-32 characters, which
+//! are equivalent to UTF-32 code units.
+
+use biome_text_size::TextLen;
+use biome_text_size::TextRange;
+use biome_text_size::TextSize;
+
+use crate::source_location::LineNumber;
+use crate::source_location::LineOffsetEncoding;
+use crate::LineOffset;
+use crate::SourceLocation;
+
+/// Manager of a single source file
+///
+/// Builds a vector of line start locations on creation, for use with
+/// [TextSize] <-> [SourceLocation] conversions. In particular, see:
+///
+/// - [Self::offset()] for [SourceLocation] -> [TextSize]
+/// - [Self::source_location()] for [TextSize] -> [SourceLocation]
+#[derive(Debug, Clone, Eq, PartialEq)]
+pub struct SourceFile {
+    contents: String,
+    line_starts: Vec<TextSize>,
+    kind: SourceKind,
+}
+
+#[derive(Debug, Clone, Copy, Eq, PartialEq)]
+enum SourceKind {
+    /// Optimized for an ASCII only document
+    Ascii,
+
+    /// Document containing UTF-8
+    Utf8,
+}
+
+impl SourceKind {
+    const fn is_ascii(self) -> bool {
+        matches!(self, SourceKind::Ascii)
+    }
+}
+
+impl SourceFile {
+    /// Builds the [`SourceFile`] from the contents of a file.
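+    ///
+    /// A minimal usage sketch (added for illustration), exercising the
+    /// encoding round trip described in the module docs:
+    ///
+    /// ```
+    /// use biome_text_size::TextSize;
+    /// use source_file::{LineOffsetEncoding, SourceFile};
+    ///
+    /// // "𐐀" is 4 bytes in UTF-8, but only 2 code units in UTF-16
+    /// let source = SourceFile::new("a𐐀b\n".to_string());
+    ///
+    /// // Round trip: byte offset -> SourceLocation -> byte offset
+    /// let location = source.source_location(TextSize::from(5), LineOffsetEncoding::UTF16);
+    /// assert_eq!(source.offset(location), TextSize::from(5));
+    /// ```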
+    pub fn new(contents: String) -> Self {
+        let (line_starts, kind) = Self::analyze(&contents);
+        Self {
+            contents,
+            line_starts,
+            kind,
+        }
+    }
+
+    fn analyze(contents: &str) -> (Vec<TextSize>, SourceKind) {
+        let mut line_starts: Vec<TextSize> = Vec::with_capacity(contents.len() / 88);
+
+        // Always push a start for an offset of `0`, needed for an invariant in `line_number()`
+        line_starts.push(TextSize::from(0));
+
+        let mut utf8 = false;
+
+        let bytes = contents.as_bytes();
+        assert!(u32::try_from(bytes.len()).is_ok());
+
+        for (i, byte) in bytes.iter().enumerate() {
+            utf8 |= !byte.is_ascii();
+
+            match byte {
+                // Only track one line break for `\r\n`.
+                b'\r' if bytes.get(i + 1) == Some(&b'\n') => continue,
+                b'\n' | b'\r' => {
+                    // Safety: Assertion above guarantees `i <= u32::MAX`
+                    #[allow(clippy::cast_possible_truncation)]
+                    line_starts.push(TextSize::from(i as u32) + TextSize::from(1));
+                }
+                _ => {}
+            }
+        }
+
+        let kind = if utf8 {
+            SourceKind::Utf8
+        } else {
+            SourceKind::Ascii
+        };
+
+        (line_starts, kind)
+    }
+
+    /// Returns a reference to the contents in the source file.
+    pub fn contents(&self) -> &str {
+        &self.contents
+    }
+
+    /// Consumes the source file, returning only the contents.
+    pub fn into_contents(self) -> String {
+        self.contents
+    }
+
+    /// Replace text in the source file and reanalyze afterwards.
+    pub fn replace_range<R>(&mut self, range: R, replace_with: &str)
+    where
+        R: std::ops::RangeBounds<usize>,
+    {
+        self.contents.replace_range(range, replace_with);
+        let (line_starts, kind) = Self::analyze(&self.contents);
+        self.line_starts = line_starts;
+        self.kind = kind;
+    }
+
+    /// Returns `true` if the text only consists of ASCII characters
+    pub fn is_ascii(&self) -> bool {
+        self.kind.is_ascii()
+    }
+
+    /// Return the number of lines in the source file.
+    pub fn line_count(&self) -> usize {
+        self.line_starts().len()
+    }
+
+    /// Returns the row number for a given offset.
+    ///
+    /// ## Examples
+    ///
+    /// ```
+    /// use biome_text_size::TextSize;
+    /// use source_file::{SourceFile, SourceLocation, LineNumber};
+    ///
+    /// let source = "def a():\n    pass".to_string();
+    /// let source = SourceFile::new(source);
+    ///
+    /// assert_eq!(source.line_number(TextSize::from(0)), LineNumber::from(0));
+    /// assert_eq!(source.line_number(TextSize::from(4)), LineNumber::from(0));
+    /// assert_eq!(source.line_number(TextSize::from(13)), LineNumber::from(1));
+    /// ```
+    pub fn line_number(&self, offset: TextSize) -> LineNumber {
+        let line_number = match self.line_starts().binary_search(&offset) {
+            // `offset` is at the start of a line
+            Ok(line_number) => line_number,
+            Err(next_line_number) => {
+                // SAFETY: Safe because the line starts always contain an entry for the offset 0
+                next_line_number - 1
+            }
+        };
+
+        LineNumber::try_from(line_number)
+            .expect("Number of line starts should fit in a `LineNumber`")
+    }
+
+    /// Returns the [byte offset](TextSize) for the line's start.
+    pub fn line_start(&self, line_number: LineNumber) -> TextSize {
+        let line_number = usize::from(line_number);
+
+        if line_number >= self.line_count() {
+            // If asking for a line number past the last line, return last byte
+            self.contents().text_len()
+        } else {
+            self.line_starts()[line_number]
+        }
+    }
+
+    /// Returns the [byte offset](TextSize) of the line's end.
+    ///
+    /// The offset is the end of the line, up to and including the newline character
+    /// ending the line (if any), making it equivalent to the next line's start.
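+    ///
+    /// An illustrative case (not from the original docs): with contents `"a\nb"`,
+    /// the end of line 0 is offset 2 (just past the `\n`), which is also the start
+    /// of line 1, and the end of line 1 is offset 3, the end of the file.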
+ pub(crate) fn line_end(&self, line_number: LineNumber) -> TextSize { + let line_number = usize::from(line_number); + + if line_number.saturating_add(1) >= self.line_count() { + // If asking for a line number past the last line, return last byte + self.contents().text_len() + } else { + self.line_starts()[line_number + 1] + } + } + + /// Returns the [`TextRange`] of the line. + /// + /// The start points to the first character's [byte offset](TextSize). The end points + /// up to, and including, the newline character ending the line (if any). This makes + /// the range a `[)` range. + pub fn line_range(&self, line_number: LineNumber) -> TextRange { + TextRange::new(self.line_start(line_number), self.line_end(line_number)) + } + + /// Returns the [byte offsets](TextSize) for every line + pub fn line_starts(&self) -> &[TextSize] { + &self.line_starts + } + + /// Returns the [SourceLocation] at the [byte offset](TextSize). + /// + /// ## Examples + /// + /// ``` + /// use biome_text_size::TextSize; + /// use source_file::{SourceFile, SourceLocation, LineNumber, LineOffset, LineOffsetEncoding}; + /// + /// let source = "x <- function()\n NULL".to_string(); + /// let source = SourceFile::new(source); + /// + /// assert_eq!( + /// source.source_location(TextSize::from(0), LineOffsetEncoding::UTF8), + /// SourceLocation::new( + /// LineNumber::from(0), + /// LineOffset::new(0, LineOffsetEncoding::UTF8) + /// ) + /// ); + /// assert_eq!( + /// source.source_location(TextSize::from(4), LineOffsetEncoding::UTF8), + /// SourceLocation::new( + /// LineNumber::from(0), + /// LineOffset::new(4, LineOffsetEncoding::UTF8) + /// ) + /// ); + /// assert_eq!( + /// source.source_location(TextSize::from(20), LineOffsetEncoding::UTF8), + /// SourceLocation::new( + /// LineNumber::from(1), + /// LineOffset::new(4, LineOffsetEncoding::UTF8) + /// ) + /// ); + /// ``` + pub fn source_location( + &self, + offset: TextSize, + encoding: LineOffsetEncoding, + ) -> SourceLocation { + let line_number = self.line_number(offset); + let line_range_up_to_offset = TextRange::new(self.line_start(line_number), offset); + + let line_offset = if self.is_ascii() { + LineOffset::new(line_range_up_to_offset.len().into(), encoding) + } else { + match encoding { + LineOffsetEncoding::UTF8 => { + LineOffset::new(line_range_up_to_offset.len().into(), encoding) + } + LineOffsetEncoding::UTF16 => { + let line_contents_up_to_offset = &self.contents()[line_range_up_to_offset]; + let offset = line_contents_up_to_offset + .encode_utf16() + .count() + .try_into() + .expect("A single line's `offset` should fit in a `u32`"); + LineOffset::new(offset, encoding) + } + LineOffsetEncoding::UTF32 => { + let line_contents_up_to_offset = &self.contents()[line_range_up_to_offset]; + let offset = line_contents_up_to_offset + .chars() + .count() + .try_into() + .expect("A single line's `offset` should fit in a `u32`"); + LineOffset::new(offset, encoding) + } + } + }; + + SourceLocation::new(line_number, line_offset) + } + + /// Returns the [byte offset](TextSize) at the [SourceLocation]. 
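+    ///
+    /// Out-of-range locations are handled leniently: a line offset past the end
+    /// of its line is clamped to the line's end, and a line number past the last
+    /// line maps to the end of the file (see the examples below).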
+    ///
+    /// ## Examples
+    ///
+    /// ### ASCII
+    ///
+    /// ```
+    /// use source_file::{SourceFile, SourceLocation, LineNumber, LineOffset, LineOffsetEncoding};
+    /// use biome_text_size::TextSize;
+    ///
+    /// let source = r#"a = 4
+    /// c = "some string"
+    /// x = b"#.to_string();
+    ///
+    /// let source = SourceFile::new(source);
+    ///
+    /// // First line, first column
+    /// assert_eq!(
+    ///     source.offset(SourceLocation::new(
+    ///         LineNumber::from(0),
+    ///         LineOffset::new(0, LineOffsetEncoding::UTF8)
+    ///     )),
+    ///     TextSize::from(0)
+    /// );
+    ///
+    /// // Second line, 4th column
+    /// assert_eq!(
+    ///     source.offset(SourceLocation::new(
+    ///         LineNumber::from(1),
+    ///         LineOffset::new(4, LineOffsetEncoding::UTF8)
+    ///     )),
+    ///     TextSize::from(10)
+    /// );
+    ///
+    /// // Offset past the end of the first line
+    /// assert_eq!(
+    ///     source.offset(SourceLocation::new(
+    ///         LineNumber::from(0),
+    ///         LineOffset::new(10, LineOffsetEncoding::UTF8)
+    ///     )),
+    ///     TextSize::from(6)
+    /// );
+    ///
+    /// // Offset past the end of the file
+    /// assert_eq!(
+    ///     source.offset(SourceLocation::new(
+    ///         LineNumber::from(3),
+    ///         LineOffset::new(0, LineOffsetEncoding::UTF8)
+    ///     )),
+    ///     TextSize::from(29)
+    /// );
+    /// ```
+    ///
+    /// ### UTF8
+    ///
+    /// ```
+    /// use source_file::{SourceFile, SourceLocation, LineNumber, LineOffset, LineOffsetEncoding};
+    /// use biome_text_size::TextSize;
+    ///
+    /// let source = r#"a = 4
+    /// c = "❤️"
+    /// x = b"#.to_string();
+    ///
+    /// let source = SourceFile::new(source);
+    ///
+    /// // First line, first column
+    /// assert_eq!(
+    ///     source.offset(SourceLocation::new(
+    ///         LineNumber::from(0),
+    ///         LineOffset::new(0, LineOffsetEncoding::UTF8)
+    ///     )),
+    ///     TextSize::from(0)
+    /// );
+    ///
+    /// // Third line, 2nd column, after emoji, UTF8
+    /// assert_eq!(
+    ///     source.offset(SourceLocation::new(
+    ///         LineNumber::from(2),
+    ///         LineOffset::new(1, LineOffsetEncoding::UTF8)
+    ///     )),
+    ///     TextSize::from(20)
+    /// );
+    ///
+    /// // Third line, 2nd column, after emoji, UTF16
+    /// assert_eq!(
+    ///     source.offset(SourceLocation::new(
+    ///         LineNumber::from(2),
+    ///         LineOffset::new(1, LineOffsetEncoding::UTF16)
+    ///     )),
+    ///     TextSize::from(20)
+    /// );
+    ///
+    /// // Offset past the end of the second line, UTF8
+    /// assert_eq!(
+    ///     source.offset(SourceLocation::new(
+    ///         LineNumber::from(1),
+    ///         LineOffset::new(10, LineOffsetEncoding::UTF8)
+    ///     )),
+    ///     TextSize::from(16)
+    /// );
+    ///
+    /// // Offset past the end of the second line, UTF32
+    /// assert_eq!(
+    ///     source.offset(SourceLocation::new(
+    ///         LineNumber::from(1),
+    ///         LineOffset::new(10, LineOffsetEncoding::UTF32)
+    ///     )),
+    ///     TextSize::from(19)
+    /// );
+    ///
+    /// // Offset past the end of the file
+    /// assert_eq!(
+    ///     source.offset(SourceLocation::new(
+    ///         LineNumber::from(3),
+    ///         LineOffset::new(0, LineOffsetEncoding::UTF32)
+    ///     )),
+    ///     TextSize::from(24)
+    /// );
+    /// ```
+    ///
+    pub fn offset(&self, source_location: SourceLocation) -> TextSize {
+        let (line_number, line_offset) = source_location.into_fields();
+
+        let line_range = self.line_range(line_number);
+
+        let offset = if self.is_ascii() {
+            TextSize::from(line_offset.raw())
+        } else {
+            match line_offset.encoding() {
+                LineOffsetEncoding::UTF8 => TextSize::from(line_offset.raw()),
+                LineOffsetEncoding::UTF16 => {
+                    let n_code_units = line_offset.raw();
+                    let line_contents = &self.contents()[line_range];
+
+                    let mut i = 0;
+                    let mut offset = 0;
+
+                    for c in line_contents.chars() {
+                        if i >= n_code_units {
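+                            // Consumed the requested number of UTF-16 code units;
+                            // stop before counting the current char.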
+                            break;
+                        }
+                        i += c.len_utf16() as u32;
+                        offset += c.len_utf8() as u32;
+                    }
+
+                    TextSize::from(offset)
+                }
+                LineOffsetEncoding::UTF32 => {
+                    let n_code_units = line_offset.raw();
+                    let line_contents = &self.contents()[line_range];
+
+                    let mut offset: u32 = 0;
+
+                    for c in line_contents.chars().take(n_code_units as usize) {
+                        offset += c.len_utf8() as u32;
+                    }
+
+                    TextSize::from(offset)
+                }
+            }
+        };
+
+        line_range.start() + offset.clamp(TextSize::from(0), line_range.len())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use biome_text_size::TextSize;
+
+    use crate::source_location::LineNumber;
+    use crate::source_location::LineOffset;
+    use crate::source_location::LineOffsetEncoding;
+    use crate::SourceFile;
+    use crate::SourceLocation;
+
+    #[test]
+    fn ascii_source_file() {
+        let source = SourceFile::new(String::new());
+        assert_eq!(source.line_starts(), &[TextSize::from(0)]);
+
+        let source = SourceFile::new("x = 1".to_string());
+        assert_eq!(source.line_starts(), &[TextSize::from(0)]);
+
+        let source = SourceFile::new("x = 1\n".to_string());
+        assert_eq!(
+            source.line_starts(),
+            &[TextSize::from(0), TextSize::from(6)]
+        );
+
+        let source = SourceFile::new("x = 1\ny = 2\nz = x + y\n".to_string());
+        assert_eq!(
+            source.line_starts(),
+            &[
+                TextSize::from(0),
+                TextSize::from(6),
+                TextSize::from(12),
+                TextSize::from(22)
+            ]
+        );
+    }
+
+    #[test]
+    fn ascii_source_location() {
+        let contents = "x = 1\ny = 2".to_string();
+        let source = SourceFile::new(contents);
+
+        // First row.
+        let loc = source.source_location(TextSize::from(2), LineOffsetEncoding::UTF8);
+        assert_eq!(
+            loc,
+            SourceLocation::new(
+                LineNumber::from(0),
+                LineOffset::new(2, LineOffsetEncoding::UTF8)
+            )
+        );
+
+        // Second row.
+        let loc = source.source_location(TextSize::from(6), LineOffsetEncoding::UTF8);
+        assert_eq!(
+            loc,
+            SourceLocation::new(
+                LineNumber::from(1),
+                LineOffset::new(0, LineOffsetEncoding::UTF8)
+            )
+        );
+
+        let loc = source.source_location(TextSize::from(11), LineOffsetEncoding::UTF8);
+        assert_eq!(
+            loc,
+            SourceLocation::new(
+                LineNumber::from(1),
+                LineOffset::new(5, LineOffsetEncoding::UTF8)
+            )
+        );
+    }
+
+    #[test]
+    fn ascii_carriage_return() {
+        let contents = "x = 4\ry = 3".to_string();
+        let source = SourceFile::new(contents);
+        assert_eq!(
+            source.line_starts(),
+            &[TextSize::from(0), TextSize::from(6)]
+        );
+
+        assert_eq!(
+            source.source_location(TextSize::from(4), LineOffsetEncoding::UTF8),
+            SourceLocation::new(
+                LineNumber::from(0),
+                LineOffset::new(4, LineOffsetEncoding::UTF8)
+            )
+        );
+        assert_eq!(
+            source.source_location(TextSize::from(6), LineOffsetEncoding::UTF8),
+            SourceLocation::new(
+                LineNumber::from(1),
+                LineOffset::new(0, LineOffsetEncoding::UTF8)
+            )
+        );
+        assert_eq!(
+            source.source_location(TextSize::from(7), LineOffsetEncoding::UTF8),
+            SourceLocation::new(
+                LineNumber::from(1),
+                LineOffset::new(1, LineOffsetEncoding::UTF8)
+            )
+        );
+    }
+
+    #[test]
+    fn ascii_carriage_return_newline() {
+        let contents = "x = 4\r\ny = 3".to_string();
+        let source = SourceFile::new(contents);
+        assert_eq!(
+            source.line_starts(),
+            &[TextSize::from(0), TextSize::from(7)]
+        );
+
+        assert_eq!(
+            source.source_location(TextSize::from(4), LineOffsetEncoding::UTF8),
+            SourceLocation::new(
+                LineNumber::from(0),
+                LineOffset::new(4, LineOffsetEncoding::UTF8)
+            )
+        );
+        assert_eq!(
+            source.source_location(TextSize::from(7), LineOffsetEncoding::UTF8),
+            SourceLocation::new(
+                LineNumber::from(1),
+                LineOffset::new(0, LineOffsetEncoding::UTF8)
+            )
+        );
+        assert_eq!(
+            source.source_location(TextSize::from(8), LineOffsetEncoding::UTF8),
+            SourceLocation::new(
+                LineNumber::from(1),
+                LineOffset::new(1, LineOffsetEncoding::UTF8)
+            )
+        );
+    }
+
+    #[test]
+    fn utf8_source_file() {
+        let source = SourceFile::new("x = '🫣'".to_string());
+        assert_eq!(source.line_count(), 1);
+        assert_eq!(source.line_starts(), &[TextSize::from(0)]);
+
+        let source = SourceFile::new("x = '🫣'\n".to_string());
+        assert_eq!(source.line_count(), 2);
+        assert_eq!(
+            source.line_starts(),
+            &[TextSize::from(0), TextSize::from(11)]
+        );
+
+        let source = SourceFile::new("x = '🫣'\ny = 2\nz = x + y\n".to_string());
+        assert_eq!(source.line_count(), 4);
+        assert_eq!(
+            source.line_starts(),
+            &[
+                TextSize::from(0),
+                TextSize::from(11),
+                TextSize::from(17),
+                TextSize::from(27)
+            ]
+        );
+
+        let source = SourceFile::new("# 🫣\nclass Foo:\n \"\"\".\"\"\"".to_string());
+        assert_eq!(source.line_count(), 3);
+        assert_eq!(
+            source.line_starts(),
+            &[TextSize::from(0), TextSize::from(7), TextSize::from(18)]
+        );
+    }
+
+    #[test]
+    fn utf8_carriage_return() {
+        let contents = "x = '🫣'\ry = 3".to_string();
+        let source = SourceFile::new(contents);
+        assert_eq!(source.line_count(), 2);
+        assert_eq!(
+            source.line_starts(),
+            &[TextSize::from(0), TextSize::from(11)]
+        );
+
+        // Second ', UTF8
+        assert_eq!(
+            source.source_location(TextSize::from(9), LineOffsetEncoding::UTF8),
+            SourceLocation::new(
+                LineNumber::from(0),
+                LineOffset::new(9, LineOffsetEncoding::UTF8)
+            )
+        );
+        assert_eq!(
+            source.source_location(TextSize::from(11), LineOffsetEncoding::UTF8),
+            SourceLocation::new(
+                LineNumber::from(1),
+                LineOffset::new(0, LineOffsetEncoding::UTF8)
+            )
+        );
+        assert_eq!(
+            source.source_location(TextSize::from(12), LineOffsetEncoding::UTF8),
+            SourceLocation::new(
+                LineNumber::from(1),
+                LineOffset::new(1, LineOffsetEncoding::UTF8)
+            )
+        );
+
+        // Second ', UTF16
+        assert_eq!(
+            source.source_location(TextSize::from(9), LineOffsetEncoding::UTF16),
+            SourceLocation::new(
+                LineNumber::from(0),
+                LineOffset::new(7, LineOffsetEncoding::UTF16)
+            )
+        );
+        assert_eq!(
+            source.source_location(TextSize::from(11), LineOffsetEncoding::UTF16),
+            SourceLocation::new(
+                LineNumber::from(1),
+                LineOffset::new(0, LineOffsetEncoding::UTF16)
+            )
+        );
+        assert_eq!(
+            source.source_location(TextSize::from(12), LineOffsetEncoding::UTF16),
+            SourceLocation::new(
+                LineNumber::from(1),
+                LineOffset::new(1, LineOffsetEncoding::UTF16)
+            )
+        );
+
+        // Second ', UTF32
+        assert_eq!(
+            source.source_location(TextSize::from(9), LineOffsetEncoding::UTF32),
+            SourceLocation::new(
+                LineNumber::from(0),
+                LineOffset::new(6, LineOffsetEncoding::UTF32)
+            )
+        );
+        assert_eq!(
+            source.source_location(TextSize::from(11), LineOffsetEncoding::UTF32),
+            SourceLocation::new(
+                LineNumber::from(1),
+                LineOffset::new(0, LineOffsetEncoding::UTF32)
+            )
+        );
+        assert_eq!(
+            source.source_location(TextSize::from(12), LineOffsetEncoding::UTF32),
+            SourceLocation::new(
+                LineNumber::from(1),
+                LineOffset::new(1, LineOffsetEncoding::UTF32)
+            )
+        );
+    }
+
+    #[test]
+    fn utf8_carriage_return_newline() {
+        let contents = "x = '🫣'\r\ny = 3".to_string();
+        let source = SourceFile::new(contents);
+        assert_eq!(source.line_count(), 2);
+        assert_eq!(
+            source.line_starts(),
+            &[TextSize::from(0), TextSize::from(12)]
+        );
+
+        // Second '
+        assert_eq!(
+            source.source_location(TextSize::from(9), LineOffsetEncoding::UTF32),
+            SourceLocation::new(
+                LineNumber::from(0),
+                LineOffset::new(6, LineOffsetEncoding::UTF32)
+            )
+        );
+        assert_eq!(
+            source.source_location(TextSize::from(12), LineOffsetEncoding::UTF32),
+            SourceLocation::new(
+                LineNumber::from(1),
+                LineOffset::new(0, LineOffsetEncoding::UTF32)
+            )
+        );
+        assert_eq!(
+            source.source_location(TextSize::from(13), LineOffsetEncoding::UTF32),
+            SourceLocation::new(
+                LineNumber::from(1),
+                LineOffset::new(1, LineOffsetEncoding::UTF32)
+            )
+        );
+    }
+
+    #[test]
+    fn utf8_byte_offset() {
+        let contents = "x = '☃'\ny = 2".to_string();
+        let source = SourceFile::new(contents);
+        assert_eq!(
+            source.line_starts(),
+            &[TextSize::from(0), TextSize::from(10)]
+        );
+
+        // First row, start
+        let loc = source.source_location(TextSize::from(0), LineOffsetEncoding::UTF8);
+        assert_eq!(
+            loc,
+            SourceLocation::new(
+                LineNumber::from(0),
+                LineOffset::new(0, LineOffsetEncoding::UTF8)
+            )
+        );
+        let loc = source.source_location(TextSize::from(0), LineOffsetEncoding::UTF16);
+        assert_eq!(
+            loc,
+            SourceLocation::new(
+                LineNumber::from(0),
+                LineOffset::new(0, LineOffsetEncoding::UTF16)
+            )
+        );
+        let loc = source.source_location(TextSize::from(0), LineOffsetEncoding::UTF32);
+        assert_eq!(
+            loc,
+            SourceLocation::new(
+                LineNumber::from(0),
+                LineOffset::new(0, LineOffsetEncoding::UTF32)
+            )
+        );
+
+        // First row, right before
+        let loc = source.source_location(TextSize::from(5), LineOffsetEncoding::UTF8);
+        assert_eq!(
+            loc,
+            SourceLocation::new(
+                LineNumber::from(0),
+                LineOffset::new(5, LineOffsetEncoding::UTF8)
+            )
+        );
+        let loc = source.source_location(TextSize::from(5), LineOffsetEncoding::UTF16);
+        assert_eq!(
+            loc,
+            SourceLocation::new(
+                LineNumber::from(0),
+                LineOffset::new(5, LineOffsetEncoding::UTF16)
+            )
+        );
+        let loc = source.source_location(TextSize::from(5), LineOffsetEncoding::UTF32);
+        assert_eq!(
+            loc,
+            SourceLocation::new(
+                LineNumber::from(0),
+                LineOffset::new(5, LineOffsetEncoding::UTF32)
+            )
+        );
+
+        // First row, right after
+        let loc = source.source_location(TextSize::from(8), LineOffsetEncoding::UTF8);
+        assert_eq!(
+            loc,
+            SourceLocation::new(
+                LineNumber::from(0),
+                LineOffset::new(8, LineOffsetEncoding::UTF8)
+            )
+        );
+        let loc = source.source_location(TextSize::from(8), LineOffsetEncoding::UTF16);
+        assert_eq!(
+            loc,
+            SourceLocation::new(
+                LineNumber::from(0),
+                LineOffset::new(6, LineOffsetEncoding::UTF16)
+            )
+        );
+        let loc = source.source_location(TextSize::from(8), LineOffsetEncoding::UTF32);
+        assert_eq!(
+            loc,
+            SourceLocation::new(
+                LineNumber::from(0),
+                LineOffset::new(6, LineOffsetEncoding::UTF32)
+            )
+        );
+
+        // Second row, start
+        let loc = source.source_location(TextSize::from(10), LineOffsetEncoding::UTF8);
+        assert_eq!(
+            loc,
+            SourceLocation::new(
+                LineNumber::from(1),
+                LineOffset::new(0, LineOffsetEncoding::UTF8)
+            )
+        );
+        let loc = source.source_location(TextSize::from(10), LineOffsetEncoding::UTF16);
+        assert_eq!(
+            loc,
+            SourceLocation::new(
+                LineNumber::from(1),
+                LineOffset::new(0, LineOffsetEncoding::UTF16)
+            )
+        );
+        let loc = source.source_location(TextSize::from(10), LineOffsetEncoding::UTF32);
+        assert_eq!(
+            loc,
+            SourceLocation::new(
+                LineNumber::from(1),
+                LineOffset::new(0, LineOffsetEncoding::UTF32)
+            )
+        );
+
+        // One-past-the-end.
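+        // The end-of-file offset is accepted and maps to the last line's end.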
+        let loc = source.source_location(TextSize::from(15), LineOffsetEncoding::UTF8);
+        assert_eq!(
+            loc,
+            SourceLocation::new(
+                LineNumber::from(1),
+                LineOffset::new(5, LineOffsetEncoding::UTF8)
+            )
+        );
+        let loc = source.source_location(TextSize::from(15), LineOffsetEncoding::UTF16);
+        assert_eq!(
+            loc,
+            SourceLocation::new(
+                LineNumber::from(1),
+                LineOffset::new(5, LineOffsetEncoding::UTF16)
+            )
+        );
+        let loc = source.source_location(TextSize::from(15), LineOffsetEncoding::UTF32);
+        assert_eq!(
+            loc,
+            SourceLocation::new(
+                LineNumber::from(1),
+                LineOffset::new(5, LineOffsetEncoding::UTF32)
+            )
+        );
+    }
+}
diff --git a/crates/source_file/src/source_location.rs b/crates/source_file/src/source_location.rs
new file mode 100644
index 00000000..e31fb167
--- /dev/null
+++ b/crates/source_file/src/source_location.rs
@@ -0,0 +1,92 @@
+use std::fmt::Debug;
+
+#[derive(Debug, Clone, Eq, PartialEq)]
+pub struct SourceLocation {
+    line_number: LineNumber,
+    line_offset: LineOffset,
+}
+
+impl SourceLocation {
+    pub fn new(line_number: LineNumber, line_offset: LineOffset) -> Self {
+        Self {
+            line_number,
+            line_offset,
+        }
+    }
+
+    pub fn line_number(&self) -> LineNumber {
+        self.line_number
+    }
+
+    pub fn line_offset(&self) -> LineOffset {
+        self.line_offset
+    }
+
+    pub fn into_fields(self) -> (LineNumber, LineOffset) {
+        (self.line_number, self.line_offset)
+    }
+}
+
+/// A 0-indexed line number
+#[derive(Debug, Copy, Clone, Eq, PartialEq)]
+pub struct LineNumber(u32);
+
+impl From<u32> for LineNumber {
+    fn from(value: u32) -> Self {
+        LineNumber(value)
+    }
+}
+
+impl TryFrom<usize> for LineNumber {
+    type Error = std::num::TryFromIntError;
+
+    fn try_from(value: usize) -> Result<Self, Self::Error> {
+        Ok(LineNumber(u32::try_from(value)?))
+    }
+}
+
+impl From<LineNumber> for u32 {
+    fn from(value: LineNumber) -> Self {
+        value.0
+    }
+}
+
+impl From<LineNumber> for usize {
+    fn from(value: LineNumber) -> Self {
+        value.0 as usize
+    }
+}
+
+/// A 0-indexed offset into a line, represented as a number of code units under one of the
+/// three possible [LineOffsetEncoding]s
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+pub struct LineOffset {
+    raw: u32,
+    encoding: LineOffsetEncoding,
+}
+
+impl LineOffset {
+    pub fn new(raw: u32, encoding: LineOffsetEncoding) -> Self {
+        Self { raw, encoding }
+    }
+
+    pub fn raw(&self) -> u32 {
+        self.raw
+    }
+
+    pub fn encoding(&self) -> LineOffsetEncoding {
+        self.encoding
+    }
+}
+
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+pub enum LineOffsetEncoding {
+    /// Preferred encoding, as Rust [String]s are UTF-8
+    UTF8,
+
+    /// UTF-16 is the encoding supported by all LSP clients, but is most expensive to translate
+    UTF16,
+
+    /// Second choice because UTF-32 uses a fixed 4 byte encoding for each code point (makes conversion relatively easy)
+    UTF32,
+}
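
Usage note (not part of the patch): a minimal sketch of how the conversion API added
above composes for LSP position handling, assuming only the exports shown in the doc
examples (`SourceFile`, `SourceLocation`, `LineNumber`, `LineOffset`,
`LineOffsetEncoding`). The round trip goes client position -> byte offset -> client
position.

    use biome_text_size::TextSize;
    use source_file::{LineNumber, LineOffset, LineOffsetEncoding, SourceFile, SourceLocation};

    fn main() {
        // "❤️" is two code points (U+2764, U+FE0F): 6 UTF-8 bytes, 2 UTF-16 units.
        let source = SourceFile::new("x <- '❤️'\ny <- 2\n".to_string());

        // A UTF-16 client reports line 0, character 8: the closing quote.
        let location = SourceLocation::new(
            LineNumber::from(0),
            LineOffset::new(8, LineOffsetEncoding::UTF16),
        );

        // Convert to a byte offset for the parser/formatter side...
        let offset: TextSize = source.offset(location.clone());
        assert_eq!(offset, TextSize::from(12));

        // ...and back to the client's encoding for the response.
        let round_trip = source.source_location(offset, LineOffsetEncoding::UTF16);
        assert_eq!(round_trip, location);
    }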