|
| 1 | +//! The crate contains tools for converting between byte offsets and line / column positions. |
| 2 | +
|
| 3 | +#![deny(clippy::use_self)] |
| 4 | + |
| 5 | +use biome_text_size::TextSize; |
| 6 | + |
| 7 | +mod line_index; |
| 8 | + |
| 9 | +pub use line_index::LineIndex; |
| 10 | + |
/// The wide (non-UTF-8) encodings a column can be measured in.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum WideEncoding {
    /// Lengths are counted in UTF-16 code units.
    Utf16,
    /// Lengths are counted in UTF-32 code units.
    Utf32,
}
| 16 | + |
/// A line / column position whose column is measured in UTF-8 code units.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct LineCol {
    /// Line number, starting at zero.
    pub line: u32,
    /// UTF-8 offset inside the line, starting at zero.
    pub col: u32,
}
| 24 | + |
/// A line / column position for use with a [`WideEncoding`].
///
/// This is intentionally its own concrete type, rather than a generic one
/// shared with `LineCol`, so the two kinds of position stay distinct.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct WideLineCol {
    /// Line number, starting at zero.
    pub line: u32,
    /// Column inside the line, starting at zero.
    pub col: u32,
}
| 33 | + |
| 34 | +#[derive(Clone, Debug, Hash, PartialEq, Eq)] |
| 35 | +pub struct WideChar { |
| 36 | + /// Start offset of a character inside a line, zero-based |
| 37 | + pub start: TextSize, |
| 38 | + /// End offset of a character inside a line, zero-based |
| 39 | + pub end: TextSize, |
| 40 | +} |
| 41 | + |
| 42 | +impl WideChar { |
| 43 | + /// Returns the length in 8-bit UTF-8 code units. |
| 44 | + fn len(&self) -> TextSize { |
| 45 | + self.end - self.start |
| 46 | + } |
| 47 | + |
| 48 | + /// Returns the length in UTF-16 or UTF-32 code units. |
| 49 | + fn wide_len(&self, enc: WideEncoding) -> usize { |
| 50 | + match enc { |
| 51 | + WideEncoding::Utf16 => { |
| 52 | + if self.len() == TextSize::from(4) { |
| 53 | + 2 |
| 54 | + } else { |
| 55 | + 1 |
| 56 | + } |
| 57 | + } |
| 58 | + |
| 59 | + WideEncoding::Utf32 => 1, |
| 60 | + } |
| 61 | + } |
| 62 | +} |
| 63 | + |
#[cfg(test)]
mod tests {
    use crate::WideEncoding::{Utf16, Utf32};
    use crate::WideLineCol;
    use crate::line_index::LineIndex;
    use crate::{LineCol, WideEncoding};
    use biome_text_size::TextSize;

    /// Checks a UTF-16 position against a byte offset in both directions.
    ///
    /// Forward: `wide_line_col` is converted to a UTF-8 `LineCol` and then to
    /// an offset, which must equal `text_size`. Backward: that offset is
    /// converted to a `LineCol` and widened again, which must give back
    /// `wide_line_col`.
    ///
    /// `#[track_caller]` makes a failing assertion point at the test line that
    /// called this helper rather than at the helper itself.
    #[track_caller]
    fn check_conversion(line_index: &LineIndex, wide_line_col: WideLineCol, text_size: TextSize) {
        let encoding = WideEncoding::Utf16;

        let line_col = line_index.to_utf8(encoding, wide_line_col);
        let offset = line_index.offset(line_col);
        assert_eq!(offset, Some(text_size));

        let line_col = line_index.line_col(offset.unwrap());
        let round_tripped = line_index.to_wide(encoding, line_col.unwrap());
        assert_eq!(round_tripped, Some(wide_line_col));
    }

    /// The start of an empty text is a valid position.
    #[test]
    fn empty_string() {
        let line_index = LineIndex::new("");
        check_conversion(&line_index, WideLineCol { line: 0, col: 0 }, TextSize::from(0));
    }

    /// Column 0 of a line that contains nothing but its newline.
    #[test]
    fn empty_line() {
        let line_index = LineIndex::new("\n\n");
        check_conversion(&line_index, WideLineCol { line: 1, col: 0 }, TextSize::from(1));
    }

    /// The position just past the last character of a line (before its newline).
    #[test]
    fn line_end() {
        let line_index = LineIndex::new("abc\ndef\nghi");
        check_conversion(&line_index, WideLineCol { line: 1, col: 3 }, TextSize::from(7));
    }

    /// A line number past the end of the text yields no offset.
    #[test]
    fn out_of_bounds_line() {
        let line_index = LineIndex::new("abcde\nfghij\n");

        let offset = line_index.offset(LineCol { line: 5, col: 0 });
        assert!(offset.is_none());
    }

    /// Columns before, inside and after a run of non-ASCII characters.
    #[test]
    fn unicode() {
        // NOTE(review): the expected offsets below advance by 3 bytes per column
        // for columns 12, 13 and 14, i.e. the dash *and* the two separators
        // around it are three-byte characters. Do not normalise them to ASCII
        // spaces when editing this literal.
        let line_index = LineIndex::new("'Jan 1, 2018 – Jan 1, 2019'");

        check_conversion(&line_index, WideLineCol { line: 0, col: 0 }, TextSize::from(0));
        check_conversion(&line_index, WideLineCol { line: 0, col: 1 }, TextSize::from(1));
        check_conversion(&line_index, WideLineCol { line: 0, col: 12 }, TextSize::from(12));
        check_conversion(&line_index, WideLineCol { line: 0, col: 13 }, TextSize::from(15));
        check_conversion(&line_index, WideLineCol { line: 0, col: 14 }, TextSize::from(18));
        check_conversion(&line_index, WideLineCol { line: 0, col: 15 }, TextSize::from(21));
        check_conversion(&line_index, WideLineCol { line: 0, col: 26 }, TextSize::from(32));
        check_conversion(&line_index, WideLineCol { line: 0, col: 27 }, TextSize::from(33));
    }

    /// Walks a text made of every `char` below `char::MAX` and checks, at each
    /// character boundary, that offset <-> `LineCol` <-> `WideLineCol`
    /// conversions agree with positions tracked by hand.
    // NOTE(review): ignored by default; presumably because of its runtime — confirm.
    #[ignore]
    #[test]
    fn test_every_chars() {
        let text: String = {
            // The range is exclusive, so `char::MAX` itself is not part of the text.
            let mut chars: Vec<char> = ('\0'..char::MAX).collect();
            // Extra line breaks are appended after all other characters.
            chars.extend("\n".repeat(chars.len() / 16).chars());
            chars.into_iter().collect()
        };

        let line_index = LineIndex::new(&text);

        // Expected position of the current character, maintained manually.
        let mut lin_col = LineCol { line: 0, col: 0 };
        let mut col_utf16 = 0;
        let mut col_utf32 = 0;
        for (offset, ch) in text.char_indices() {
            let got_offset = line_index.offset(lin_col).unwrap();
            assert_eq!(usize::from(got_offset), offset);

            let got_lin_col = line_index.line_col(got_offset).unwrap();
            assert_eq!(got_lin_col, lin_col);

            for enc in [Utf16, Utf32] {
                let wide_lin_col = line_index.to_wide(enc, lin_col).unwrap();
                let got_lin_col = line_index.to_utf8(enc, wide_lin_col);
                assert_eq!(got_lin_col, lin_col);

                let want_col = match enc {
                    Utf16 => col_utf16,
                    Utf32 => col_utf32,
                };
                assert_eq!(wide_lin_col.col, want_col);
            }

            if ch == '\n' {
                // A newline starts the next line: every column counter resets.
                lin_col.line += 1;
                lin_col.col = 0;
                col_utf16 = 0;
                col_utf32 = 0;
            } else {
                lin_col.col += ch.len_utf8() as u32;
                col_utf16 += ch.len_utf16() as u32;
                col_utf32 += 1;
            }
        }
    }
}
0 commit comments