Skip to content

Commit 291f60e

Browse files
committed
test: add test cases
1 parent 3893ab8 commit 291f60e

File tree

4 files changed

+251
-7
lines changed

4 files changed

+251
-7
lines changed

.github/workflows/CI.yml

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -291,7 +291,7 @@ jobs:
291291
key: swc-exec-cache-${{ matrix.settings.crate }}-${{ hashFiles('**/Cargo.lock') }}
292292

293293
- name: Run cargo test
294-
if: matrix.settings.crate != 'swc_plugin_backend_tests' && matrix.settings.crate != 'swc_ecma_parser' && matrix.settings.crate != 'swc_ecma_minifier' && matrix.settings.crate != 'swc_core' && matrix.settings.crate != 'swc_ecma_quote' && matrix.settings.crate != 'swc_cli' && matrix.settings.crate != 'binding_core_wasm'
294+
if: matrix.settings.crate != 'swc_plugin_backend_tests' && matrix.settings.crate != 'swc_ecma_parser' && matrix.settings.crate != 'swc_ecma_minifier' && matrix.settings.crate != 'swc_core' && matrix.settings.crate != 'swc_ecma_quote' && matrix.settings.crate != 'swc_cli' && matrix.settings.crate != 'binding_core_wasm' && matrix.settings.crate != 'hstr'
295295
run: |
296296
cargo test -p ${{ matrix.settings.crate }}
297297
@@ -317,6 +317,11 @@ jobs:
317317
# export CARGO_TARGET_DIR=$(pwd)/target
318318
cargo test -p swc_plugin_backend_tests --release
319319
320+
- name: Run cargo test (hstr)
321+
if: matrix.settings.crate == 'hstr'
322+
run: |
323+
cargo test -p hstr --features serde
324+
320325
- name: Run cargo test (swc_ecma_minifier)
321326
if: matrix.settings.crate == 'swc_ecma_minifier'
322327
run: |

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/hstr/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -34,6 +34,7 @@ kstring = { workspace = true }
3434
num_cpus = { workspace = true }
3535
par-iter = { workspace = true }
3636
rand = { workspace = true }
37+
serde_json = { workspace = true }
3738
smartstring = { workspace = true }
3839
smol_str = { workspace = true }
3940
string_cache = { workspace = true }

crates/hstr/src/wtf8_atom.rs

Lines changed: 243 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -91,13 +91,23 @@ impl serde::ser::Serialize for Wtf8Atom {
9191

9292
while let Some(code_point) = iter.next() {
9393
if let Some(c) = code_point.to_char() {
94+
// Escape literal '\u' sequences to avoid ambiguity with surrogate encoding.
95+
// Without this escaping, we couldn't distinguish between:
96+
// - JavaScript's "\uD800" (actual unpaired surrogate)
97+
// - JavaScript's "\\uD800" (literal text '\uD800')
98+
//
99+
// By escaping literal '\u' to '\\u', we ensure:
100+
// - Unpaired surrogates serialize as '\uXXXX'
101+
// - Literal '\u' text serializes as '\\uXXXX'
94102
if c == '\\' && iter.peek().map(|cp| cp.to_u32()) == Some('u' as u32) {
95103
iter.next(); // skip 'u'
96104
result.push_str("\\\\u");
97105
} else {
98106
result.push(c)
99107
}
100108
} else {
109+
// Unpaired surrogates can't be represented in valid UTF-8,
110+
// so encode them as '\uXXXX' for JavaScript compatibility
101111
result.push_str(format!("\\u{:04X}", code_point.to_u32()).as_str());
102112
}
103113
}
@@ -118,13 +128,18 @@ impl<'de> serde::de::Deserialize<'de> for Wtf8Atom {
118128
fn convert_wtf8_string_to_wtf8(s: String) -> Wtf8Buf {
119129
let mut iter = s.chars().peekable();
120130
let mut result = Wtf8Buf::with_capacity(s.len());
131+
132+
// This function reverses the encoding done in serialize.
133+
// It handles two cases:
134+
// 1. '\uXXXX' - Decode as an unpaired surrogate code point
135+
// 2. '\\uXXXX' - Treat as literal text '\uXXXX'
121136
while let Some(c) = iter.next() {
122137
if c == '\\' {
123138
if iter.peek() == Some(&'u') {
124-
// skip 'u'
125-
let _ = iter.next();
139+
// Found '\u' - might be a surrogate encoding
140+
let _ = iter.next(); // skip 'u'
126141

127-
// read 4 hex digits encoded in `Serialize`
142+
// Try to read 4 hex digits
128143
let d1 = iter.next();
129144
let d2 = iter.next();
130145
let d3 = iter.next();
@@ -162,10 +177,11 @@ impl<'de> serde::de::Deserialize<'de> for Wtf8Atom {
162177
push_if_some!(d3);
163178
push_if_some!(d4);
164179
} else if iter.peek() == Some(&'\\') {
165-
// skip '\\'
166-
let _ = iter.next();
180+
// Found '\\' - this is an escaped backslash
181+
// '\\u' should become literal '\u' text
182+
let _ = iter.next(); // skip the second '\'
167183
if iter.peek() == Some(&'u') {
168-
let _ = iter.next();
184+
let _ = iter.next(); // skip 'u'
169185
result.push_char('\\');
170186
result.push_char('u');
171187
} else {
@@ -315,3 +331,224 @@ impl Wtf8Atom {
315331
}
316332
}
317333
}
334+
335+
#[cfg(test)]
336+
mod tests {
337+
use super::*;
338+
use crate::wtf8::{CodePoint, Wtf8Buf};
339+
340+
#[test]
341+
fn test_serialize_normal_utf8() {
342+
let atom = Wtf8Atom::new("Hello, world!");
343+
let serialized = serde_json::to_string(&atom).unwrap();
344+
assert_eq!(serialized, "\"Hello, world!\"");
345+
}
346+
347+
#[test]
348+
fn test_deserialize_normal_utf8() {
349+
let json = "\"Hello, world!\"";
350+
let atom: Wtf8Atom = serde_json::from_str(json).unwrap();
351+
assert_eq!(atom.as_str(), Some("Hello, world!"));
352+
}
353+
354+
#[test]
355+
fn test_serialize_unpaired_high_surrogate() {
356+
// Create a WTF-8 string with an unpaired high surrogate (U+D800)
357+
let mut wtf8 = Wtf8Buf::new();
358+
wtf8.push(unsafe { CodePoint::from_u32_unchecked(0xd800) });
359+
let atom = Wtf8Atom::from(wtf8);
360+
361+
let serialized = serde_json::to_string(&atom).unwrap();
362+
// Our serializer emits the text \uD800; serde_json then escapes that backslash, so the JSON text contains \\uD800
363+
assert_eq!(serialized, "\"\\\\uD800\"");
364+
}
365+
366+
#[test]
367+
fn test_serialize_unpaired_low_surrogate() {
368+
// Create a WTF-8 string with an unpaired low surrogate (U+DC00)
369+
let mut wtf8 = Wtf8Buf::new();
370+
wtf8.push(unsafe { CodePoint::from_u32_unchecked(0xdc00) });
371+
let atom = Wtf8Atom::from(wtf8);
372+
373+
let serialized = serde_json::to_string(&atom).unwrap();
374+
// Our serializer emits the text \uDC00; serde_json then escapes that backslash, so the JSON text contains \\uDC00
375+
assert_eq!(serialized, "\"\\\\uDC00\"");
376+
}
377+
378+
#[test]
379+
fn test_serialize_multiple_surrogates() {
380+
// Create a WTF-8 string with multiple unpaired surrogates
381+
let mut wtf8 = Wtf8Buf::new();
382+
wtf8.push_str("Hello ");
383+
wtf8.push(unsafe { CodePoint::from_u32_unchecked(0xd800) });
384+
wtf8.push_str(" World ");
385+
wtf8.push(unsafe { CodePoint::from_u32_unchecked(0xdc00) });
386+
let atom = Wtf8Atom::from(wtf8);
387+
388+
let serialized = serde_json::to_string(&atom).unwrap();
389+
// Our serializer emits each surrogate as the text \uXXXX; serde_json then escapes each backslash, giving \\uXXXX in the JSON text
390+
assert_eq!(serialized, "\"Hello \\\\uD800 World \\\\uDC00\"");
391+
}
392+
393+
#[test]
394+
fn test_serialize_literal_backslash_u() {
395+
// Test that literal "\u" in the string gets escaped properly
396+
let atom = Wtf8Atom::new("\\u0041");
397+
let serialized = serde_json::to_string(&atom).unwrap();
398+
// Our serializer doubles the backslash (\u -> \\u) and serde_json then escapes each one, resulting in 4 backslashes
399+
assert_eq!(serialized, "\"\\\\\\\\u0041\"");
400+
}
401+
402+
#[test]
403+
fn test_deserialize_escaped_backslash_u() {
404+
// Test deserializing the escaped format for unpaired surrogates
405+
let json = "\"\\\\uD800\"";
406+
let atom: Wtf8Atom = serde_json::from_str(json).unwrap();
407+
// This should be parsed as an unpaired surrogate
408+
assert_eq!(atom.as_str(), None);
409+
assert_eq!(atom.to_string_lossy(), "\u{FFFD}");
410+
}
411+
412+
#[test]
413+
fn test_deserialize_unpaired_surrogates() {
414+
let json = "\"\\\\uD800\""; // Use escaped format that matches serialization
415+
let atom: Wtf8Atom = serde_json::from_str(json).unwrap();
416+
// Should contain an unpaired surrogate, so as_str() returns None
417+
assert_eq!(atom.as_str(), None);
418+
// But to_string_lossy should work
419+
assert_eq!(atom.to_string_lossy(), "\u{FFFD}");
420+
}
421+
422+
#[test]
423+
fn test_round_trip_normal_string() {
424+
let original = Wtf8Atom::new("Hello, 世界! 🌍");
425+
let serialized = serde_json::to_string(&original).unwrap();
426+
let deserialized: Wtf8Atom = serde_json::from_str(&serialized).unwrap();
427+
assert_eq!(original.as_str(), deserialized.as_str());
428+
}
429+
430+
#[test]
431+
fn test_round_trip_unpaired_surrogates() {
432+
// Create a string with unpaired surrogates
433+
let mut wtf8 = Wtf8Buf::new();
434+
wtf8.push_str("Before ");
435+
wtf8.push(unsafe { CodePoint::from_u32_unchecked(0xd800) });
436+
wtf8.push_str(" Middle ");
437+
wtf8.push(unsafe { CodePoint::from_u32_unchecked(0xdc00) });
438+
wtf8.push_str(" After");
439+
let original = Wtf8Atom::from(wtf8);
440+
441+
let serialized = serde_json::to_string(&original).unwrap();
442+
let deserialized: Wtf8Atom = serde_json::from_str(&serialized).unwrap();
443+
444+
// Both should be equal when compared as WTF-8
445+
assert_eq!(original, deserialized);
446+
447+
// Both should produce the same lossy string
448+
assert_eq!(original.to_string_lossy(), deserialized.to_string_lossy());
449+
}
450+
451+
#[test]
452+
fn test_round_trip_mixed_content() {
453+
// Create a complex string with normal text, emojis, and unpaired surrogates
454+
let mut wtf8 = Wtf8Buf::new();
455+
wtf8.push_str("Hello 世界 🌍 ");
456+
wtf8.push(unsafe { CodePoint::from_u32_unchecked(0xd83d) }); // Unpaired high
457+
wtf8.push_str(" test ");
458+
wtf8.push(unsafe { CodePoint::from_u32_unchecked(0xdca9) }); // Unpaired low
459+
let original = Wtf8Atom::from(wtf8);
460+
461+
let serialized = serde_json::to_string(&original).unwrap();
462+
let deserialized: Wtf8Atom = serde_json::from_str(&serialized).unwrap();
463+
464+
assert_eq!(original, deserialized);
465+
}
466+
467+
#[test]
468+
fn test_empty_string() {
469+
let atom = Wtf8Atom::new("");
470+
let serialized = serde_json::to_string(&atom).unwrap();
471+
assert_eq!(serialized, "\"\"");
472+
473+
let deserialized: Wtf8Atom = serde_json::from_str("\"\"").unwrap();
474+
assert_eq!(deserialized.as_str(), Some(""));
475+
}
476+
477+
#[test]
478+
fn test_special_characters() {
479+
let test_cases = vec![
480+
("\"", "\"\\\"\""),
481+
("\n\r\t", "\"\\n\\r\\t\""), // serde_json escapes control characters
482+
("\\", "\"\\\\\""),
483+
("/", "\"/\""),
484+
];
485+
486+
for (input, expected) in test_cases {
487+
let atom = Wtf8Atom::new(input);
488+
let serialized = serde_json::to_string(&atom).unwrap();
489+
assert_eq!(serialized, expected, "Failed for input: {:?}", input);
490+
491+
let deserialized: Wtf8Atom = serde_json::from_str(&serialized).unwrap();
492+
assert_eq!(deserialized.as_str(), Some(input));
493+
}
494+
}
495+
496+
#[test]
497+
fn test_consecutive_surrogates_not_paired() {
498+
// Test that consecutive surrogates that don't form a valid pair
499+
// are handled correctly
500+
let mut wtf8 = Wtf8Buf::new();
501+
wtf8.push(unsafe { CodePoint::from_u32_unchecked(0xd800) }); // High surrogate
502+
wtf8.push(unsafe { CodePoint::from_u32_unchecked(0xd800) }); // Another high surrogate
503+
let atom = Wtf8Atom::from(wtf8);
504+
505+
let serialized = serde_json::to_string(&atom).unwrap();
506+
// Our serializer emits each surrogate as the text \uD800; serde_json then escapes each backslash, giving \\uD800\\uD800 in the JSON text
507+
assert_eq!(serialized, "\"\\\\uD800\\\\uD800\"");
508+
509+
let deserialized: Wtf8Atom = serde_json::from_str(&serialized).unwrap();
510+
assert_eq!(atom, deserialized);
511+
}
512+
513+
#[test]
514+
fn test_deserialize_incomplete_escape() {
515+
// Test handling of incomplete escape sequences from our custom format
516+
let json = "\"\\\\\\\\u123\""; // Escaped backslash + incomplete sequence
517+
let atom: Wtf8Atom = serde_json::from_str(json).unwrap();
518+
// JSON decodes \\\\u123 to \\u123; our deserializer then unescapes the leading \\u
519+
// to a literal \u, so the result is the text \u123 (no hex digits are parsed)
520+
assert_eq!(atom.as_str(), Some("\\u123"));
521+
}
522+
523+
#[test]
524+
fn test_deserialize_invalid_hex() {
525+
// Test handling of invalid hex in escape sequences from our custom format
526+
let json = "\"\\\\\\\\uGGGG\""; // Escaped backslash + invalid hex
527+
let atom: Wtf8Atom = serde_json::from_str(json).unwrap();
528+
// JSON decodes \\\\uGGGG to \\uGGGG; our deserializer then unescapes the leading \\u
528+
// to a literal \u, so the result is the text \uGGGG (no hex digits are parsed)
530+
assert_eq!(atom.as_str(), Some("\\uGGGG"));
531+
}
532+
533+
#[test]
534+
fn test_try_into_atom_valid_utf8() {
535+
let wtf8_atom = Wtf8Atom::new("Valid UTF-8 string");
536+
let result = wtf8_atom.try_into_atom();
537+
assert!(result.is_ok());
538+
assert_eq!(result.unwrap().as_str(), "Valid UTF-8 string");
539+
}
540+
541+
#[test]
542+
fn test_try_into_atom_invalid_utf8() {
543+
// Create an atom with unpaired surrogates
544+
let mut wtf8 = Wtf8Buf::new();
545+
wtf8.push(unsafe { CodePoint::from_u32_unchecked(0xd800) });
546+
let wtf8_atom = Wtf8Atom::from(wtf8);
547+
548+
let result = wtf8_atom.try_into_atom();
549+
assert!(result.is_err());
550+
// Should return the original Wtf8Atom
551+
let err_atom = result.unwrap_err();
552+
assert_eq!(err_atom.to_string_lossy(), "\u{FFFD}");
553+
}
554+
}

0 commit comments

Comments
 (0)