test: add test cases

h-a-n-a · h-a-n-a · commit 291f60e33196 · 2025-10-22T14:46:24.000+08:00
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
@@ -291,7 +291,7 @@ jobs:
           key: swc-exec-cache-${{ matrix.settings.crate }}-${{ hashFiles('**/Cargo.lock') }}
 
       - name: Run cargo test
-        if: matrix.settings.crate != 'swc_plugin_backend_tests' && matrix.settings.crate != 'swc_ecma_parser' && matrix.settings.crate != 'swc_ecma_minifier' && matrix.settings.crate != 'swc_core' && matrix.settings.crate != 'swc_ecma_quote' && matrix.settings.crate != 'swc_cli' && matrix.settings.crate != 'binding_core_wasm'
+        if: matrix.settings.crate != 'swc_plugin_backend_tests' && matrix.settings.crate != 'swc_ecma_parser' && matrix.settings.crate != 'swc_ecma_minifier' && matrix.settings.crate != 'swc_core' && matrix.settings.crate != 'swc_ecma_quote' && matrix.settings.crate != 'swc_cli' && matrix.settings.crate != 'binding_core_wasm' && matrix.settings.crate != 'hstr'
         run: |
           cargo test -p ${{ matrix.settings.crate }}
 
@@ -317,6 +317,11 @@ jobs:
           # export CARGO_TARGET_DIR=$(pwd)/target
           cargo test -p swc_plugin_backend_tests --release
 
+      - name: Run cargo test (hstr)
+        if: matrix.settings.crate == 'hstr'
+        run: |
+          cargo test -p hstr --features serde
+
       - name: Run cargo test (swc_ecma_minifier)
         if: matrix.settings.crate == 'swc_ecma_minifier'
         run: |
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/crates/hstr/Cargo.toml b/crates/hstr/Cargo.toml
@@ -34,6 +34,7 @@ kstring      = { workspace = true }
 num_cpus     = { workspace = true }
 par-iter     = { workspace = true }
 rand         = { workspace = true }
+serde_json   = { workspace = true }
 smartstring  = { workspace = true }
 smol_str     = { workspace = true }
 string_cache = { workspace = true }
diff --git a/crates/hstr/src/wtf8_atom.rs b/crates/hstr/src/wtf8_atom.rs
@@ -91,13 +91,23 @@ impl serde::ser::Serialize for Wtf8Atom {
 
             while let Some(code_point) = iter.next() {
                 if let Some(c) = code_point.to_char() {
+                    // Escape literal '\u' sequences to avoid ambiguity with surrogate encoding.
+                    // Without this escaping, we couldn't distinguish between:
+                    // - JavaScript's "\uD800" (actual unpaired surrogate)
+                    // - JavaScript's "\\uD800" (literal text '\uD800')
+                    //
+                    // By escaping literal '\u' to '\\u', we ensure:
+                    // - Unpaired surrogates serialize as '\uXXXX'
+                    // - Literal '\u' text serializes as '\\uXXXX'
                     if c == '\\' && iter.peek().map(|cp| cp.to_u32()) == Some('u' as u32) {
                         iter.next(); // skip 'u'
                         result.push_str("\\\\u");
                     } else {
                         result.push(c)
                     }
                 } else {
+                    // Unpaired surrogates can't be represented in valid UTF-8,
+                    // so encode them as '\uXXXX' for JavaScript compatibility
                     result.push_str(format!("\\u{:04X}", code_point.to_u32()).as_str());
                 }
             }
@@ -118,13 +128,18 @@ impl<'de> serde::de::Deserialize<'de> for Wtf8Atom {
         fn convert_wtf8_string_to_wtf8(s: String) -> Wtf8Buf {
             let mut iter = s.chars().peekable();
             let mut result = Wtf8Buf::with_capacity(s.len());
+
+            // This function reverses the encoding done in serialize.
+            // It handles two cases:
+            // 1. '\uXXXX' - Decode as an unpaired surrogate code point
+            // 2. '\\uXXXX' - Treat as literal text '\uXXXX'
             while let Some(c) = iter.next() {
                 if c == '\\' {
                     if iter.peek() == Some(&'u') {
-                        // skip 'u'
-                        let _ = iter.next();
+                        // Found '\u' - might be a surrogate encoding
+                        let _ = iter.next(); // skip 'u'
 
-                        // read 4 hex digits encoded in `Serialize`
+                        // Try to read 4 hex digits
                         let d1 = iter.next();
                         let d2 = iter.next();
                         let d3 = iter.next();
@@ -162,10 +177,11 @@ impl<'de> serde::de::Deserialize<'de> for Wtf8Atom {
                         push_if_some!(d3);
                         push_if_some!(d4);
                     } else if iter.peek() == Some(&'\\') {
-                        // skip '\\'
-                        let _ = iter.next();
+                        // Found '\\' - this is an escaped backslash
+                        // '\\u' should become literal '\u' text
+                        let _ = iter.next(); // skip the second '\'
                         if iter.peek() == Some(&'u') {
-                            let _ = iter.next();
+                            let _ = iter.next(); // skip 'u'
                             result.push_char('\\');
                             result.push_char('u');
                         } else {
@@ -315,3 +331,224 @@ impl Wtf8Atom {
         }
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::wtf8::{CodePoint, Wtf8Buf};
+
+    #[test]
+    fn test_serialize_normal_utf8() {
+        let atom = Wtf8Atom::new("Hello, world!");
+        let serialized = serde_json::to_string(&atom).unwrap();
+        assert_eq!(serialized, "\"Hello, world!\"");
+    }
+
+    #[test]
+    fn test_deserialize_normal_utf8() {
+        let json = "\"Hello, world!\"";
+        let atom: Wtf8Atom = serde_json::from_str(json).unwrap();
+        assert_eq!(atom.as_str(), Some("Hello, world!"));
+    }
+
+    #[test]
+    fn test_serialize_unpaired_high_surrogate() {
+        // Create a WTF-8 string with an unpaired high surrogate (U+D800)
+        let mut wtf8 = Wtf8Buf::new();
+        wtf8.push(unsafe { CodePoint::from_u32_unchecked(0xd800) });
+        let atom = Wtf8Atom::from(wtf8);
+
+        let serialized = serde_json::to_string(&atom).unwrap();
+        // The serialized output will have double escaping due to serde_json
+        assert_eq!(serialized, "\"\\\\uD800\"");
+    }
+
+    #[test]
+    fn test_serialize_unpaired_low_surrogate() {
+        // Create a WTF-8 string with an unpaired low surrogate (U+DC00)
+        let mut wtf8 = Wtf8Buf::new();
+        wtf8.push(unsafe { CodePoint::from_u32_unchecked(0xdc00) });
+        let atom = Wtf8Atom::from(wtf8);
+
+        let serialized = serde_json::to_string(&atom).unwrap();
+        // The serialized output will have double escaping due to serde_json
+        assert_eq!(serialized, "\"\\\\uDC00\"");
+    }
+
+    #[test]
+    fn test_serialize_multiple_surrogates() {
+        // Create a WTF-8 string with multiple unpaired surrogates
+        let mut wtf8 = Wtf8Buf::new();
+        wtf8.push_str("Hello ");
+        wtf8.push(unsafe { CodePoint::from_u32_unchecked(0xd800) });
+        wtf8.push_str(" World ");
+        wtf8.push(unsafe { CodePoint::from_u32_unchecked(0xdc00) });
+        let atom = Wtf8Atom::from(wtf8);
+
+        let serialized = serde_json::to_string(&atom).unwrap();
+        // The serialized output will have double escaping due to serde_json
+        assert_eq!(serialized, "\"Hello \\\\uD800 World \\\\uDC00\"");
+    }
+
+    #[test]
+    fn test_serialize_literal_backslash_u() {
+        // Test that literal "\u" in the string gets escaped properly
+        let atom = Wtf8Atom::new("\\u0041");
+        let serialized = serde_json::to_string(&atom).unwrap();
+        // Our serializer emits '\\u'; serde_json then escapes each backslash (4 total)
+        assert_eq!(serialized, "\"\\\\\\\\u0041\"");
+    }
+
+    #[test]
+    fn test_deserialize_escaped_backslash_u() {
+        // Test deserializing the escaped format for unpaired surrogates
+        let json = "\"\\\\uD800\"";
+        let atom: Wtf8Atom = serde_json::from_str(json).unwrap();
+        // This should be parsed as an unpaired surrogate
+        assert_eq!(atom.as_str(), None);
+        assert_eq!(atom.to_string_lossy(), "\u{FFFD}");
+    }
+
+    #[test]
+    fn test_deserialize_unpaired_surrogates() {
+        let json = "\"\\\\uD800\""; // Use escaped format that matches serialization
+        let atom: Wtf8Atom = serde_json::from_str(json).unwrap();
+        // Should contain an unpaired surrogate, so as_str() returns None
+        assert_eq!(atom.as_str(), None);
+        // But to_string_lossy should work
+        assert_eq!(atom.to_string_lossy(), "\u{FFFD}");
+    }
+
+    #[test]
+    fn test_round_trip_normal_string() {
+        let original = Wtf8Atom::new("Hello, 世界! 🌍");
+        let serialized = serde_json::to_string(&original).unwrap();
+        let deserialized: Wtf8Atom = serde_json::from_str(&serialized).unwrap();
+        assert_eq!(original.as_str(), deserialized.as_str());
+    }
+
+    #[test]
+    fn test_round_trip_unpaired_surrogates() {
+        // Create a string with unpaired surrogates
+        let mut wtf8 = Wtf8Buf::new();
+        wtf8.push_str("Before ");
+        wtf8.push(unsafe { CodePoint::from_u32_unchecked(0xd800) });
+        wtf8.push_str(" Middle ");
+        wtf8.push(unsafe { CodePoint::from_u32_unchecked(0xdc00) });
+        wtf8.push_str(" After");
+        let original = Wtf8Atom::from(wtf8);
+
+        let serialized = serde_json::to_string(&original).unwrap();
+        let deserialized: Wtf8Atom = serde_json::from_str(&serialized).unwrap();
+
+        // Both should be equal when compared as WTF-8
+        assert_eq!(original, deserialized);
+
+        // Both should produce the same lossy string
+        assert_eq!(original.to_string_lossy(), deserialized.to_string_lossy());
+    }
+
+    #[test]
+    fn test_round_trip_mixed_content() {
+        // Create a complex string with normal text, emojis, and unpaired surrogates
+        let mut wtf8 = Wtf8Buf::new();
+        wtf8.push_str("Hello 世界 🌍 ");
+        wtf8.push(unsafe { CodePoint::from_u32_unchecked(0xd83d) }); // Unpaired high
+        wtf8.push_str(" test ");
+        wtf8.push(unsafe { CodePoint::from_u32_unchecked(0xdca9) }); // Unpaired low
+        let original = Wtf8Atom::from(wtf8);
+
+        let serialized = serde_json::to_string(&original).unwrap();
+        let deserialized: Wtf8Atom = serde_json::from_str(&serialized).unwrap();
+
+        assert_eq!(original, deserialized);
+    }
+
+    #[test]
+    fn test_empty_string() {
+        let atom = Wtf8Atom::new("");
+        let serialized = serde_json::to_string(&atom).unwrap();
+        assert_eq!(serialized, "\"\"");
+
+        let deserialized: Wtf8Atom = serde_json::from_str("\"\"").unwrap();
+        assert_eq!(deserialized.as_str(), Some(""));
+    }
+
+    #[test]
+    fn test_special_characters() {
+        let test_cases = vec![
+            ("\"", "\"\\\"\""),
+            ("\n\r\t", "\"\\n\\r\\t\""), // serde_json escapes control characters
+            ("\\", "\"\\\\\""),
+            ("/", "\"/\""),
+        ];
+
+        for (input, expected) in test_cases {
+            let atom = Wtf8Atom::new(input);
+            let serialized = serde_json::to_string(&atom).unwrap();
+            assert_eq!(serialized, expected, "Failed for input: {:?}", input);
+
+            let deserialized: Wtf8Atom = serde_json::from_str(&serialized).unwrap();
+            assert_eq!(deserialized.as_str(), Some(input));
+        }
+    }
+
+    #[test]
+    fn test_consecutive_surrogates_not_paired() {
+        // Test that consecutive surrogates that don't form a valid pair
+        // are handled correctly
+        let mut wtf8 = Wtf8Buf::new();
+        wtf8.push(unsafe { CodePoint::from_u32_unchecked(0xd800) }); // High surrogate
+        wtf8.push(unsafe { CodePoint::from_u32_unchecked(0xd800) }); // Another high surrogate
+        let atom = Wtf8Atom::from(wtf8);
+
+        let serialized = serde_json::to_string(&atom).unwrap();
+        // The serialized output will have double escaping due to serde_json
+        assert_eq!(serialized, "\"\\\\uD800\\\\uD800\"");
+
+        let deserialized: Wtf8Atom = serde_json::from_str(&serialized).unwrap();
+        assert_eq!(atom, deserialized);
+    }
+
+    #[test]
+    fn test_deserialize_incomplete_escape() {
+        // Test handling of incomplete escape sequences from our custom format
+        let json = "\"\\\\\\\\u123\""; // Escaped backslash + incomplete sequence
+        let atom: Wtf8Atom = serde_json::from_str(json).unwrap();
+        // JSON decodes \\\\u123 to \\u123; our deserializer then unescapes '\\u' to
+        // a literal '\u', so the result is the text \u123
+        assert_eq!(atom.as_str(), Some("\\u123"));
+    }
+
+    #[test]
+    fn test_deserialize_invalid_hex() {
+        // Test handling of invalid hex in escape sequences from our custom format
+        let json = "\"\\\\\\\\uGGGG\""; // Escaped backslash + invalid hex
+        let atom: Wtf8Atom = serde_json::from_str(json).unwrap();
+        // JSON decodes \\\\uGGGG to \\uGGGG; our deserializer then unescapes '\\u'
+        // to a literal '\u', so the result is the text \uGGGG
+        assert_eq!(atom.as_str(), Some("\\uGGGG"));
+    }
+
+    #[test]
+    fn test_try_into_atom_valid_utf8() {
+        let wtf8_atom = Wtf8Atom::new("Valid UTF-8 string");
+        let result = wtf8_atom.try_into_atom();
+        assert!(result.is_ok());
+        assert_eq!(result.unwrap().as_str(), "Valid UTF-8 string");
+    }
+
+    #[test]
+    fn test_try_into_atom_invalid_utf8() {
+        // Create an atom with unpaired surrogates
+        let mut wtf8 = Wtf8Buf::new();
+        wtf8.push(unsafe { CodePoint::from_u32_unchecked(0xd800) });
+        let wtf8_atom = Wtf8Atom::from(wtf8);
+
+        let result = wtf8_atom.try_into_atom();
+        assert!(result.is_err());
+        // Should return the original Wtf8Atom
+        let err_atom = result.unwrap_err();
+        assert_eq!(err_atom.to_string_lossy(), "\u{FFFD}");
+    }
+}