@@ -91,13 +91,23 @@ impl serde::ser::Serialize for Wtf8Atom {
9191
9292 while let Some ( code_point) = iter. next ( ) {
9393 if let Some ( c) = code_point. to_char ( ) {
94+ // Escape literal '\u' sequences to avoid ambiguity with surrogate encoding.
95+ // Without this escaping, we couldn't distinguish between:
96+ // - JavaScript's "\uD800" (actual unpaired surrogate)
97+ // - JavaScript's "\\uD800" (literal text '\uD800')
98+ //
99+ // By escaping literal '\u' to '\\u', we ensure:
100+ // - Unpaired surrogates serialize as '\uXXXX'
101+ // - Literal '\u' text serializes as '\\uXXXX'
94102 if c == '\\' && iter. peek ( ) . map ( |cp| cp. to_u32 ( ) ) == Some ( 'u' as u32 ) {
95103 iter. next ( ) ; // skip 'u'
96104 result. push_str ( "\\ \\ u" ) ;
97105 } else {
98106 result. push ( c)
99107 }
100108 } else {
109+ // Unpaired surrogates can't be represented in valid UTF-8,
110+ // so encode them as '\uXXXX' for JavaScript compatibility
101111 result. push_str ( format ! ( "\\ u{:04X}" , code_point. to_u32( ) ) . as_str ( ) ) ;
102112 }
103113 }
@@ -118,13 +128,18 @@ impl<'de> serde::de::Deserialize<'de> for Wtf8Atom {
118128 fn convert_wtf8_string_to_wtf8 ( s : String ) -> Wtf8Buf {
119129 let mut iter = s. chars ( ) . peekable ( ) ;
120130 let mut result = Wtf8Buf :: with_capacity ( s. len ( ) ) ;
131+
132+ // This function reverses the encoding done in serialize.
133+ // It handles two cases:
134+ // 1. '\uXXXX' - Decode as an unpaired surrogate code point
135+ // 2. '\\uXXXX' - Treat as literal text '\uXXXX'
121136 while let Some ( c) = iter. next ( ) {
122137 if c == '\\' {
123138 if iter. peek ( ) == Some ( & 'u' ) {
124- // skip 'u'
125- let _ = iter. next ( ) ;
139+ // Found '\u' - might be a surrogate encoding
140+ let _ = iter. next ( ) ; // skip 'u'
126141
127- // read 4 hex digits encoded in `Serialize`
142+ // Try to read 4 hex digits
128143 let d1 = iter. next ( ) ;
129144 let d2 = iter. next ( ) ;
130145 let d3 = iter. next ( ) ;
@@ -162,10 +177,11 @@ impl<'de> serde::de::Deserialize<'de> for Wtf8Atom {
162177 push_if_some ! ( d3) ;
163178 push_if_some ! ( d4) ;
164179 } else if iter. peek ( ) == Some ( & '\\' ) {
165- // skip '\\'
166- let _ = iter. next ( ) ;
180+ // Found '\\' - this is an escaped backslash
181+ // '\\u' should become literal '\u' text
182+ let _ = iter. next ( ) ; // skip the second '\'
167183 if iter. peek ( ) == Some ( & 'u' ) {
168- let _ = iter. next ( ) ;
184+ let _ = iter. next ( ) ; // skip 'u'
169185 result. push_char ( '\\' ) ;
170186 result. push_char ( 'u' ) ;
171187 } else {
@@ -315,3 +331,224 @@ impl Wtf8Atom {
315331 }
316332 }
317333}
334+
#[cfg(test)]
mod tests {
    //! Serde round-trip tests for `Wtf8Atom`.
    //!
    //! The custom string format under test uses two escapes *before* JSON
    //! escaping is applied by `serde_json`:
    //! - an unpaired surrogate is written as the six characters `\uXXXX`;
    //! - a literal `\u` in the text is written as `\\u`.
    //!
    //! `serde_json` then escapes every backslash once more, which is why the
    //! expected JSON below contains two backslashes for a surrogate and four
    //! for a literal `\u`.

    use super::*;
    use crate::wtf8::{CodePoint, Wtf8Buf};

    /// Builds a lone surrogate code point for use in test fixtures.
    ///
    /// Panics if `value` is not in the surrogate range `0xD800..=0xDFFF`.
    fn surrogate(value: u32) -> CodePoint {
        assert!(
            (0xD800..=0xDFFF).contains(&value),
            "not a surrogate: {value:#X}"
        );
        // SAFETY: the assertion above bounds `value` to 0xD800..=0xDFFF, which
        // is within 0..=0x10FFFF. That upper bound is presumed to be the only
        // requirement of `from_u32_unchecked` (as in the upstream `wtf8` crate)
        // - confirm against `crate::wtf8`.
        unsafe { CodePoint::from_u32_unchecked(value) }
    }

    #[test]
    fn test_serialize_normal_utf8() {
        let atom = Wtf8Atom::new("Hello, world!");
        let serialized = serde_json::to_string(&atom).unwrap();
        assert_eq!(serialized, "\"Hello, world!\"");
    }

    #[test]
    fn test_deserialize_normal_utf8() {
        let json = "\"Hello, world!\"";
        let atom: Wtf8Atom = serde_json::from_str(json).unwrap();
        assert_eq!(atom.as_str(), Some("Hello, world!"));
    }

    #[test]
    fn test_serialize_unpaired_high_surrogate() {
        // A WTF-8 string holding only an unpaired high surrogate (U+D800).
        let mut wtf8 = Wtf8Buf::new();
        wtf8.push(surrogate(0xd800));
        let atom = Wtf8Atom::from(wtf8);

        let serialized = serde_json::to_string(&atom).unwrap();
        // Our encoder emits `\uD800`; serde_json escapes that backslash again.
        assert_eq!(serialized, "\"\\\\uD800\"");
    }

    #[test]
    fn test_serialize_unpaired_low_surrogate() {
        // A WTF-8 string holding only an unpaired low surrogate (U+DC00).
        let mut wtf8 = Wtf8Buf::new();
        wtf8.push(surrogate(0xdc00));
        let atom = Wtf8Atom::from(wtf8);

        let serialized = serde_json::to_string(&atom).unwrap();
        // Our encoder emits `\uDC00`; serde_json escapes that backslash again.
        assert_eq!(serialized, "\"\\\\uDC00\"");
    }

    #[test]
    fn test_serialize_multiple_surrogates() {
        // Unpaired surrogates interleaved with ordinary text.
        let mut wtf8 = Wtf8Buf::new();
        wtf8.push_str("Hello ");
        wtf8.push(surrogate(0xd800));
        wtf8.push_str(" World ");
        wtf8.push(surrogate(0xdc00));
        let atom = Wtf8Atom::from(wtf8);

        let serialized = serde_json::to_string(&atom).unwrap();
        // Each surrogate becomes `\uXXXX`, then serde_json doubles the backslash.
        assert_eq!(serialized, "\"Hello \\\\uD800 World \\\\uDC00\"");
    }

    #[test]
    fn test_serialize_literal_backslash_u() {
        // A literal `\u` in the text must not be confused with a surrogate.
        let atom = Wtf8Atom::new("\\u0041");
        let serialized = serde_json::to_string(&atom).unwrap();
        // Our encoder turns `\u` into `\\u`; serde_json doubles both
        // backslashes, giving four in the JSON text.
        assert_eq!(serialized, "\"\\\\\\\\u0041\"");
    }

    #[test]
    fn test_deserialize_escaped_backslash_u() {
        // Inverse of `test_serialize_literal_backslash_u`: the JSON carries
        // four backslashes, serde_json decodes them to `\\u0041`, and the
        // custom decoder must collapse `\\u` into the literal text `\u`
        // instead of producing a code point.
        let json = "\"\\\\\\\\u0041\"";
        let atom: Wtf8Atom = serde_json::from_str(json).unwrap();
        assert_eq!(atom.as_str(), Some("\\u0041"));
    }

    #[test]
    fn test_deserialize_unpaired_surrogates() {
        // Same shape the serializer produces for a lone U+D800.
        let json = "\"\\\\uD800\"";
        let atom: Wtf8Atom = serde_json::from_str(json).unwrap();
        // An unpaired surrogate is not valid UTF-8, so there is no `&str` view...
        assert_eq!(atom.as_str(), None);
        // ...but the lossy conversion substitutes U+FFFD.
        assert_eq!(atom.to_string_lossy(), "\u{FFFD}");
    }

    #[test]
    fn test_round_trip_normal_string() {
        let original = Wtf8Atom::new("Hello, 世界! 🌍");
        let serialized = serde_json::to_string(&original).unwrap();
        let deserialized: Wtf8Atom = serde_json::from_str(&serialized).unwrap();
        assert_eq!(original.as_str(), deserialized.as_str());
    }

    #[test]
    fn test_round_trip_unpaired_surrogates() {
        // Unpaired surrogates surrounded by ordinary text.
        let mut wtf8 = Wtf8Buf::new();
        wtf8.push_str("Before ");
        wtf8.push(surrogate(0xd800));
        wtf8.push_str(" Middle ");
        wtf8.push(surrogate(0xdc00));
        wtf8.push_str(" After");
        let original = Wtf8Atom::from(wtf8);

        let serialized = serde_json::to_string(&original).unwrap();
        let deserialized: Wtf8Atom = serde_json::from_str(&serialized).unwrap();

        // Equal as WTF-8 values.
        assert_eq!(original, deserialized);

        // And equal after lossy conversion.
        assert_eq!(original.to_string_lossy(), deserialized.to_string_lossy());
    }

    #[test]
    fn test_round_trip_mixed_content() {
        // Ordinary text, non-BMP characters, and unpaired surrogates together.
        let mut wtf8 = Wtf8Buf::new();
        wtf8.push_str("Hello 世界 🌍 ");
        wtf8.push(surrogate(0xd83d)); // unpaired high
        wtf8.push_str(" test ");
        wtf8.push(surrogate(0xdca9)); // unpaired low
        let original = Wtf8Atom::from(wtf8);

        let serialized = serde_json::to_string(&original).unwrap();
        let deserialized: Wtf8Atom = serde_json::from_str(&serialized).unwrap();

        assert_eq!(original, deserialized);
    }

    #[test]
    fn test_empty_string() {
        let atom = Wtf8Atom::new("");
        let serialized = serde_json::to_string(&atom).unwrap();
        assert_eq!(serialized, "\"\"");

        let deserialized: Wtf8Atom = serde_json::from_str("\"\"").unwrap();
        assert_eq!(deserialized.as_str(), Some(""));
    }

    #[test]
    fn test_special_characters() {
        // (input text, expected JSON)
        let test_cases = vec![
            ("\"", "\"\\\"\""),
            ("\n\r\t", "\"\\n\\r\\t\""), // serde_json escapes control characters
            ("\\", "\"\\\\\""),
            ("/", "\"/\""),
        ];

        for (input, expected) in test_cases {
            let atom = Wtf8Atom::new(input);
            let serialized = serde_json::to_string(&atom).unwrap();
            assert_eq!(serialized, expected, "Failed for input: {:?}", input);

            let deserialized: Wtf8Atom = serde_json::from_str(&serialized).unwrap();
            assert_eq!(deserialized.as_str(), Some(input));
        }
    }

    #[test]
    fn test_consecutive_surrogates_not_paired() {
        // Two high surrogates in a row cannot form a valid pair, so each one
        // must be encoded and decoded independently.
        let mut wtf8 = Wtf8Buf::new();
        wtf8.push(surrogate(0xd800)); // high surrogate
        wtf8.push(surrogate(0xd800)); // another high surrogate
        let atom = Wtf8Atom::from(wtf8);

        let serialized = serde_json::to_string(&atom).unwrap();
        // Each surrogate becomes `\uD800`, then serde_json doubles the backslash.
        assert_eq!(serialized, "\"\\\\uD800\\\\uD800\"");

        let deserialized: Wtf8Atom = serde_json::from_str(&serialized).unwrap();
        assert_eq!(atom, deserialized);
    }

    #[test]
    fn test_deserialize_incomplete_escape() {
        // Escaped backslash followed by fewer than four hex digits.
        let json = "\"\\\\\\\\u123\"";
        let atom: Wtf8Atom = serde_json::from_str(json).unwrap();
        // serde_json decodes the four backslashes to `\\u123`; the custom
        // decoder then treats `\\u` as the literal text `\u`.
        assert_eq!(atom.as_str(), Some("\\u123"));
    }

    #[test]
    fn test_deserialize_invalid_hex() {
        // Escaped backslash followed by non-hex characters.
        let json = "\"\\\\\\\\uGGGG\"";
        let atom: Wtf8Atom = serde_json::from_str(json).unwrap();
        // serde_json decodes the four backslashes to `\\uGGGG`; the custom
        // decoder then treats `\\u` as the literal text `\u`.
        assert_eq!(atom.as_str(), Some("\\uGGGG"));
    }

    #[test]
    fn test_try_into_atom_valid_utf8() {
        let wtf8_atom = Wtf8Atom::new("Valid UTF-8 string");
        let result = wtf8_atom.try_into_atom();
        assert!(result.is_ok());
        assert_eq!(result.unwrap().as_str(), "Valid UTF-8 string");
    }

    #[test]
    fn test_try_into_atom_invalid_utf8() {
        // An atom holding an unpaired surrogate is not valid UTF-8.
        let mut wtf8 = Wtf8Buf::new();
        wtf8.push(surrogate(0xd800));
        let wtf8_atom = Wtf8Atom::from(wtf8);

        let result = wtf8_atom.try_into_atom();
        assert!(result.is_err());
        // The error carries the original Wtf8Atom back to the caller.
        let err_atom = result.unwrap_err();
        assert_eq!(err_atom.to_string_lossy(), "\u{FFFD}");
    }
}
0 commit comments