diff --git a/internal/libyaml/scanner.go b/internal/libyaml/scanner.go index 1cbcc32f..77a405ae 100644 --- a/internal/libyaml/scanner.go +++ b/internal/libyaml/scanner.go @@ -627,7 +627,11 @@ func isPrintable(b []byte, i int) bool { (b[i] == 0xEE) || (b[i] == 0xEF && // #xE000 <= . <= #xFFFD !(b[i+1] == 0xBB && b[i+2] == 0xBF) && // && . != #xFEFF - !(b[i+1] == 0xBF && (b[i+2] == 0xBE || b[i+2] == 0xBF)))) + !(b[i+1] == 0xBF && (b[i+2] == 0xBE || b[i+2] == 0xBF))) || + // 4-byte UTF-8 range for valid Unicode scalars: U+10000..U+10FFFF. + (b[i] == 0xF0 && b[i+1] >= 0x90 && b[i+1] <= 0xBF) || + (b[i] > 0xF0 && b[i] < 0xF4 && b[i+1] >= 0x80 && b[i+1] <= 0xBF) || + (b[i] == 0xF4 && b[i+1] >= 0x80 && b[i+1] <= 0x8F)) } // Check if the character at the specified position is NUL. diff --git a/internal/libyaml/testdata/constructor.yaml b/internal/libyaml/testdata/constructor.yaml index e9ba3269..6cb375d4 100644 --- a/internal/libyaml/testdata/constructor.yaml +++ b/internal/libyaml/testdata/constructor.yaml @@ -100,3 +100,27 @@ - 42 - -10 - 3.14 + +- scalar-resolution: + name: emojis + yaml: | + - 😀 + - 👍🏽 + - 🇺🇸 + - 🏳️‍🌈 + - 👩🏼‍❤️‍💋‍👨🏾 + want: + - 😀 + - 👍🏽 + - 🇺🇸 + - 🏳️‍🌈 + - 👩🏼‍❤️‍💋‍👨🏾 + +- scalar-resolution: + name: supplementary non-emoji and mixed scalar + yaml: | + - 𐀀 + - A😀𐀀Z + want: + - 𐀀 + - A😀𐀀Z diff --git a/internal/libyaml/testdata/emitter.yaml b/internal/libyaml/testdata/emitter.yaml index f1e463af..aa7286c3 100644 --- a/internal/libyaml/testdata/emitter.yaml +++ b/internal/libyaml/testdata/emitter.yaml @@ -466,6 +466,241 @@ - parent - child +- emit-config: + name: Various characters from Unicode ranges + conf: + unicode: true + data: + - STREAM_START_EVENT: + encoding: UTF8_ENCODING + - DOCUMENT_START_EVENT: + implicit: false + - SEQUENCE_START_EVENT + - SCALAR_EVENT: + value: ASCII + implicit: true + style: PLAIN_SCALAR_STYLE + - SCALAR_EVENT: +
value: é + implicit: true + style: PLAIN_SCALAR_STYLE + - SCALAR_EVENT: + value: © + implicit: true + style: PLAIN_SCALAR_STYLE + - SCALAR_EVENT: + value: € + implicit: true + style: PLAIN_SCALAR_STYLE + - SCALAR_EVENT: + value: 𐀀 + implicit: true + style: PLAIN_SCALAR_STYLE + - SCALAR_EVENT: + value: 漢 + implicit: true + style: PLAIN_SCALAR_STYLE + - SEQUENCE_END_EVENT + - DOCUMENT_END_EVENT + - STREAM_END_EVENT + want: |- + - ASCII + - é + - © + - € + - 𐀀 + - 漢 + +- emit-config: + name: Emojis with Unicode enabled + conf: + unicode: true + data: + - STREAM_START_EVENT: + encoding: UTF8_ENCODING + - DOCUMENT_START_EVENT: + implicit: true + - SEQUENCE_START_EVENT + - SCALAR_EVENT: + value: 😀 + implicit: true + style: PLAIN_SCALAR_STYLE + - SCALAR_EVENT: + value: 👍🏽 + implicit: true + style: PLAIN_SCALAR_STYLE + - SCALAR_EVENT: + value: 🇺🇸 + implicit: true + style: PLAIN_SCALAR_STYLE + - SCALAR_EVENT: + value: 🏳️‍🌈 + implicit: true + style: PLAIN_SCALAR_STYLE + - SCALAR_EVENT: + value: 👩🏼‍❤️‍💋‍👨🏾 + implicit: true + style: PLAIN_SCALAR_STYLE + - SEQUENCE_END_EVENT + - DOCUMENT_END_EVENT + - STREAM_END_EVENT + want: | + - 😀 + - 👍🏽 + - 🇺🇸 + - 🏳️‍🌈 + - 👩🏼‍❤️‍💋‍👨🏾 + +- emit-config: + name: Unicode characters with double quotes and Unicode enabled + conf: + unicode: true + data: + - STREAM_START_EVENT: + encoding: UTF8_ENCODING + - DOCUMENT_START_EVENT: + implicit: false + - SEQUENCE_START_EVENT + - SCALAR_EVENT: + value: ASCII + implicit: true + style: DOUBLE_QUOTED_SCALAR_STYLE + - SCALAR_EVENT: + value: é + implicit: true + style: DOUBLE_QUOTED_SCALAR_STYLE + - SCALAR_EVENT: + value: © + implicit: true + style: DOUBLE_QUOTED_SCALAR_STYLE + - SCALAR_EVENT: + value: € + implicit: true + style: DOUBLE_QUOTED_SCALAR_STYLE + - SCALAR_EVENT: + value: 𐀀 + implicit: true + style: DOUBLE_QUOTED_SCALAR_STYLE + - SCALAR_EVENT: + value: 漢 + implicit: true + style:
DOUBLE_QUOTED_SCALAR_STYLE + - SCALAR_EVENT: + value: 😀 + implicit: true + style: DOUBLE_QUOTED_SCALAR_STYLE + - SCALAR_EVENT: + value: 🏳️‍🌈 + implicit: true + style: DOUBLE_QUOTED_SCALAR_STYLE + - SCALAR_EVENT: + value: 👩🏼‍❤️‍💋‍👨🏾 + implicit: true + style: DOUBLE_QUOTED_SCALAR_STYLE + - SEQUENCE_END_EVENT + - DOCUMENT_END_EVENT + - STREAM_END_EVENT + want: + - "ASCII" + - "é" + - "©" + - "€" + - "𐀀" + - "漢" + - "😀" + - "🏳️‍🌈" + - "👩🏼‍❤️‍💋‍👨🏾" + +- emit-config: + name: Emojis with Unicode disabled + conf: + unicode: false + data: + - STREAM_START_EVENT: + encoding: UTF8_ENCODING + - DOCUMENT_START_EVENT: + implicit: true + - SEQUENCE_START_EVENT + - SCALAR_EVENT: + value: 😀 + implicit: true + style: PLAIN_SCALAR_STYLE + - SCALAR_EVENT: + value: 👍🏽 + implicit: true + style: PLAIN_SCALAR_STYLE + - SCALAR_EVENT: + value: 🇺🇸 + implicit: true + style: PLAIN_SCALAR_STYLE + - SCALAR_EVENT: + value: 🏳️‍🌈 + implicit: true + style: PLAIN_SCALAR_STYLE + - SCALAR_EVENT: + value: 👩🏼‍❤️‍💋‍👨🏾 + implicit: true + style: PLAIN_SCALAR_STYLE + - SEQUENCE_END_EVENT + - DOCUMENT_END_EVENT + - STREAM_END_EVENT + want: | + - ! "\U0001F600" + - ! "\U0001F44D\U0001F3FD" + - ! "\U0001F1FA\U0001F1F8" + - ! "\U0001F3F3\uFE0F\u200D\U0001F308" + - !
"\U0001F469\U0001F3FC\u200D\u2764\uFE0F\u200D\U0001F48B\u200D\U0001F468\U0001F3FE" + +- emit-config: + name: Supplementary non-emoji and mixed scalar with Unicode enabled + conf: + unicode: true + data: + - STREAM_START_EVENT: + encoding: UTF8_ENCODING + - DOCUMENT_START_EVENT: + implicit: true + - SEQUENCE_START_EVENT + - SCALAR_EVENT: + value: 𐀀 + implicit: true + style: PLAIN_SCALAR_STYLE + - SCALAR_EVENT: + value: A😀𐀁Z + implicit: true + style: PLAIN_SCALAR_STYLE + - SEQUENCE_END_EVENT + - DOCUMENT_END_EVENT + - STREAM_END_EVENT + want: + - 𐀀 + - A😀𐀁Z + +- emit-config: + name: Supplementary non-emoji and mixed scalar with Unicode disabled + conf: + unicode: false + data: + - STREAM_START_EVENT: + encoding: UTF8_ENCODING + - DOCUMENT_START_EVENT: + implicit: true + - SEQUENCE_START_EVENT + - SCALAR_EVENT: + value: 𐀀 + implicit: true + style: PLAIN_SCALAR_STYLE + - SCALAR_EVENT: + value: A😀𐀁Z + implicit: true + style: PLAIN_SCALAR_STYLE + - SEQUENCE_END_EVENT + - DOCUMENT_END_EVENT + - STREAM_END_EVENT + want: | + - ! "\U00010000" + - !
"A\U0001F600\U00010001Z" + # Roundtrip test - roundtrip: name: Roundtrip diff --git a/internal/libyaml/testdata/parser.yaml b/internal/libyaml/testdata/parser.yaml index 745e82d0..9967e30d 100644 --- a/internal/libyaml/testdata/parser.yaml +++ b/internal/libyaml/testdata/parser.yaml @@ -376,6 +376,51 @@ - DOCUMENT_END_EVENT - STREAM_END_EVENT +- parse-events-detailed: + name: Emojis sequence + yaml: | + - 😀 + - 👍🏽 + - 🇺🇸 + - 🏳️‍🌈 + - 👩🏼‍❤️‍💋‍👨🏾 + want: + - STREAM_START_EVENT + - DOCUMENT_START_EVENT: + implicit: true + - SEQUENCE_START_EVENT + - SCALAR_EVENT: + value: 😀 + - SCALAR_EVENT: + value: 👍🏽 + - SCALAR_EVENT: + value: 🇺🇸 + - SCALAR_EVENT: + value: 🏳️‍🌈 + - SCALAR_EVENT: + value: 👩🏼‍❤️‍💋‍👨🏾 + - SEQUENCE_END_EVENT + - DOCUMENT_END_EVENT + - STREAM_END_EVENT + +- parse-events-detailed: + name: Supplementary non-emoji and mixed scalar sequence + yaml: | + - 𐀀 + - A😀𐀀Z + want: + - STREAM_START_EVENT + - DOCUMENT_START_EVENT: + implicit: true + - SEQUENCE_START_EVENT + - SCALAR_EVENT: + value: 𐀀 + - SCALAR_EVENT: + value: A😀𐀀Z + - SEQUENCE_END_EVENT + - DOCUMENT_END_EVENT + - STREAM_END_EVENT + # Error tests - parse-error: name: Error state diff --git a/internal/libyaml/testdata/scanner.yaml b/internal/libyaml/testdata/scanner.yaml index bfed1383..835fc24e 100644 --- a/internal/libyaml/testdata/scanner.yaml +++ b/internal/libyaml/testdata/scanner.yaml @@ -686,6 +686,38 @@ data: [0x19] want: false +- char-predicate: + name: Is printable - Unicode Supplementary Multilingual Plane character 𐀀 + func: isPrintable + data: [0xF0, 0x90, 0x80, 0x80] # U+10000 + want: true + +- char-predicate: + name: Is printable - CJK character 漢 + func: isPrintable + data: [0xE6, 0xBC, 0xA2] # U+6F22 + want: true + +- char-predicate: + name: Is printable - emoji 😅 + func: isPrintable + data: [0xF0, 0x9F, 0x98, 0x85] # U+1F605 + want: true + +- char-predicate: + name: Is
printable - emoji with skin tone modifier 👍🏽 + func: isPrintable + index: 0 # Only check the first code point (👍) for printability + data: [0xF0, 0x9F, 0x91, 0x8D, 0xF0, 0x9F, 0x8F, 0xBD] # U+1F44D U+1F3FD + want: true + +- char-predicate: + name: Is printable - emoji with skin tone modifier 👍🏽 + func: isPrintable + index: 4 # Check the second code point (skin tone modifier) for printability + data: [0xF0, 0x9F, 0x91, 0x8D, 0xF0, 0x9F, 0x8F, 0xBD] + want: true + # isZeroChar tests - char-predicate: name: Is zero char - null @@ -957,7 +989,7 @@ # width tests - char-convert: - name: Width - 0x00 + name: Width - null character func: width data: [0x00] want: 1 @@ -1010,6 +1042,42 @@ data: [0xF8] want: 0 +- char-convert: + name: Width - ASCII - letter A + func: width + data: [0x41] # 'A' + want: 1 + +- char-convert: + name: Width - 2-byte char - U+00A9 COPYRIGHT SIGN + func: width + data: [0xC2, 0xA9] + want: 2 + +- char-convert: + name: Width - 3-byte char - U+20AC EURO SIGN + func: width + data: [0xE2, 0x82, 0xAC] + want: 3 + +- char-convert: + name: Width - 3-byte char - U+6F22 CJK character 漢 + func: width + data: [0xE6, 0xBC, 0xA2] # U+6F22 CJK character 漢 + want: 3 + +- char-convert: + name: Width - emoji 👍 + func: width + data: [0xF0, 0x9F, 0x91, 0x8D] + want: 4 + +- char-convert: + name: Width - emoji with skin tone modifier 👍🏽 + func: width + data: [0xF0, 0x9F, 0x91, 0x8D, 0xF0, 0x9F, 0x8F, 0xBD] + want: 4 + # isTagURIChar tests - char-predicate: name: Is tag URI char - alphanumeric (non-verbatim)