From 0fa5e25a2145fe170d6645b7533b4101d1b63ba6 Mon Sep 17 00:00:00 2001
From: "Flavio S. Glock" <flavio.glock@booking.com>
Date: Wed, 29 Oct 2025 15:56:15 +0100
Subject: [PATCH] Add Unicode character name support and fix bugs in tr///
 operator

- Add support for \N{name} syntax with actual Unicode character names
  using UnicodeResolver integration with ICU4J (~30,000+ names supported)
- Fix empty \N{} validation to give proper 'Unknown charname' error
- Fix surrogate pair handling bug that was incorrectly removing characters
  by checking isSupplementaryCodePoint() instead of isHighSurrogate()

Test improvements: 256/318 (80.5%) -> 277/318 (87.1%)
- Fixed 21 tests (+6.6%)
- Tests now run to completion (previously died at line 1113)

Examples:
  $s =~ tr/\N{LATIN SMALL LETTER E WITH ACUTE}/E/;  # now works
  $s = "\x{d800}\x{ffff}"; $s =~ tr/\0/A/;  # now preserves both chars
---
 .../operators/RuntimeTransliterate.java       | 49 ++++++++++++-------
 .../org/perlonjava/regex/UnicodeResolver.java | 30 +++++++++++-
 2 files changed, 60 insertions(+), 19 deletions(-)
diff --git a/src/main/java/org/perlonjava/operators/RuntimeTransliterate.java b/src/main/java/org/perlonjava/operators/RuntimeTransliterate.java
index 857f135d6..7bebfd82a 100644
--- a/src/main/java/org/perlonjava/operators/RuntimeTransliterate.java
+++ b/src/main/java/org/perlonjava/operators/RuntimeTransliterate.java
@@ -56,9 +56,10 @@ public RuntimeScalar transliterate(RuntimeScalar originalString, int ctx) {
         for (int i = 0; i < input.length(); i++) {
             int codePoint = input.codePointAt(i);
 
-            // Handle surrogate pairs for Unicode
-            if (Character.isHighSurrogate(input.charAt(i)) && i + 1 < input.length()) {
-                i++; // Skip the low surrogate
+            // Handle surrogate pairs for Unicode - only skip if it's a valid supplementary code point
+            // codePointAt() already combines surrogate pairs, so we just need to skip the second char unit
+            if (Character.isSupplementaryCodePoint(codePoint)) {
+                i++; // Skip the low surrogate of a valid surrogate pair
             }
 
             boolean matched = false;
@@ -395,23 +396,35 @@ private int parseCharAt(String input, int pos, List<Integer> result) {
                     if (pos + 2 < input.length() && input.charAt(pos + 2) == '{') {
                         int closePos = input.indexOf('}', pos + 3);
                         if (closePos > pos + 3) {
-                            String content = input.substring(pos + 3, closePos);
-
-                            // Check if it's a Unicode code point \N{U+XXXX}
-                            if (content.startsWith("U+")) {
-                                try {
-                                    int codePoint = Integer.parseInt(content.substring(2), 16);
-                                    result.add(codePoint);
-                                    return closePos - pos + 1;
-                                } catch (NumberFormatException e) {
-                                    // Invalid format
-                                }
+                            String content = input.substring(pos + 3, closePos).trim();
+                            
+                            // Check for empty character name
+                            if (content.isEmpty()) {
+                                throw new RuntimeException("Unknown charname ''");
                             }
 
-                            // For named characters, we'd need a lookup table
-                            // For now, throw error for named sequences
-                            throw new RuntimeException("\\" + "N{" + content +
-                                    "} must not be a named sequence in transliteration operator");
+                            // Try to resolve the Unicode character name
+                            try {
+                                int codePoint = org.perlonjava.regex.UnicodeResolver.getCodePointFromName(content);
+                                result.add(codePoint);
+                                return closePos - pos + 1;
+                            } catch (IllegalArgumentException e) {
+                                // Check if it's a named sequence (multi-character)
+                                // Named sequences are not allowed in tr///
+                                String errorMsg = e.getMessage();
+                                if (errorMsg != null && errorMsg.contains("named sequence")) {
+                                    throw new RuntimeException("\\" + "N{" + content +
+                                            "} must not be a named sequence in transliteration operator");
+                                }
+                                // For any other error (invalid or unknown name), also reject as named sequence
+                                // because ICU4J returns -1 for both cases and we can't distinguish them easily
+                                // Perl 5 gives a specific error for named sequences, but we'll be conservative
+                                throw new RuntimeException("\\" + "N{" + content +
+                                        "} must not be a named sequence in transliteration operator");
+                            }
+                        } else if (closePos == pos + 3) {
+                            // Empty \N{} - this is the case where closePos is immediately after {
+                            throw new RuntimeException("Unknown charname ''");
                         }
                     }
                     result.add((int) 'N');
diff --git a/src/main/java/org/perlonjava/regex/UnicodeResolver.java b/src/main/java/org/perlonjava/regex/UnicodeResolver.java
index 53d9b2480..3357c9278 100644
--- a/src/main/java/org/perlonjava/regex/UnicodeResolver.java
+++ b/src/main/java/org/perlonjava/regex/UnicodeResolver.java
@@ -18,7 +18,7 @@ public class UnicodeResolver {
      *
      * @param name The name of the Unicode character.
      * @return The Unicode code point.
-     * @throws IllegalArgumentException If the name is invalid or not found.
+     * @throws IllegalArgumentException If the name is invalid, not found, or is a named sequence.
      */
     public static int getCodePointFromName(String name) {
         int codePoint;
@@ -48,12 +48,40 @@ public static int getCodePointFromName(String name) {
             };
             
             if (codePoint == -1) {
+                // Check if this is a named sequence (multi-character sequence)
+                // Named sequences are not supported in some contexts like tr///
+                if (isNamedSequence(name)) {
+                    throw new IllegalArgumentException("named sequence: " + name);
+                }
                 throw new IllegalArgumentException("Invalid Unicode character name: " + name);
             }
         }
         return codePoint;
     }
 
+    /**
+     * Checks if a given name refers to a Unicode named character sequence.
+     * Named sequences are multi-character sequences with Unicode-assigned names.
+     *
+     * @param name The name to check.
+     * @return true if it's a named sequence, false otherwise.
+     */
+    private static boolean isNamedSequence(String name) {
+        // ICU4J's UCharacter.getCharFromName() returns -1 for both invalid names
+        // and named sequences. Unfortunately, there's no easy way to distinguish
+        // between them without maintaining our own list of named sequences.
+        // 
+        // For now, we conservatively treat all failures as potential named sequences
+        // in the context of tr///, which is the safest approach.
+        //
+        // Common named sequences include things like:
+        // - "KATAKANA LETTER AINU P" (U+31F7 U+309A)
+        // - "LATIN CAPITAL LETTER E WITH VERTICAL LINE BELOW" (U+0045 U+0329)
+        //
+        // This is left as a placeholder for future enhancement if needed.
+        return false;
+    }
+
     /**
      * Parses a user-defined property definition string and returns a character class pattern.
      * The format is hex ranges separated by tabs/newlines: