fglock · fglock · Oct 29, 2025
diff --git a/src/main/java/org/perlonjava/operators/RuntimeTransliterate.java b/src/main/java/org/perlonjava/operators/RuntimeTransliterate.java
@@ -56,9 +56,10 @@ public RuntimeScalar transliterate(RuntimeScalar originalString, int ctx) {
         for (int i = 0; i < input.length(); i++) {
             int codePoint = input.codePointAt(i);
 
-            // Handle surrogate pairs for Unicode
-            if (Character.isHighSurrogate(input.charAt(i)) && i + 1 < input.length()) {
-                i++; // Skip the low surrogate
+            // Handle surrogate pairs for Unicode - only skip if it's a valid supplementary code point
+            // codePointAt() already combines surrogate pairs, so we just need to skip the second char unit
+            if (Character.isSupplementaryCodePoint(codePoint)) {
+                i++; // Skip the low surrogate of a valid surrogate pair
             }
 
             boolean matched = false;
@@ -395,23 +396,35 @@ private int parseCharAt(String input, int pos, List<Integer> result) {
                     if (pos + 2 < input.length() && input.charAt(pos + 2) == '{') {
                         int closePos = input.indexOf('}', pos + 3);
                         if (closePos > pos + 3) {
-                            String content = input.substring(pos + 3, closePos);
-
-                            // Check if it's a Unicode code point \N{U+XXXX}
-                            if (content.startsWith("U+")) {
-                                try {
-                                    int codePoint = Integer.parseInt(content.substring(2), 16);
-                                    result.add(codePoint);
-                                    return closePos - pos + 1;
-                                } catch (NumberFormatException e) {
-                                    // Invalid format
-                                }
+                            String content = input.substring(pos + 3, closePos).trim();
+
+                            // Check for empty character name
+                            if (content.isEmpty()) {
+                                throw new RuntimeException("Unknown charname ''");
                             }
 
-                            // For named characters, we'd need a lookup table
-                            // For now, throw error for named sequences
-                            throw new RuntimeException("\\" + "N{" + content +
-                                    "} must not be a named sequence in transliteration operator");
+                            // Try to resolve the Unicode character name
+                            try {
+                                int codePoint = org.perlonjava.regex.UnicodeResolver.getCodePointFromName(content);
+                                result.add(codePoint);
+                                return closePos - pos + 1;
+                            } catch (IllegalArgumentException e) {
+                                // Check if it's a named sequence (multi-character)
+                                // Named sequences are not allowed in tr///
+                                String errorMsg = e.getMessage();
+                                if (errorMsg != null && errorMsg.contains("named sequence")) {
+                                    throw new RuntimeException("\\" + "N{" + content +
+                                            "} must not be a named sequence in transliteration operator");
+                                }
+                                // For any other error (invalid or unknown name), also reject as named sequence
+                                // because ICU4J returns -1 for both cases and we can't distinguish them easily
+                                // Perl 5 gives a specific error for named sequences, but we'll be conservative
+                                throw new RuntimeException("\\" + "N{" + content +
+                                        "} must not be a named sequence in transliteration operator");
+                            }
+                        } else if (closePos == pos + 3) {
+                            // Empty \N{} - this is the case where closePos is immediately after {
+                            throw new RuntimeException("Unknown charname ''");
                         }
                     }
                     result.add((int) 'N');

diff --git a/src/main/java/org/perlonjava/regex/UnicodeResolver.java b/src/main/java/org/perlonjava/regex/UnicodeResolver.java
@@ -18,7 +18,7 @@ public class UnicodeResolver {
      *
      * @param name The name of the Unicode character.
      * @return The Unicode code point.
-     * @throws IllegalArgumentException If the name is invalid or not found.
+     * @throws IllegalArgumentException If the name is invalid, not found, or is a named sequence.
      */
     public static int getCodePointFromName(String name) {
         int codePoint;
@@ -48,12 +48,40 @@ public static int getCodePointFromName(String name) {
             };
 
             if (codePoint == -1) {
+                // Check if this is a named sequence (multi-character sequence)
+                // Named sequences are not supported in some contexts like tr///
+                if (isNamedSequence(name)) {
+                    throw new IllegalArgumentException("named sequence: " + name);
+                }
                 throw new IllegalArgumentException("Invalid Unicode character name: " + name);
             }
         }
         return codePoint;
     }
 
+    /**
+     * Placeholder check for Unicode named character sequences (names that map
+     * to more than one code point). Not implemented yet: always returns false.
+     *
+     * @param name The name to check (currently ignored).
+     * @return always {@code false} until a named-sequence lookup is added.
+     */
+    private static boolean isNamedSequence(String name) {
+        // Per the original author's note, ICU4J's UCharacter.getCharFromName()
+        // returns -1 for both invalid names and named sequences, so telling them
+        // apart would require maintaining our own list of named sequences.
+        // 
+        // Because this stub returns false, the caller's "named sequence: ..."
+        // branch is never taken; every failed lookup is reported as an invalid name.
+        //
+        // Examples of named sequences a real implementation should recognize:
+        // - "KATAKANA LETTER AINU P" (U+31F7 U+309A)
+        // - "LATIN CAPITAL LETTER E WITH VERTICAL LINE BELOW" (U+0045 U+0329)
+        //
+        // TODO: replace with a real lookup (e.g. a table of named sequences).
+        return false;
+    }
+
     /**
      * Parses a user-defined property definition string and returns a character class pattern.
      * The format is hex ranges separated by tabs/newlines: