Skip to content

Commit caf7ec8

Browse files
committed
Add Unicode character name support and fix bugs in tr/// operator
- Add support for \N{name} syntax with actual Unicode character names using UnicodeResolver integration with ICU4J (~30,000+ names supported)
- Fix empty \N{} validation to give proper 'Unknown charname' error
- Fix surrogate pair handling bug that was incorrectly removing characters: the loop now checks isSupplementaryCodePoint() on the code point instead of isHighSurrogate() on the char, so an unpaired high surrogate no longer causes the following character to be skipped

Test improvements: 256/318 (80.5%) -> 277/318 (87.1%)
- Fixed 21 tests (+6.6%)
- Tests now run to completion (previously died at line 1113)

Examples:
$s =~ tr/\N{LATIN SMALL LETTER E WITH ACUTE}/E/; # now works
$s = "\x{d800}\x{ffff}"; $s =~ tr/\0/A/; # now preserves both chars
1 parent 953e21a commit caf7ec8

File tree

2 files changed

+60
-19
lines changed

2 files changed

+60
-19
lines changed

src/main/java/org/perlonjava/operators/RuntimeTransliterate.java

Lines changed: 31 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -56,9 +56,10 @@ public RuntimeScalar transliterate(RuntimeScalar originalString, int ctx) {
5656
for (int i = 0; i < input.length(); i++) {
5757
int codePoint = input.codePointAt(i);
5858

59-
// Handle surrogate pairs for Unicode
60-
if (Character.isHighSurrogate(input.charAt(i)) && i + 1 < input.length()) {
61-
i++; // Skip the low surrogate
59+
// Handle surrogate pairs for Unicode - only skip if it's a valid supplementary code point
60+
// codePointAt() already combines surrogate pairs, so we just need to skip the second char unit
61+
if (Character.isSupplementaryCodePoint(codePoint)) {
62+
i++; // Skip the low surrogate of a valid surrogate pair
6263
}
6364

6465
boolean matched = false;
@@ -395,23 +396,35 @@ private int parseCharAt(String input, int pos, List<Integer> result) {
395396
if (pos + 2 < input.length() && input.charAt(pos + 2) == '{') {
396397
int closePos = input.indexOf('}', pos + 3);
397398
if (closePos > pos + 3) {
398-
String content = input.substring(pos + 3, closePos);
399-
400-
// Check if it's a Unicode code point \N{U+XXXX}
401-
if (content.startsWith("U+")) {
402-
try {
403-
int codePoint = Integer.parseInt(content.substring(2), 16);
404-
result.add(codePoint);
405-
return closePos - pos + 1;
406-
} catch (NumberFormatException e) {
407-
// Invalid format
408-
}
399+
String content = input.substring(pos + 3, closePos).trim();
400+
401+
// Check for empty character name
402+
if (content.isEmpty()) {
403+
throw new RuntimeException("Unknown charname ''");
409404
}
410405

411-
// For named characters, we'd need a lookup table
412-
// For now, throw error for named sequences
413-
throw new RuntimeException("\\" + "N{" + content +
414-
"} must not be a named sequence in transliteration operator");
406+
// Try to resolve the Unicode character name
407+
try {
408+
int codePoint = org.perlonjava.regex.UnicodeResolver.getCodePointFromName(content);
409+
result.add(codePoint);
410+
return closePos - pos + 1;
411+
} catch (IllegalArgumentException e) {
412+
// Check if it's a named sequence (multi-character)
413+
// Named sequences are not allowed in tr///
414+
String errorMsg = e.getMessage();
415+
if (errorMsg != null && errorMsg.contains("named sequence")) {
416+
throw new RuntimeException("\\" + "N{" + content +
417+
"} must not be a named sequence in transliteration operator");
418+
}
419+
// For any other error (invalid or unknown name), also reject as named sequence
420+
// because ICU4J returns -1 for both cases and we can't distinguish them easily
421+
// Perl 5 gives a specific error for named sequences, but we'll be conservative
422+
throw new RuntimeException("\\" + "N{" + content +
423+
"} must not be a named sequence in transliteration operator");
424+
}
425+
} else if (closePos == pos + 3) {
426+
// Empty \N{} - this is the case where closePos is immediately after {
427+
throw new RuntimeException("Unknown charname ''");
415428
}
416429
}
417430
result.add((int) 'N');

src/main/java/org/perlonjava/regex/UnicodeResolver.java

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ public class UnicodeResolver {
1818
*
1919
* @param name The name of the Unicode character.
2020
* @return The Unicode code point.
21-
* @throws IllegalArgumentException If the name is invalid or not found.
21+
* @throws IllegalArgumentException If the name is invalid, not found, or is a named sequence.
2222
*/
2323
public static int getCodePointFromName(String name) {
2424
int codePoint;
@@ -48,12 +48,40 @@ public static int getCodePointFromName(String name) {
4848
};
4949

5050
if (codePoint == -1) {
51+
// Check if this is a named sequence (multi-character sequence)
52+
// Named sequences are not supported in some contexts like tr///
53+
if (isNamedSequence(name)) {
54+
throw new IllegalArgumentException("named sequence: " + name);
55+
}
5156
throw new IllegalArgumentException("Invalid Unicode character name: " + name);
5257
}
5358
}
5459
return codePoint;
5560
}
5661

62+
/**
63+
* Checks if a given name refers to a Unicode named character sequence.
64+
* Named sequences are multi-character sequences with Unicode-assigned names.
65+
*
66+
* @param name The name to check.
67+
* @return true if it's a named sequence, false otherwise (currently always false; detection is not implemented).
68+
*/
69+
private static boolean isNamedSequence(String name) {
70+
// ICU4J's UCharacter.getCharFromName() returns -1 for both invalid names
71+
// and named sequences. Unfortunately, there's no easy way to distinguish
72+
// between them without maintaining our own list of named sequences.
73+
//
74+
// For now this method always returns false, so it never identifies a named
75+
// sequence; tr/// instead conservatively rejects every failed name lookup.
76+
//
77+
// Common named sequences include things like:
78+
// - "KATAKANA LETTER AINU P" (U+31F7 U+309A)
79+
// - "LATIN CAPITAL LETTER E WITH VERTICAL LINE BELOW" (U+0045 U+0329)
80+
//
81+
// This is left as a placeholder for future enhancement if needed.
82+
return false;
83+
}
84+
5785
/**
5886
* Parses a user-defined property definition string and returns a character class pattern.
5987
* The format is hex ranges separated by tabs/newlines:

0 commit comments

Comments
 (0)