Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 31 additions & 18 deletions src/main/java/org/perlonjava/operators/RuntimeTransliterate.java
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,10 @@ public RuntimeScalar transliterate(RuntimeScalar originalString, int ctx) {
for (int i = 0; i < input.length(); i++) {
int codePoint = input.codePointAt(i);

// Handle surrogate pairs for Unicode
if (Character.isHighSurrogate(input.charAt(i)) && i + 1 < input.length()) {
i++; // Skip the low surrogate
// Handle surrogate pairs for Unicode - only skip if it's a valid supplementary code point
// codePointAt() already combines surrogate pairs, so we just need to skip the second char unit
if (Character.isSupplementaryCodePoint(codePoint)) {
i++; // Skip the low surrogate of a valid surrogate pair
}

boolean matched = false;
Expand Down Expand Up @@ -395,23 +396,35 @@ private int parseCharAt(String input, int pos, List<Integer> result) {
if (pos + 2 < input.length() && input.charAt(pos + 2) == '{') {
int closePos = input.indexOf('}', pos + 3);
if (closePos > pos + 3) {
String content = input.substring(pos + 3, closePos);

// Check if it's a Unicode code point \N{U+XXXX}
if (content.startsWith("U+")) {
try {
int codePoint = Integer.parseInt(content.substring(2), 16);
result.add(codePoint);
return closePos - pos + 1;
} catch (NumberFormatException e) {
// Invalid format
}
String content = input.substring(pos + 3, closePos).trim();

// Check for empty character name
if (content.isEmpty()) {
throw new RuntimeException("Unknown charname ''");
}

// For named characters, we'd need a lookup table
// For now, throw error for named sequences
throw new RuntimeException("\\" + "N{" + content +
"} must not be a named sequence in transliteration operator");
// Try to resolve the Unicode character name
try {
int codePoint = org.perlonjava.regex.UnicodeResolver.getCodePointFromName(content);
result.add(codePoint);
return closePos - pos + 1;
} catch (IllegalArgumentException e) {
// Check if it's a named sequence (multi-character)
// Named sequences are not allowed in tr///
String errorMsg = e.getMessage();
if (errorMsg != null && errorMsg.contains("named sequence")) {
throw new RuntimeException("\\" + "N{" + content +
"} must not be a named sequence in transliteration operator");
}
// For any other error (invalid or unknown name), also reject as named sequence
// because ICU4J returns -1 for both cases and we can't distinguish them easily
// Perl 5 gives a specific error for named sequences, but we'll be conservative
throw new RuntimeException("\\" + "N{" + content +
"} must not be a named sequence in transliteration operator");
}
} else if (closePos == pos + 3) {
// Empty \N{} - this is the case where closePos is immediately after {
throw new RuntimeException("Unknown charname ''");
}
}
result.add((int) 'N');
Expand Down
30 changes: 29 additions & 1 deletion src/main/java/org/perlonjava/regex/UnicodeResolver.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ public class UnicodeResolver {
*
* @param name The name of the Unicode character.
* @return The Unicode code point.
* @throws IllegalArgumentException If the name is invalid or not found.
* @throws IllegalArgumentException If the name is invalid, not found, or is a named sequence.
*/
public static int getCodePointFromName(String name) {
int codePoint;
Expand Down Expand Up @@ -48,12 +48,40 @@ public static int getCodePointFromName(String name) {
};

if (codePoint == -1) {
// Check if this is a named sequence (multi-character sequence)
// Named sequences are not supported in some contexts like tr///
if (isNamedSequence(name)) {
throw new IllegalArgumentException("named sequence: " + name);
}
throw new IllegalArgumentException("Invalid Unicode character name: " + name);
}
}
return codePoint;
}

/**
* Placeholder check for whether a name refers to a Unicode named character sequence
* (a multi-code-point sequence that has its own Unicode-assigned name).
*
* <p>The current implementation performs no lookup and reports every name as
* <em>not</em> being a named sequence. As a consequence, the
* {@code "named sequence: ..."} branch at the visible call site in
* {@code getCodePointFromName} is not taken, and failed lookups fall through to the
* generic "Invalid Unicode character name" error instead.
*
* @param name The name to check; currently ignored.
* @return always {@code false} in the current implementation.
*/
private static boolean isNamedSequence(String name) {
// Per the original author's note, ICU4J's UCharacter.getCharFromName() returns -1
// both for invalid names and for named sequences, so the two cases cannot be told
// apart without maintaining our own list of named sequences.
//
// This method does not attempt that distinction. Callers that must reject named
// sequences (for example tr///) therefore have to treat any lookup failure
// conservatively themselves rather than rely on this result.
//
// Examples of named sequences a real implementation would need to recognize:
// - "KATAKANA LETTER AINU P" (U+31F7 U+309A)
// - "LATIN CAPITAL LETTER E WITH VERTICAL LINE BELOW" (U+0045 U+0329)
//
// TODO: back this with a table of named sequences (e.g. derived from the Unicode
// NamedSequences.txt data file) if precise error reporting is needed.
return false;
}

/**
* Parses a user-defined property definition string and returns a character class pattern.
* The format is hex ranges separated by tabs/newlines:
Expand Down