2626 *
2727 * TODO(jat): many of the classification methods implemented here are not
2828 * correct in that they only handle ASCII characters, and many other methods
29- * are not currently implemented. I think the proper approach is to introduce * a deferred binding parameter which substitutes an implementation using
29+ * are not currently implemented. I think the proper approach is to introduce
30+ * a deferred binding parameter which substitutes an implementation using
3031 * a fully-correct Unicode character database, at the expense of additional
3132 * data being downloaded. That way developers that need the functionality
3233 * can get it without those who don't need it paying for it.
3334 *
3435 * <pre>
3536 * The following methods are still not implemented -- most would require Unicode
3637 * character db to be useful:
37- * - digit / is* / to*(int codePoint)
38- * - isDefined(char)
38+ * - digit(int codePoint)
3939 * - isIdentifierIgnorable(char)
4040 * - isJavaIdentifierPart(char)
4141 * - isJavaIdentifierStart(char)
4242 * - isJavaLetter(char) -- deprecated, so probably not
4343 * - isJavaLetterOrDigit(char) -- deprecated, so probably not
44- * - isISOControl(char)
45- * - isMirrored(char)
46- * - isSpaceChar(char)
4744 * - isUnicodeIdentifierPart(char)
4845 * - isUnicodeIdentifierStart(char)
4946 * - getDirectionality(*)
5552 *
5653 * The following do not properly handle characters outside of ASCII:
5754 * - digit(char c, int radix)
58- * - isDigit(char c)
59- * - isLetter(char c)
60- * - isLetterOrDigit(char c)
6155 * - isLowerCase(char c)
6256 * - isUpperCase(char c)
6357 * </pre>
@@ -72,11 +66,11 @@ static class CharSequenceAdapter implements CharSequence {
7266 private int start ;
7367 private int end ;
7468
75- public CharSequenceAdapter (char [] charArray ) {
69+ CharSequenceAdapter (char [] charArray ) {
7670 this (charArray , 0 , charArray .length );
7771 }
7872
79- public CharSequenceAdapter (char [] charArray , int start , int end ) {
73+ CharSequenceAdapter (char [] charArray , int start , int end ) {
8074 this .charArray = charArray ;
8175 this .start = start ;
8276 this .end = end ;
@@ -234,57 +228,136 @@ public static boolean isBmpCodePoint(int codePoint) {
234228 return codePoint >= MIN_VALUE && codePoint <= MAX_VALUE ;
235229 }
236230
231+ private static NativeRegExp definedRegex ;
232+
233+ public static boolean isDefined (char c ) {
234+ return isDefined (String .valueOf (c ));
235+ }
236+
237+ public static boolean isDefined (int codePoint ) {
238+ return isValidCodePoint (codePoint )
239+ && isDefined (String .NativeString .fromCodePoint (codePoint ));
240+ }
241+
242+ private static boolean isDefined (String str ) {
243+ if (definedRegex == null ) {
244+ definedRegex = new NativeRegExp ("\\ P{Cn}" , "u" );
245+ }
246+ return definedRegex .test (str );
247+ }
248+
237249 private static NativeRegExp digitRegex ;
238250
239- /*
240- * TODO: correct Unicode handling.
241- */
242251 public static boolean isDigit (char c ) {
252+ return isDigit (String .valueOf (c ));
253+ }
254+
255+ // Known differences between Java 17 and Chrome 135
256+ // 11f50 .. 11f59, 16ac0 .. 16ac9, 1e4f0 .. 1e4f9, 1fbf0 .. 1fbf9
257+ public static boolean isDigit (int codePoint ) {
258+ return isValidCodePoint (codePoint ) && isDigit (String .NativeString .fromCodePoint (codePoint ));
259+ }
260+
261+ private static boolean isDigit (String str ) {
243262 if (digitRegex == null ) {
244- digitRegex = new NativeRegExp ("\\ d " );
263+ digitRegex = new NativeRegExp ("\\ p{Nd}" , "u " );
245264 }
246- return digitRegex .test (String .valueOf (c ));
265+ return digitRegex .test (String .valueOf (str ));
247266 }
248267
249268 public static boolean isHighSurrogate (char ch ) {
250269 return ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE ;
251270 }
252271
272+ private static NativeRegExp ideographicRegex ;
273+
274+ public static boolean isIdeographic (int codePoint ) {
275+ return isValidCodePoint (codePoint )
276+ && isIdeographic (String .NativeString .fromCodePoint (codePoint ));
277+ }
278+
279+ private static boolean isIdeographic (String str ) {
280+ if (ideographicRegex == null ) {
281+ ideographicRegex = new NativeRegExp ("\\ p{Ideographic}" , "u" );
282+ }
283+ return ideographicRegex .test (str );
284+ }
285+
253286 private static NativeRegExp leterRegex ;
254287
255- /*
256- * TODO: correct Unicode handling.
257- */
258288 public static boolean isLetter (char c ) {
289+ return isLetter (String .valueOf (c ));
290+ }
291+
292+ public static boolean isLetter (int codePoint ) {
293+ return isValidCodePoint (codePoint )
294+ && isLetter (String .NativeString .fromCodePoint (codePoint ));
295+ }
296+
297+ public static boolean isLetter (String str ) {
259298 if (leterRegex == null ) {
260- leterRegex = new NativeRegExp ("[A-Z] " , "i " );
299+ leterRegex = new NativeRegExp ("\\ p{L} " , "u " );
261300 }
262- return leterRegex .test (String . valueOf ( c ) );
301+ return leterRegex .test (str );
263302 }
264303
265304 private static NativeRegExp isLeterOrDigitRegex ;
266305
267- /*
268- * TODO: correct Unicode handling.
269- */
270306 public static boolean isLetterOrDigit (char c ) {
271307 if (isLeterOrDigitRegex == null ) {
272- isLeterOrDigitRegex = new NativeRegExp ("[A-Z \\ d ]" , "i " );
308+ isLeterOrDigitRegex = new NativeRegExp ("[\\ p{Nd} \\ p{L} ]" , "u " );
273309 }
274310 return isLeterOrDigitRegex .test (String .valueOf (c ));
275311 }
276312
277- /*
278- * TODO: correct Unicode handling.
279- */
313+ private static NativeRegExp lowerCaseRegex ;
314+
280315 public static boolean isLowerCase (char c ) {
281- return toLowerCase (c ) == c && isLetter (c );
316+ return isLowerCase (String .valueOf (c ));
317+ }
318+
319+ public static boolean isLowerCase (int codePoint ) {
320+ return isValidCodePoint (codePoint )
321+ && isLowerCase (String .NativeString .fromCodePoint (codePoint ));
322+ }
323+
324+ private static boolean isLowerCase (String str ) {
325+ if (lowerCaseRegex == null ) {
326+ lowerCaseRegex = new NativeRegExp ("\\ p{Lowercase}" , "u" );
327+ }
328+ return lowerCaseRegex .test (str );
282329 }
283330
284331 public static boolean isLowSurrogate (char ch ) {
285332 return ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE ;
286333 }
287334
335+ private static NativeRegExp mirroredRegex ;
336+
337+ public static boolean isMirrored (char c ) {
338+ return isMirrored (String .valueOf (c ));
339+ }
340+
341+ public static boolean isMirrored (int codePoint ) {
342+ return isValidCodePoint (codePoint )
343+ && isMirrored (String .NativeString .fromCodePoint (codePoint ));
344+ }
345+
346+ private static boolean isMirrored (String str ) {
347+ if (mirroredRegex == null ) {
348+ mirroredRegex = new NativeRegExp ("\\ p{Bidi_Mirrored}" , "u" );
349+ }
350+ return mirroredRegex .test (str );
351+ }
352+
353+ public static boolean isISOControl (char ch ) {
354+ return ch <= '\u001F' || (ch >= '\u007F' && ch <= '\u009F' );
355+ }
356+
357+ public static boolean isISOControl (int codePoint ) {
358+ return codePoint <= '\u001F' || (codePoint >= '\u007F' && codePoint <= '\u009F' );
359+ }
360+
288361 /**
289362 * Deprecated - see isWhitespace(char).
290363 */
@@ -306,12 +379,35 @@ public static boolean isSpace(char c) {
306379 }
307380 }
308381
382+ private static NativeRegExp spaceRegex ;
383+
384+ public static boolean isSpaceChar (char c ) {
385+ return isSpaceChar (String .valueOf (c ));
386+ }
387+
388+ public static boolean isSpaceChar (int codePoint ) {
389+ return isValidCodePoint (codePoint )
390+ && isSpaceChar (String .NativeString .fromCodePoint (codePoint ));
391+ }
392+
393+ private static boolean isSpaceChar (String str ) {
394+ if (spaceRegex == null ) {
395+ spaceRegex = new NativeRegExp ("\\ p{Z}" , "u" );
396+ }
397+ return spaceRegex .test (str );
398+ }
399+
400+ public static boolean isSurrogate (char ch ) {
401+ return ch >= MIN_SURROGATE && ch <= MAX_SURROGATE ;
402+ }
403+
309404 public static boolean isWhitespace (char ch ) {
310405 return isWhitespace (String .valueOf (ch ));
311406 }
312407
313408 public static boolean isWhitespace (int codePoint ) {
314- return isWhitespace (String .fromCodePoint (codePoint ));
409+ return isValidCodePoint (codePoint )
410+ && isWhitespace (String .NativeString .fromCodePoint (codePoint ));
315411 }
316412
317413 private static NativeRegExp whitespaceRegex ;
@@ -339,14 +435,31 @@ public static boolean isSurrogatePair(char highSurrogate, char lowSurrogate) {
339435
340436 public static boolean isTitleCase (char c ) {
341437 // https://www.compart.com/en/unicode/category/Lt
342- return c != toUpperCase (c ) && c != toLowerCase (c );
438+ // here we should use the semantic of String.toUpperCase
439+ return c != String .valueOf (c ).toUpperCase ().charAt (0 ) && c != toLowerCase (c );
343440 }
344441
345- /*
346- * TODO: correct Unicode handling.
347- */
442+ public static boolean isTitleCase (int codePoint ) {
443+ // as of Unicode 16 there are no title-case chars beyond 0xffff
444+ return codePoint > 0 && codePoint < 0xffff && isTitleCase ((char ) codePoint );
445+ }
446+
447+ private static NativeRegExp upperCaseRegex ;
448+
348449 public static boolean isUpperCase (char c ) {
349- return toUpperCase (c ) == c && isLetter (c );
450+ return isUpperCase (String .valueOf (c ));
451+ }
452+
453+ public static boolean isUpperCase (int codePoint ) {
454+ return isValidCodePoint (codePoint )
455+ && isUpperCase (String .NativeString .fromCodePoint (codePoint ));
456+ }
457+
458+ private static boolean isUpperCase (String c ) {
459+ if (upperCaseRegex == null ) {
460+ upperCaseRegex = new NativeRegExp ("\\ p{Uppercase}" , "u" );
461+ }
462+ return upperCaseRegex .test (c );
350463 }
351464
352465 public static boolean isValidCodePoint (int codePoint ) {
@@ -390,8 +503,8 @@ public static char[] toChars(int codePoint) {
390503
391504 if (codePoint >= MIN_SUPPLEMENTARY_CODE_POINT ) {
392505 return new char [] {
393- getHighSurrogate (codePoint ),
394- getLowSurrogate (codePoint ),
506+ highSurrogate (codePoint ),
507+ lowSurrogate (codePoint ),
395508 };
396509 } else {
397510 return new char [] {
@@ -404,8 +517,8 @@ public static int toChars(int codePoint, char[] dst, int dstIndex) {
404517 checkCriticalArgument (codePoint >= 0 && codePoint <= MAX_CODE_POINT );
405518
406519 if (codePoint >= MIN_SUPPLEMENTARY_CODE_POINT ) {
407- dst [dstIndex ++] = getHighSurrogate (codePoint );
408- dst [dstIndex ] = getLowSurrogate (codePoint );
520+ dst [dstIndex ++] = highSurrogate (codePoint );
521+ dst [dstIndex ] = lowSurrogate (codePoint );
409522 return 2 ;
410523 } else {
411524 dst [dstIndex ] = (char ) codePoint ;
@@ -426,14 +539,36 @@ public static char toLowerCase(char c) {
426539 return CaseMapper .charToLowerCase (c );
427540 }
428541
542+ public static int toLowerCase (int codePoint ) {
543+ if (codePoint > MAX_CODE_POINT ) {
544+ return codePoint ;
545+ }
546+ return CaseMapper .intToLowerCase (codePoint );
547+ }
548+
429549 public static String toString (char x ) {
430550 return String .valueOf (x );
431551 }
432552
553+ public static String toString (int codePoint ) {
554+ if (isValidCodePoint (codePoint )) {
555+ return String .NativeString .fromCodePoint (codePoint );
556+ } else {
557+ throw new IllegalArgumentException ("Invalid code point: " + codePoint );
558+ }
559+ }
560+
433561 public static char toUpperCase (char c ) {
434562 return CaseMapper .charToUpperCase (c );
435563 }
436564
565+ public static int toUpperCase (int codePoint ) {
566+ if (!isValidCodePoint (codePoint )) {
567+ return codePoint ;
568+ }
569+ return CaseMapper .intToUpperCase (codePoint );
570+ }
571+
437572 public static Character valueOf (char c ) {
438573 if (c < 128 ) {
439574 return BoxedValues .get (c );
@@ -473,26 +608,26 @@ static char forDigit(int digit) {
473608
474609 /**
475610 * Computes the high surrogate character of the UTF16 representation of a
476- * non-BMP code point. See {@link getLowSurrogate }.
611+ * non-BMP code point. See {@link #lowSurrogate }.
477612 *
478613 * @param codePoint requested codePoint, required to be >=
479614 * MIN_SUPPLEMENTARY_CODE_POINT
480615 * @return high surrogate character
481616 */
482- static char getHighSurrogate (int codePoint ) {
617+ public static char highSurrogate (int codePoint ) {
483618 return (char ) (MIN_HIGH_SURROGATE
484619 + (((codePoint - MIN_SUPPLEMENTARY_CODE_POINT ) >> 10 ) & 1023 ));
485620 }
486621
487622 /**
488623 * Computes the low surrogate character of the UTF16 representation of a
489- * non-BMP code point. See {@link getHighSurrogate }.
624+ * non-BMP code point. See {@link #highSurrogate }.
490625 *
491626 * @param codePoint requested codePoint, required to be >=
492627 * MIN_SUPPLEMENTARY_CODE_POINT
493628 * @return low surrogate character
494629 */
495- static char getLowSurrogate (int codePoint ) {
630+ public static char lowSurrogate (int codePoint ) {
496631 return (char ) (MIN_LOW_SURROGATE + ((codePoint - MIN_SUPPLEMENTARY_CODE_POINT ) & 1023 ));
497632 }
498633
0 commit comments