regex capture group names must use identifier syntax

khwilliamson · khwilliamson · commit c11897a3cbc5 · 2025-10-07T22:41:57.000-06:00
Prior to this commit the non-first characters could be any \w character.
But identifiers exclude a few \w characters from appearing in them.
This commit tightens what is allowed.

Commit d1e2a852fbc901b45fba20906a8f42ca227ae462 gave a list of them,
but I forgot a couple details in generating that list, so it wasn't
quite right.

The complete corrected list is:
GREEK YPOGEGRAMMENI
COMBINING CYRILLIC HUNDRED THOUSANDS SIGN
COMBINING CYRILLIC MILLIONS SIGN
COMBINING PARENTHESES OVERLAY
COMBINING ENCLOSING CIRCLE
COMBINING ENCLOSING SQUARE
COMBINING ENCLOSING DIAMOND
COMBINING ENCLOSING CIRCLE BACKSLASH
COMBINING ENCLOSING SCREEN
COMBINING ENCLOSING KEYCAP
COMBINING ENCLOSING UPWARD POINTING TRIANGLE
CIRCLED LATIN CAPITAL LETTER A - Z
CIRCLED LATIN SMALL LETTER A - Z
VERTICAL TILDE
COMBINING CYRILLIC TEN MILLIONS SIGN
COMBINING CYRILLIC HUNDRED MILLIONS SIGN
COMBINING CYRILLIC THOUSAND MILLIONS SIGN
ARABIC LIGATURE SHADDA WITH DAMMATAN ISOLATED FORM
ARABIC LIGATURE SHADDA WITH KASRATAN ISOLATED FORM
ARABIC LIGATURE SHADDA WITH FATHA ISOLATED FORM
ARABIC LIGATURE SHADDA WITH DAMMA ISOLATED FORM
ARABIC LIGATURE SHADDA WITH KASRA ISOLATED FORM
ARABIC LIGATURE SHADDA WITH SUPERSCRIPT ALEF ISOLATED FORM
ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM
ARABIC LIGATURE JALLAJALALOUHOU
ARABIC FATHATAN ISOLATED FORM
ARABIC DAMMATAN ISOLATED FORM
ARABIC KASRATAN ISOLATED FORM
ARABIC FATHA ISOLATED FORM
ARABIC DAMMA ISOLATED FORM
ARABIC KASRA ISOLATED FORM
ARABIC SHADDA ISOLATED FORM
ARABIC SUKUN ISOLATED FORM
SQUARED LATIN CAPITAL LETTER A - Z
NEGATIVE CIRCLED LATIN CAPITAL LETTER A - Z
NEGATIVE SQUARED LATIN CAPITAL LETTER A - Z
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
@@ -373,6 +373,41 @@ consisted of only ASCII characters.  The real upper limit was as few as
 Chinese or Osage.  Now an identifier in any language may contain at
 least 255 characters.
 
+=item *
+
+The allowed characters for regular expression capture group names have
+been corrected to conform to Perl identifier syntax, which in turn is
+based on public Unicode rules.  The net result of this change is that,
+as of Unicode 17.0, about 160 characters that formerly were allowed to
+be in a capture group name no longer are.  Only programs that do
+L<C<use utf8>|utf8> can be affected, and then only characters that
+appear in the 2nd or later positions of the name.  The characters that
+an identifier name can begin with are unchanged.
+
+130 of the now unacceptable characters are 5 sets of 26 Latin letters
+that are enclosed by some shape, such as CIRCLED LATIN CAPITAL LETTER N.
+Another 8 are generic modifiers that add shapes around other characters;
+5 are modifiers to Cyrillic numbers; and 16 are Arabic ligatures and
+isolated forms.  The other two are GREEK YPOGEGRAMMENI and VERTICAL
+TILDE.
+
+You can get a complete list of them by running the following program:
+
+ perl -le 'use re qw(Debug COMPILE); qr/(?[ \w - \p{XIDC} ])/'
+
+Look near the final line.  The one that begins C<stclass> contains a
+list of 4- and 5-digit hexadecimal numbers.  These are the Unicode code
+points that were previously allowed, but no longer are.
+
+(Long after Perl identifier rules were formed, Unicode added
+recommendations to further restrict legal identifier names.  These were
+added to counter cases where, for example, programmers snuck code past
+reviewers using characters that look like other ones.  The two
+properties are C<Identifier_Status> and C<Identifier_Type>; see
+L<https://www.unicode.org/reports/tr39/>.  Perl currently doesn't do
+anything with these, except to furnish you the ability to use them in
+regular expressions.)
+
 =back
 
 =head1 Known Problems
diff --git a/regcomp.c b/regcomp.c
@@ -2530,7 +2530,7 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
         do {
             RExC_parse_advance(advance);
         } while (   RExC_parse < RExC_end
-                 && (advance = isWORDCHAR_utf8_safe( (U8 *) RExC_parse,
+                 && (advance = isIDCONT_utf8_safe( (U8 *) RExC_parse,
                                                      (U8 *) RExC_end)));
     } else {
         RExC_parse_inc_by(1); /* so the <- from the vFAIL is after the offending
diff --git a/t/re/pat.t b/t/re/pat.t
@@ -28,7 +28,7 @@ skip_all_without_unicode_tables();
 my $has_locales = locales_enabled('LC_CTYPE');
 my $utf8_locale = find_utf8_ctype_locale();
 
-plan tests => 1296;  # Update this when adding/deleting tests.
+plan tests => 1298;  # Update this when adding/deleting tests.
 
 run_tests() unless caller;
 
@@ -1388,6 +1388,28 @@ EOP
             fresh_perl_like($prog, qr!Group name must start with a non-digit word character!, {},
                         sprintf("'U+%04X not legal IDFirst'", ord($char)));
         }
+
+        foreach my $char (chr(0x2115), chr(0x24B7)) {
+            my $prog = <<"EOP";
+use utf8;;
+no warnings 'utf8';
+print 0 + "abc" =~ qr/(?<a${char}b>abc)/;
+EOP
+            utf8::encode($prog);
+            if ($char =~ /\p{XID_Continue}/) {
+                fresh_perl_is($prog, 1,
+                                {},
+                                sprintf("U+%04X is legal IDCont",
+                                        ord($char)));
+            }
+            else {
+                fresh_perl_like($prog,
+                                qr/Sequence .* not terminated/,
+                                {},
+                                sprintf("U+%04X not legal IDCont",
+                                ord($char)));
+            }
+        }
     }
 
     { # [perl #101710]