@@ -153,87 +153,148 @@ function menu( $items, $default = null, $title = 'Choose an item' ) {
153153}
154154
/**
 * Attempts an encoding-safe way of getting string length.
 *
 * Strategies are tried in this order, each one only if the previous was unavailable or failed:
 *  1. `grapheme_strlen()` (intl extension, see `can_use_icu()`) - UTF-8 only.
 *  2. `preg_match_all( '/\X/u' )` (see `can_use_pcre_x()`) - UTF-8 only.
 *  3. `mb_strlen()`, subtracting combining characters when the encoding is UTF-8.
 *  4. Plain byte-wise `strlen()`.
 *
 * The environment variable `PHP_CLI_TOOLS_TEST_SAFE_STRLEN` can be set to a bit mask to
 * restrict which strategies are allowed to return (for testing).
 *
 * @param string      $str      The string to check.
 * @param string|bool $encoding Optional. The encoding of the string. Default false.
 * @return int Numeric value that represents the string's length
 */
function safe_strlen( $str, $encoding = false ) {
	// Allow for selective testings - "1" bit set tests grapheme_strlen(), "2" preg_match_all( '/\X/u' ), "4" mb_strlen(), "other" strlen().
	$test_safe_strlen = getenv( 'PHP_CLI_TOOLS_TEST_SAFE_STRLEN' );
	// Cast once so the bit tests below never apply a bitwise operator to a raw (possibly non-numeric) string.
	$test_flags = (int) $test_safe_strlen;

	// Assume UTF-8 if no encoding given.
	$assume_utf8 = ! $encoding || 'UTF-8' === $encoding;

	// `grapheme_strlen()` returns null if given a non-UTF-8 string and is documented as able to return false on failure,
	// so only an actual integer is accepted as a length.
	if ( $assume_utf8 && can_use_icu() ) {
		$length = grapheme_strlen( $str );
		if ( is_int( $length ) && ( ! $test_safe_strlen || ( $test_flags & 1 ) ) ) {
			return $length;
		}
	}

	// `preg_match_all()` will return false if given non-UTF-8 string.
	if ( $assume_utf8 && can_use_pcre_x() ) {
		$length = preg_match_all( '/\X/u', $str, $dummy /*needed for PHP 5.3*/ );
		if ( false !== $length && ( ! $test_safe_strlen || ( $test_flags & 2 ) ) ) {
			return $length;
		}
	}

	// Legacy encodings and old PHPs will reach here.
	if ( function_exists( 'mb_strlen' ) && ( $encoding || function_exists( 'mb_detect_encoding' ) ) ) {
		if ( ! $encoding ) {
			$encoding = mb_detect_encoding( $str, null, true /*strict*/ );
		}
		// Detection returns false when no encoding matches; skip mb_strlen() then rather than pass it an invalid encoding.
		if ( $encoding ) {
			$length = mb_strlen( $str, $encoding );
			if ( is_int( $length ) ) {
				if ( 'UTF-8' === $encoding ) {
					// Subtract combining characters (a failed match yields false, which counts as 0).
					$length -= (int) preg_match_all( get_unicode_regexs( 'm' ), $str, $dummy /*needed for PHP 5.3*/ );
				}
				if ( ! $test_safe_strlen || ( $test_flags & 4 ) ) {
					return $length;
				}
			}
		}
	}

	return strlen( $str );
}
178195
/**
 * Attempts an encoding-safe way of getting a substring.
 *
 * Strategies are tried in this order, each one only if the previous was unavailable or failed:
 *  1. `grapheme_substr()` (intl extension, see `can_use_icu()`) - UTF-8 only.
 *  2. `preg_match()` with `\X` (see `can_use_pcre_x()`) - UTF-8 only.
 *  3. `mb_substr()` (does not adjust for combining characters).
 *  4. Plain byte-wise `substr()`.
 *
 * The environment variable `PHP_CLI_TOOLS_TEST_SAFE_SUBSTR` can be set to a bit mask to
 * restrict which strategies are allowed to return (for testing).
 *
 * @param string        $str      The input string.
 * @param int           $start    The starting position of the substring.
 * @param int|bool|null $length   Optional, unless $is_width is set. Maximum length of the substring. Default false. Negative not supported.
 * @param int|bool      $is_width Optional. If set and encoding is UTF-8, $length (which must be specified) is interpreted as spacing width. Default false.
 * @param string|bool   $encoding Optional. The encoding of the string. Default false.
 * @return bool|string False if given unsupported args, otherwise substring of string specified by start and length parameters
 */
function safe_substr( $str, $start, $length = false, $is_width = false, $encoding = false ) {
	// Negative $length or $is_width and $length not specified not supported.
	if ( $length < 0 || ( $is_width && ( null === $length || false === $length ) ) ) {
		return false;
	}

	$have_safe_strlen = false;
	// PHP 5.3 substr takes false as full length, PHP > 5.3 takes null - for compat. do `safe_strlen()`.
	if ( null === $length || false === $length ) {
		$length           = safe_strlen( $str, $encoding );
		$have_safe_strlen = true;
	}

	// Allow for selective testings - "1" bit set tests grapheme_substr(), "2" preg_match( '/\X/' ), "4" mb_substr(), "8" substr().
	$test_safe_substr = getenv( 'PHP_CLI_TOOLS_TEST_SAFE_SUBSTR' );
	// Cast once so the bit tests below never apply a bitwise operator to a raw (possibly non-numeric) string.
	$test_flags = (int) $test_safe_substr;

	// Assume UTF-8 if no encoding given.
	$assume_utf8 = ! $encoding || 'UTF-8' === $encoding;

	// `grapheme_substr()` will return false (not null like `grapheme_strlen()`) if given non-UTF-8 string.
	if ( $assume_utf8 && can_use_icu() && false !== ( $try = grapheme_substr( $str, $start, $length ) ) ) {
		if ( ! $test_safe_substr || ( $test_flags & 1 ) ) {
			return $is_width ? _safe_substr_eaw( $try, $length ) : $try;
		}
	}

	// `preg_match()` will return false if given non-UTF-8 string.
	if ( $assume_utf8 && can_use_pcre_x() ) {
		if ( $start < 0 ) {
			// Convert a negative offset into one counted from the beginning, clamped at 0.
			$start = max( $start + ( $have_safe_strlen ? $length : safe_strlen( $str, $encoding ) ), 0 );
		}
		// Only integers may be interpolated into the `{n}` quantifiers.
		$start_count  = (int) $start;
		$length_count = (int) $length;

		// PCRE rejects repeat counts above 65535, which would emit a compile warning; skip to the next strategy instead.
		if ( $start_count <= 65535 && $length_count <= 65535 ) {
			if ( $start_count ) {
				// Skip $start grapheme clusters, then capture up to $length of them.
				if ( preg_match( '/^\X{' . $start_count . '}(\X{0,' . $length_count . '})/u', $str, $matches ) ) {
					if ( ! $test_safe_substr || ( $test_flags & 2 ) ) {
						return $is_width ? _safe_substr_eaw( $matches[1], $length ) : $matches[1];
					}
				}
			} else {
				if ( preg_match( '/^\X{0,' . $length_count . '}/u', $str, $matches ) ) {
					if ( ! $test_safe_substr || ( $test_flags & 2 ) ) {
						return $is_width ? _safe_substr_eaw( $matches[0], $length ) : $matches[0];
					}
				}
			}
		}
	}

	// Legacy encodings and old PHPs will reach here.
	if ( function_exists( 'mb_substr' ) && ( $encoding || function_exists( 'mb_detect_encoding' ) ) ) {
		if ( ! $encoding ) {
			$encoding = mb_detect_encoding( $str, null, true /*strict*/ );
		}
		// Detection returns false when no encoding matches; skip mb_substr() then rather than pass it an invalid encoding.
		if ( $encoding ) {
			// Bug: not adjusting for combining chars.
			$try = mb_substr( $str, $start, $length, $encoding );
			if ( 'UTF-8' === $encoding && $is_width ) {
				$try = _safe_substr_eaw( $try, $length );
			}
			if ( ! $test_safe_substr || ( $test_flags & 4 ) ) {
				return $try;
			}
		}
	}

	return substr( $str, $start, $length );
}
263+
/**
 * Internal function used by `safe_substr()` to adjust for East Asian double-width chars.
 *
 * Treats `$length` as a display width: every character matching the East Asian Width regex
 * counts as 2 columns, every other character as 1. If the width ends in the middle of a
 * double-width char, that char is excluded - except that at least one char is always kept.
 *
 * @param string $str    UTF-8 string already cut down to at most `$length` characters.
 * @param int    $length The spacing width to fit the string into.
 * @return string The string, shortened if necessary to fit the width.
 */
function _safe_substr_eaw( $str, $length ) {
	// Set the East Asian Width regex.
	$eaw_regex = get_unicode_regexs( 'eaw' );

	// Nothing to adjust unless there's any East Asian double-width chars.
	if ( ! preg_match( $eaw_regex, $str ) ) {
		return $str;
	}

	// Note that if the length ends in the middle of a double-width char, the char is excluded, not included.

	// See if it's all EAW. This shortcut needs mbstring, which may be absent when the caller got here via the
	// intl or PCRE paths; the per-character walk below does not need it.
	if ( function_exists( 'mb_substr' ) && preg_match_all( $eaw_regex, $str, $dummy /*needed for PHP 5.3*/ ) === $length ) {
		// Just halve the length so (rounded down to a minimum of 1).
		return mb_substr( $str, 0, max( (int) ( $length / 2 ), 1 ), 'UTF-8' );
	}

	// Explode string into an array of UTF-8 chars. Based on core `_mb_substr()` in "wp-includes/compat.php".
	$chars = preg_split( '/([\x00-\x7f\xc2-\xf4][^\x00-\x7f\xc2-\xf4]*)/', $str, $length + 1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY );
	if ( false === $chars ) {
		// Splitting failed; best effort is to hand the string back untouched.
		return $str;
	}

	$cnt   = min( count( $chars ), $length );
	$width = $length;

	// Consume characters until the available width is used up; $keep ends up as the number of chars taken.
	for ( $keep = 0; $keep < $cnt && $width > 0; $keep++ ) {
		$width -= preg_match( $eaw_regex, $chars[ $keep ] ) ? 2 : 1;
	}
	// The last char overshot the width: drop it, rounding down to a minimum of 1.
	if ( $width < 0 && $keep > 1 ) {
		$keep--;
	}

	return join( '', array_slice( $chars, 0, $keep ) );
}
238299
239300/**
@@ -266,18 +327,19 @@ function strwidth( $string, $encoding = false ) {
266327 // Allow for selective testings - "1" bit set tests grapheme_strlen(), "2" preg_match_all( '/\X/u' ), "4" mb_strwidth(), "other" safe_strlen().
267328 $ test_strwidth = getenv ( 'PHP_CLI_TOOLS_TEST_STRWIDTH ' );
268329
269- // Assume UTF-8 - `grapheme_strlen()` will return null if given non-UTF-8 string.
270- if ( function_exists ( ' grapheme_strlen ' ) && null !== ( $ width = grapheme_strlen ( $ string ) ) ) {
330+ // Assume UTF-8 if no encoding given - `grapheme_strlen()` will return null if given non-UTF-8 string.
331+ if ( ( ! $ encoding || ' UTF-8 ' === $ encoding ) && can_use_icu ( ) && null !== ( $ width = grapheme_strlen ( $ string ) ) ) {
271332 if ( ! $ test_strwidth || ( $ test_strwidth & 1 ) ) {
272333 return $ width + preg_match_all ( $ eaw_regex , $ string , $ dummy /*needed for PHP 5.3*/ );
273334 }
274335 }
275- // Assume UTF-8 - `preg_match_all()` will return false if given non-UTF-8 string (or if PCRE UTF-8 mode is unavailable) .
276- if ( false !== ( $ width = preg_match_all ( '/\X/u ' , $ string , $ dummy /*needed for PHP 5.3*/ ) ) ) {
336+ // Assume UTF-8 if no encoding given - `preg_match_all()` will return false if given non-UTF-8 string.
337+ if ( ( ! $ encoding || ' UTF-8 ' === $ encoding ) && can_use_pcre_x () && false !== ( $ width = preg_match_all ( '/\X/u ' , $ string , $ dummy /*needed for PHP 5.3*/ ) ) ) {
277338 if ( ! $ test_strwidth || ( $ test_strwidth & 2 ) ) {
278339 return $ width + preg_match_all ( $ eaw_regex , $ string , $ dummy /*needed for PHP 5.3*/ );
279340 }
280341 }
342+ // Legacy encodings and old PHPs will reach here.
281343 if ( function_exists ( 'mb_strwidth ' ) && ( $ encoding || function_exists ( 'mb_detect_encoding ' ) ) ) {
282344 if ( ! $ encoding ) {
283345 $ encoding = mb_detect_encoding ( $ string , null , true /*strict*/ );
@@ -294,6 +356,40 @@ function strwidth( $string, $encoding = false ) {
294356 return safe_strlen ( $ string , $ encoding );
295357}
296358
/**
 * Reports whether the intl extension's ICU library is recent enough to rely on.
 *
 * Requires ICU 54.1 or later (Unicode 7.0) together with the `grapheme_strlen()` and
 * `grapheme_substr()` functions. The answer is computed once and cached for later calls.
 *
 * @return bool
 */
function can_use_icu() {
	static $can_use_icu = null;

	// Already worked out on an earlier call.
	if ( null !== $can_use_icu ) {
		return $can_use_icu;
	}

	$can_use_icu = false;

	$have_intl = defined( 'INTL_ICU_VERSION' ) && function_exists( 'grapheme_strlen' ) && function_exists( 'grapheme_substr' );
	if ( $have_intl ) {
		// Choosing ICU 54, Unicode 7.0.
		$can_use_icu = version_compare( INTL_ICU_VERSION, '54.1', '>=' );
	}

	return $can_use_icu;
}
374+
/**
 * Reports whether PCRE's Unicode extended grapheme cluster escape '\X' can be used.
 *
 * The answer is computed once and cached for later calls.
 *
 * @return bool
 */
function can_use_pcre_x() {
	static $can_use_pcre_x = null;

	if ( null !== $can_use_pcre_x ) {
		return $can_use_pcre_x;
	}

	// PCRE_VERSION may carry a trailing release date; keep only the leading digits-and-dots part.
	$numeric_len  = strspn( PCRE_VERSION, '0123456789.' );
	$pcre_version = substr( PCRE_VERSION, 0, $numeric_len );

	// '\X' introduced (as Unicode extended grapheme cluster) in PCRE 8.32 - see https://vcs.pcre.org/pcre/code/tags/pcre-8.32/ChangeLog?view=markup line 53.
	// Older versions of PCRE were bundled with PHP <= 5.3.23 & <= 5.4.13.
	if ( version_compare( $pcre_version, '8.32', '<' ) ) {
		$can_use_pcre_x = false;
	} else {
		// Also confirm that a '\X' pattern in UTF-8 mode actually runs.
		$can_use_pcre_x = false !== @preg_match( '/\X/u', '' );
	}

	return $can_use_pcre_x;
}
392+
297393/**
298394 * Get the regexs generated from Unicode data.
299395 *
0 commit comments