Skip to content

Commit 363c753

Browse files
Merge pull request #123 from wp-cli/use_preg_split_in_safe_substr
Use preg_split instead of preg_match with quantifiers in safe_substr.
2 parents 14b8a08 + dcf1f29 commit 363c753

File tree

2 files changed

+62
-36
lines changed

2 files changed

+62
-36
lines changed

lib/cli/cli.php

Lines changed: 22 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,7 @@ function safe_strlen( $str, $encoding = false ) {
181181
if ( ! $encoding ) {
182182
$encoding = mb_detect_encoding( $str, null, true /*strict*/ );
183183
}
184-
$length = mb_strlen( $str, $encoding );
184+
$length = $encoding ? mb_strlen( $str, $encoding ) : mb_strlen( $str ); // mbstring funcs can fail if given `$encoding` arg that evals to false.
185185
if ( 'UTF-8' === $encoding ) {
186186
// Subtract combining characters.
187187
$length -= preg_match_all( get_unicode_regexs( 'm' ), $str, $dummy /*needed for PHP 5.3*/ );
@@ -209,14 +209,22 @@ function safe_substr( $str, $start, $length = false, $is_width = false, $encodin
209209
if ( $length < 0 || ( $is_width && ( null === $length || false === $length ) ) ) {
210210
return false;
211211
}
212-
$have_safe_strlen = false;
213-
// PHP 5.3 substr takes false as full length, PHP > 5.3 takes null - for compat. do `safe_strlen()`.
212+
// Need this for normalization below and other uses.
213+
$safe_strlen = safe_strlen( $str, $encoding );
214+
215+
// Normalize `$length` when not specified - PHP 5.3 substr takes false as full length, PHP > 5.3 takes null.
214216
if ( null === $length || false === $length ) {
215-
$length = safe_strlen( $str, $encoding );
216-
$have_safe_strlen = true;
217+
$length = $safe_strlen;
218+
}
219+
// Normalize `$start` - various methods treat this differently.
220+
if ( $start > $safe_strlen ) {
221+
return '';
222+
}
223+
if ( $start < 0 && -$start > $safe_strlen ) {
224+
$start = 0;
217225
}
218226

219-
// Allow for selective testings - "1" bit set tests grapheme_substr(), "2" preg_match( '/\X/' ), "4" mb_substr(), "8" substr().
227+
// Allow for selective testings - "1" bit set tests grapheme_substr(), "2" preg_split( '/\X/' ), "4" mb_substr(), "8" substr().
220228
$test_safe_substr = getenv( 'PHP_CLI_TOOLS_TEST_SAFE_SUBSTR' );
221229

222230
// Assume UTF-8 if no encoding given - `grapheme_substr()` will return false (not null like `grapheme_strlen()`) if given non-UTF-8 string.
@@ -225,22 +233,12 @@ function safe_substr( $str, $start, $length = false, $is_width = false, $encodin
225233
return $is_width ? _safe_substr_eaw( $try, $length ) : $try;
226234
}
227235
}
228-
// Assume UTF-8 if no encoding given - `preg_match()` will return false if given non-UTF-8 string.
236+
// Assume UTF-8 if no encoding given - `preg_split()` returns a one element array if given non-UTF-8 string (PHP bug) so need to check `preg_last_error()`.
229237
if ( ( ! $encoding || 'UTF-8' === $encoding ) && can_use_pcre_x() ) {
230-
if ( $start < 0 ) {
231-
$start = max( $start + ( $have_safe_strlen ? $length : safe_strlen( $str, $encoding ) ), 0 );
232-
}
233-
if ( $start ) {
234-
if ( preg_match( '/^\X{' . $start . '}(\X{0,' . $length . '})/u', $str, $matches ) ) {
235-
if ( ! $test_safe_substr || ( $test_safe_substr & 2 ) ) {
236-
return $is_width ? _safe_substr_eaw( $matches[1], $length ) : $matches[1];
237-
}
238-
}
239-
} else {
240-
if ( preg_match( '/^\X{0,' . $length . '}/u', $str, $matches ) ) {
241-
if ( ! $test_safe_substr || ( $test_safe_substr & 2 ) ) {
242-
return $is_width ? _safe_substr_eaw( $matches[0], $length ) : $matches[0];
243-
}
238+
if ( false !== ( $try = preg_split( '/(\X)/u', $str, $safe_strlen + 1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY ) ) && ! preg_last_error() ) {
239+
$try = implode( '', array_slice( $try, $start, $length ) );
240+
if ( ! $test_safe_substr || ( $test_safe_substr & 2 ) ) {
241+
return $is_width ? _safe_substr_eaw( $try, $length ) : $try;
244242
}
245243
}
246244
}
@@ -250,7 +248,7 @@ function safe_substr( $str, $start, $length = false, $is_width = false, $encodin
250248
$encoding = mb_detect_encoding( $str, null, true /*strict*/ );
251249
}
252250
// Bug: not adjusting for combining chars.
253-
$try = mb_substr( $str, $start, $length, $encoding );
251+
$try = $encoding ? mb_substr( $str, $start, $length, $encoding ) : mb_substr( $str, $start, $length ); // mbstring funcs can fail if given `$encoding` arg that evals to false.
254252
if ( 'UTF-8' === $encoding && $is_width ) {
255253
$try = _safe_substr_eaw( $try, $length );
256254
}
@@ -275,7 +273,7 @@ function _safe_substr_eaw( $str, $length ) {
275273
// Note that if the length ends in the middle of a double-width char, the char is excluded, not included.
276274

277275
// See if it's all EAW.
278-
if ( preg_match_all( $eaw_regex, $str, $dummy /*needed for PHP 5.3*/ ) === $length ) {
276+
if ( function_exists( 'mb_substr' ) && preg_match_all( $eaw_regex, $str, $dummy /*needed for PHP 5.3*/ ) === $length ) {
279277
// Just halve the length so (rounded down to a minimum of 1).
280278
$str = mb_substr( $str, 0, max( (int) ( $length / 2 ), 1 ), 'UTF-8' );
281279
} else {
@@ -344,7 +342,7 @@ function strwidth( $string, $encoding = false ) {
344342
if ( ! $encoding ) {
345343
$encoding = mb_detect_encoding( $string, null, true /*strict*/ );
346344
}
347-
$width = mb_strwidth( $string, $encoding );
345+
$width = $encoding ? mb_strwidth( $string, $encoding ) : mb_strwidth( $string ); // mbstring funcs can fail if given `$encoding` arg that evals to false.
348346
if ( 'UTF-8' === $encoding ) {
349347
// Subtract combining characters.
350348
$width -= preg_match_all( $m_regex, $string, $dummy /*needed for PHP 5.3*/ );

tests/test-cli.php

Lines changed: 40 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,12 @@ function test_various_substr() {
111111
putenv( 'PHP_CLI_TOOLS_TEST_SAFE_SUBSTR' );
112112

113113
// Latin, kana, Latin, Latin combining, Thai combining, Hangul.
114-
$str = 'lムnöม้p를';
114+
$str = 'lムnöม้p를'; // 18 bytes.
115+
116+
// Large string.
117+
$large_str_str_start = 65536 * 2;
118+
$large_str = str_repeat( 'a', $large_str_str_start ) . $str;
119+
$large_str_len = strlen( $large_str ); // 128K + 18 bytes.
115120

116121
if ( \cli\can_use_icu() ) {
117122
putenv( 'PHP_CLI_TOOLS_TEST_SAFE_SUBSTR=1' ); // Tests grapheme_substr().
@@ -124,6 +129,10 @@ function test_various_substr() {
124129
$this->assertSame( 'lムnöม้p', \cli\safe_substr( $str, 0, 6 ) );
125130
$this->assertSame( 'lムnöม้p를', \cli\safe_substr( $str, 0, 7 ) );
126131
$this->assertSame( 'lムnöม้p를', \cli\safe_substr( $str, 0, 8 ) );
132+
$this->assertSame( '', \cli\safe_substr( $str, 19 ) ); // Start too large.
133+
$this->assertSame( '', \cli\safe_substr( $str, 19, 7 ) ); // Start too large, with length.
134+
$this->assertSame( '', \cli\safe_substr( $str, 8 ) ); // Start same as length.
135+
$this->assertSame( '', \cli\safe_substr( $str, 8, 0 ) ); // Start same as length, with zero length.
127136
$this->assertSame( '', \cli\safe_substr( $str, -1 ) );
128137
$this->assertSame( 'p를', \cli\safe_substr( $str, -2 ) );
129138
$this->assertSame( 'ม้p를', \cli\safe_substr( $str, -3 ) );
@@ -134,11 +143,18 @@ function test_various_substr() {
134143
$this->assertSame( 'ムnöม้p를', \cli\safe_substr( $str, -6 ) );
135144
$this->assertSame( 'lムnöม้p를', \cli\safe_substr( $str, -7 ) );
136145
$this->assertSame( 'lムnö', \cli\safe_substr( $str, -7, 4 ) );
137-
// $this->assertSame( 'lムnöม้p를', \cli\safe_substr( $str, -8 ) ); // grapheme_substr() returns false on this.
146+
$this->assertSame( 'lムnöม้p를', \cli\safe_substr( $str, -8 ) );
147+
$this->assertSame( 'lムnöม้p를', \cli\safe_substr( $str, -9 ) ); // Negative start too large.
148+
149+
$this->assertSame( $large_str, \cli\safe_substr( $large_str, 0 ) );
150+
$this->assertSame( '', \cli\safe_substr( $large_str, $large_str_str_start, 0 ) );
151+
$this->assertSame( 'l', \cli\safe_substr( $large_str, $large_str_str_start, 1 ) );
152+
$this->assertSame( 'lム', \cli\safe_substr( $large_str, $large_str_str_start, 2 ) );
153+
$this->assertSame( 'p를', \cli\safe_substr( $large_str, -2 ) );
138154
}
139155

140156
if ( \cli\can_use_pcre_x() ) {
141-
putenv( 'PHP_CLI_TOOLS_TEST_SAFE_SUBSTR=2' ); // Tests preg_match( '/\X/u' ).
157+
putenv( 'PHP_CLI_TOOLS_TEST_SAFE_SUBSTR=2' ); // Tests preg_split( '/\X/u' ).
142158
$this->assertSame( '', \cli\safe_substr( $str, 0, 0 ) );
143159
$this->assertSame( 'l', \cli\safe_substr( $str, 0, 1 ) );
144160
$this->assertSame( 'lム', \cli\safe_substr( $str, 0, 2 ) );
@@ -148,6 +164,10 @@ function test_various_substr() {
148164
$this->assertSame( 'lムnöม้p', \cli\safe_substr( $str, 0, 6 ) );
149165
$this->assertSame( 'lムnöม้p를', \cli\safe_substr( $str, 0, 7 ) );
150166
$this->assertSame( 'lムnöม้p를', \cli\safe_substr( $str, 0, 8 ) );
167+
$this->assertSame( '', \cli\safe_substr( $str, 19 ) ); // Start too large.
168+
$this->assertSame( '', \cli\safe_substr( $str, 19, 7 ) ); // Start too large, with length.
169+
$this->assertSame( '', \cli\safe_substr( $str, 8 ) ); // Start same as length.
170+
$this->assertSame( '', \cli\safe_substr( $str, 8, 0 ) ); // Start same as length, with zero length.
151171
$this->assertSame( '', \cli\safe_substr( $str, -1 ) );
152172
$this->assertSame( 'p를', \cli\safe_substr( $str, -2 ) );
153173
$this->assertSame( 'ม้p를', \cli\safe_substr( $str, -3 ) );
@@ -159,6 +179,13 @@ function test_various_substr() {
159179
$this->assertSame( 'lムnöม้p를', \cli\safe_substr( $str, -7 ) );
160180
$this->assertSame( 'lムnö', \cli\safe_substr( $str, -7, 4 ) );
161181
$this->assertSame( 'lムnöม้p를', \cli\safe_substr( $str, -8 ) );
182+
$this->assertSame( 'lムnöม้p를', \cli\safe_substr( $str, -9 ) ); // Negative start too large.
183+
184+
$this->assertSame( $large_str, \cli\safe_substr( $large_str, 0 ) );
185+
$this->assertSame( '', \cli\safe_substr( $large_str, $large_str_str_start, 0 ) );
186+
$this->assertSame( 'l', \cli\safe_substr( $large_str, $large_str_str_start, 1 ) );
187+
$this->assertSame( 'lム', \cli\safe_substr( $large_str, $large_str_str_start, 2 ) );
188+
$this->assertSame( 'p를', \cli\safe_substr( $large_str, -2 ) );
162189
}
163190

164191
if ( function_exists( 'mb_substr' ) ) {
@@ -174,8 +201,9 @@ function test_various_substr() {
174201
$this->assertSame( '', \cli\safe_substr( $str, 0, 0 ) );
175202
$this->assertSame( 'l', \cli\safe_substr( $str, 0, 1 ) );
176203
$this->assertSame( "l\xe3", \cli\safe_substr( $str, 0, 2 ) ); // Corrupt.
204+
$this->assertSame( '', \cli\safe_substr( $str, strlen( $str ) + 1 ) ); // Return '' not false to match behavior of other methods when `$start` > strlen.
177205

178-
// Non-UTF-8 - both grapheme_substr() and preg_match( '/\X/u' ) will fail.
206+
// Non-UTF-8 - both grapheme_substr() and preg_split( '/\X/u' ) will fail.
179207

180208
putenv( 'PHP_CLI_TOOLS_TEST_SAFE_SUBSTR' );
181209

@@ -392,10 +420,10 @@ function test_strwidth() {
392420

393421
if ( \cli\can_use_icu() ) {
394422
$this->assertSame( 5, \cli\strwidth( $str ) ); // Tests grapheme_strlen().
395-
putenv( 'PHP_CLI_TOOLS_TEST_STRWIDTH=2' ); // Test preg_match_all( '/\X/u' ).
423+
putenv( 'PHP_CLI_TOOLS_TEST_STRWIDTH=2' ); // Test preg_split( '/\X/u' ).
396424
$this->assertSame( 5, \cli\strwidth( $str ) );
397425
} else {
398-
$this->assertSame( 5, \cli\strwidth( $str ) ); // Tests preg_match_all( '/\X/u' ).
426+
$this->assertSame( 5, \cli\strwidth( $str ) ); // Tests preg_split( '/\X/u' ).
399427
}
400428

401429
if ( function_exists( 'mb_strwidth' ) && function_exists( 'mb_detect_order' ) ) {
@@ -422,10 +450,10 @@ function test_strwidth() {
422450

423451
if ( \cli\can_use_icu() ) {
424452
$this->assertSame( 11, \cli\strwidth( $str ) ); // Tests grapheme_strlen().
425-
putenv( 'PHP_CLI_TOOLS_TEST_STRWIDTH=2' ); // Test preg_match_all( '/\X/u' ).
453+
putenv( 'PHP_CLI_TOOLS_TEST_STRWIDTH=2' ); // Test preg_split( '/\X/u' ).
426454
$this->assertSame( 11, \cli\strwidth( $str ) );
427455
} else {
428-
$this->assertSame( 11, \cli\strwidth( $str ) ); // Tests preg_match_all( '/\X/u' ).
456+
$this->assertSame( 11, \cli\strwidth( $str ) ); // Tests preg_split( '/\X/u' ).
429457
}
430458

431459
if ( function_exists( 'mb_strwidth' ) && function_exists( 'mb_detect_order' ) ) {
@@ -434,7 +462,7 @@ function test_strwidth() {
434462
$this->assertSame( 11, \cli\strwidth( $str ) );
435463
}
436464

437-
// Non-UTF-8 - both grapheme_strlen() and preg_match_all( '/\X/u' ) will fail.
465+
// Non-UTF-8 - both grapheme_strlen() and preg_split( '/\X/u' ) will fail.
438466

439467
putenv( 'PHP_CLI_TOOLS_TEST_STRWIDTH' );
440468

@@ -486,11 +514,11 @@ function test_safe_strlen() {
486514
putenv( 'PHP_CLI_TOOLS_TEST_SAFE_STRLEN' ); // Test grapheme_strlen().
487515
$this->assertSame( 7, \cli\safe_strlen( $str ) );
488516
if ( \cli\can_use_pcre_x() ) {
489-
putenv( 'PHP_CLI_TOOLS_TEST_SAFE_STRLEN=2' ); // Test preg_match_all( '/\X/u' ).
517+
putenv( 'PHP_CLI_TOOLS_TEST_SAFE_STRLEN=2' ); // Test preg_split( '/\X/u' ).
490518
$this->assertSame( 7, \cli\safe_strlen( $str ) );
491519
}
492520
} elseif ( \cli\can_use_pcre_x() ) {
493-
$this->assertSame( 7, \cli\safe_strlen( $str ) ); // Tests preg_match_all( '/\X/u' ).
521+
$this->assertSame( 7, \cli\safe_strlen( $str ) ); // Tests preg_split( '/\X/u' ).
494522
} else {
495523
putenv( 'PHP_CLI_TOOLS_TEST_SAFE_STRLEN=8' ); // Test strlen().
496524
$this->assertSame( 18, \cli\safe_strlen( $str ) ); // strlen() - no. of bytes.
@@ -504,7 +532,7 @@ function test_safe_strlen() {
504532
$this->assertSame( 9, mb_strlen( $str, 'UTF-8' ) ); // mb_strlen() - counts the 2 combining chars.
505533
}
506534

507-
// Non-UTF-8 - both grapheme_strlen() and preg_match_all( '/\X/u' ) will fail.
535+
// Non-UTF-8 - both grapheme_strlen() and preg_split( '/\X/u' ) will fail.
508536

509537
putenv( 'PHP_CLI_TOOLS_TEST_SAFE_STRLEN' );
510538

0 commit comments

Comments
 (0)