Skip to content

Commit c00e0fa

Browse files
committed
Refactor levenshtein()
1 parent 0280b83 commit c00e0fa

8 files changed

+151
-120
lines changed

ext/standard/basic_functions.stub.php

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -985,7 +985,7 @@ function iptcparse(string $iptcblock): array|false {}
985985

986986
/* levenshtein.c */
987987

988-
function levenshtein(string $str1, string $str2, $cost_ins = UNKNOWN, int $cost_rep = UNKNOWN, int $cost_del = UNKNOWN): int {}
988+
function levenshtein(string $str1, string $str2, int $cost_ins = 1, int $cost_rep = 1, int $cost_del = 1): int {}
989989

990990
/* link.c */
991991

ext/standard/basic_functions_arginfo.h

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
/* This is a generated file, edit the .stub.php file instead.
2-
* Stub hash: 9cf2c691c081d9aaefaa9d22337a9e00efb0af77 */
2+
* Stub hash: 05b740207a70a9d272b4c327882fd0b52016a0af */
33

44
ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_set_time_limit, 0, 1, _IS_BOOL, 0)
55
ZEND_ARG_TYPE_INFO(0, seconds, IS_LONG, 0)
@@ -1568,9 +1568,9 @@ ZEND_END_ARG_INFO()
15681568
ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_levenshtein, 0, 2, IS_LONG, 0)
15691569
ZEND_ARG_TYPE_INFO(0, str1, IS_STRING, 0)
15701570
ZEND_ARG_TYPE_INFO(0, str2, IS_STRING, 0)
1571-
ZEND_ARG_INFO(0, cost_ins)
1572-
ZEND_ARG_TYPE_INFO(0, cost_rep, IS_LONG, 0)
1573-
ZEND_ARG_TYPE_INFO(0, cost_del, IS_LONG, 0)
1571+
ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, cost_ins, IS_LONG, 0, "1")
1572+
ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, cost_rep, IS_LONG, 0, "1")
1573+
ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, cost_del, IS_LONG, 0, "1")
15741574
ZEND_END_ARG_INFO()
15751575

15761576
#if defined(HAVE_SYMLINK) || defined(PHP_WIN32)

ext/standard/levenshtein.c

Lines changed: 17 additions & 46 deletions
Original file line number | Diff line number | Diff line change
@@ -37,9 +37,6 @@ static zend_long reference_levdist(const char *s1, size_t l1, const char *s2, si
3737
return l1 * cost_del;
3838
}
3939

40-
if ((l1 > LEVENSHTEIN_MAX_LENGTH) || (l2 > LEVENSHTEIN_MAX_LENGTH)) {
41-
return -1;
42-
}
4340
p1 = safe_emalloc((l2 + 1), sizeof(zend_long), 0);
4441
p2 = safe_emalloc((l2 + 1), sizeof(zend_long), 0);
4542

@@ -74,58 +71,32 @@ static zend_long reference_levdist(const char *s1, size_t l1, const char *s2, si
7471
}
7572
/* }}} */
7673

77-
/* {{{ custom_levdist
78-
*/
79-
static int custom_levdist(char *str1, char *str2, char *callback_name)
80-
{
81-
php_error_docref(NULL, E_WARNING, "The general Levenshtein support is not there yet");
82-
/* not there yet */
83-
84-
return -1;
85-
}
86-
/* }}} */
87-
8874
/* {{{ proto int levenshtein(string str1, string str2[, int cost_ins, int cost_rep, int cost_del])
8975
Calculate Levenshtein distance between two strings */
9076
PHP_FUNCTION(levenshtein)
9177
{
92-
int argc = ZEND_NUM_ARGS();
93-
char *str1, *str2;
94-
char *callback_name;
95-
size_t str1_len, str2_len, callback_len;
96-
zend_long cost_ins, cost_rep, cost_del;
97-
zend_long distance = -1;
98-
99-
switch (argc) {
100-
case 2: /* just two strings: use maximum performance version */
101-
if (zend_parse_parameters(2, "ss", &str1, &str1_len, &str2, &str2_len) == FAILURE) {
102-
RETURN_THROWS();
103-
}
104-
distance = reference_levdist(str1, str1_len, str2, str2_len, 1, 1, 1);
105-
break;
106-
107-
case 5: /* more general version: calc cost by ins/rep/del weights */
108-
if (zend_parse_parameters(5, "sslll", &str1, &str1_len, &str2, &str2_len, &cost_ins, &cost_rep, &cost_del) == FAILURE) {
109-
RETURN_THROWS();
110-
}
111-
distance = reference_levdist(str1, str1_len, str2, str2_len, cost_ins, cost_rep, cost_del);
112-
break;
113-
114-
case 3: /* most general version: calc cost by user-supplied function */
115-
if (zend_parse_parameters(3, "sss", &str1, &str1_len, &str2, &str2_len, &callback_name, &callback_len) == FAILURE) {
116-
RETURN_THROWS();
117-
}
118-
distance = custom_levdist(str1, str2, callback_name);
119-
break;
78+
zend_string *string1, *string2;
79+
zend_long cost_ins = 1;
80+
zend_long cost_rep = 1;
81+
zend_long cost_del = 1;
82+
zend_long distance = 0;
83+
84+
if (zend_parse_parameters(ZEND_NUM_ARGS(), "SS|lll", &string1, &string2, &cost_ins, &cost_rep, &cost_del) == FAILURE) {
85+
RETURN_THROWS();
86+
}
12087

121-
default:
122-
WRONG_PARAM_COUNT;
88+
if (ZSTR_LEN(string1) > LEVENSHTEIN_MAX_LENGTH) {
89+
zend_argument_value_error(1, "must be less than %d characters", LEVENSHTEIN_MAX_LENGTH + 1);
90+
RETURN_THROWS();
12391
}
12492

125-
if (distance < 0 && /* TODO */ ZEND_NUM_ARGS() != 3) {
126-
php_error_docref(NULL, E_WARNING, "Argument string(s) too long");
93+
if (ZSTR_LEN(string2) > LEVENSHTEIN_MAX_LENGTH) {
94+
zend_argument_value_error(2, "must be less than %d characters", LEVENSHTEIN_MAX_LENGTH + 1);
95+
RETURN_THROWS();
12796
}
12897

98+
distance = reference_levdist(ZSTR_VAL(string1), ZSTR_LEN(string1), ZSTR_VAL(string2), ZSTR_LEN(string2), cost_ins, cost_rep, cost_del);
99+
129100
RETURN_LONG(distance);
130101
}
131102
/* }}} */

ext/standard/tests/strings/levenshtein.phpt

Lines changed: 54 additions & 69 deletions
Original file line number | Diff line number | Diff line change
@@ -3,76 +3,61 @@ levenshtein() function test
33
--FILE--
44
<?php
55

6-
function test_me($title,$expect,$text1,$text2,$cost1="",$cost2="",$cost3="") {
7-
8-
if ($cost1=="") {
9-
$result=levenshtein($text1,$text2);
10-
}
11-
elseif ($cost2=="") {
12-
$result=levenshtein($text1,$text2,$cost1);
13-
}
14-
else {
15-
$result=levenshtein($text1,$text2,$cost1,$cost2,$cost3);
16-
}
17-
if($result==$expect) return 0;
18-
19-
echo "$title: result is $result instead of $expect ";
20-
echo "for '$text1'/'$text2' ";
21-
if($cost1) echo "($cost1:$cost2:$cost3)";
22-
echo "\n";
23-
24-
return 1;
25-
}
26-
27-
$n=0;
28-
29-
$n += test_me("equal" , 0, "12345", "12345");
30-
$n += test_me("1st empty" , 3, "", "xzy");
31-
$n += test_me("2nd empty" , 3, "xzy", "");
32-
$n += test_me("both empty" , 0, "", "");
33-
$n += test_me("1 char" , 1, "1", "2");
34-
$n += test_me("2 char swap", 2, "12", "21");
35-
36-
$n += test_me("inexpensive delete", 2, "2121", "11", 2, 1, 1);
37-
$n += test_me("expensive delete" , 10, "2121", "11", 2, 1, 5);
38-
$n += test_me("inexpensive insert", 2, "11", "2121", 1, 1, 1);
39-
$n += test_me("expensive insert" , 10, "11", "2121", 5, 1, 1);
40-
41-
$n += test_me("expensive replace" , 3, "111", "121", 2, 3, 2);
42-
$n += test_me("very expensive replace", 4, "111", "121", 2, 9, 2);
43-
44-
$n += test_me("bug #7368", 2, "13458", "12345");
45-
$n += test_me("bug #7368", 2, "1345", "1234");
46-
47-
$n += test_me("bug #6562", 1, "debugg", "debug");
48-
$n += test_me("bug #6562", 1, "ddebug", "debug");
49-
$n += test_me("bug #6562", 2, "debbbug", "debug");
50-
$n += test_me("bug #6562", 1, "debugging", "debuging");
51-
52-
$n += test_me("bug #16473", 2, "a", "bc");
53-
$n += test_me("bug #16473", 2, "xa", "xbc");
54-
$n += test_me("bug #16473", 2, "xax", "xbcx");
55-
$n += test_me("bug #16473", 2, "ax", "bcx");
56-
57-
$n += test_me("custom", -1, "111", "121", "my_levcode");
58-
$n += test_me("lt maxlength1", 254, "AbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsu", "A");
59-
$n += test_me("gt maxlength1", -1, "AbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuv", "A");
60-
61-
$n += test_me("lt maxlength2", 254, "A", "AbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsu");
62-
$n += test_me("gt maxlength2", -1, "A", "AbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuv");
63-
64-
echo ($n==0)?"all passed\n":"$n failed\n";
65-
66-
var_dump(levenshtein("", "", -1, -1, -1));
67-
var_dump(levenshtein("", "", 10, 10, 10));
6+
echo '--- Equal ---' . \PHP_EOL;
7+
var_dump(levenshtein('12345', '12345'));
8+
9+
echo '--- First string empty ---' . \PHP_EOL;
10+
var_dump(levenshtein('', 'xyz'));
11+
echo '--- Second string empty ---' . \PHP_EOL;
12+
var_dump(levenshtein('xyz', ''));
13+
echo '--- Both empty ---' . \PHP_EOL;
14+
var_dump(levenshtein('', ''));
15+
var_dump(levenshtein('', '', 10, 10, 10));
16+
17+
echo '--- 1 character ---' . \PHP_EOL;
18+
var_dump(levenshtein('1', '2'));
19+
echo '--- 2 character swapped ---' . \PHP_EOL;
20+
var_dump(levenshtein('12', '21'));
21+
22+
echo '--- Inexpensive deletion ---' . \PHP_EOL;
23+
var_dump(levenshtein('2121', '11', 2));
24+
echo '--- Expensive deletion ---' . \PHP_EOL;
25+
var_dump(levenshtein('2121', '11', 2, 1, 5));
26+
27+
echo '--- Inexpensive insertion ---' . \PHP_EOL;
28+
var_dump(levenshtein('11', '2121'));
29+
echo '--- Expensive insertion ---' . \PHP_EOL;
30+
var_dump(levenshtein('11', '2121', 5));
31+
32+
echo '--- Expensive replacement ---' . \PHP_EOL;
33+
var_dump(levenshtein('111', '121', 2, 3, 2));
34+
echo '--- Very expensive replacement ---' . \PHP_EOL;
35+
var_dump(levenshtein('111', '121', 2, 9, 2));
6836

6937
?>
70-
--EXPECTF--
71-
Warning: levenshtein(): The general Levenshtein support is not there yet in %s on line %d
72-
73-
Warning: levenshtein(): Argument string(s) too long in %s on line %d
74-
75-
Warning: levenshtein(): Argument string(s) too long in %s on line %d
76-
all passed
38+
--EXPECT--
39+
--- Equal ---
40+
int(0)
41+
--- First string empty ---
42+
int(3)
43+
--- Second string empty ---
44+
int(3)
45+
--- Both empty ---
7746
int(0)
7847
int(0)
48+
--- 1 character ---
49+
int(1)
50+
--- 2 character swapped ---
51+
int(2)
52+
--- Inexpensive deletion ---
53+
int(2)
54+
--- Expensive deletion ---
55+
int(10)
56+
--- Inexpensive insertion ---
57+
int(2)
58+
--- Expensive insertion ---
59+
int(10)
60+
--- Expensive replacement ---
61+
int(3)
62+
--- Very expensive replacement ---
63+
int(4)
Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,16 @@
1+
--TEST--
2+
levenshtein() bug 16473
3+
--FILE--
4+
<?php
5+
6+
var_dump(levenshtein('a', 'bc'));
7+
var_dump(levenshtein('xa', 'xbc'));
8+
var_dump(levenshtein('xax', 'xbcx'));
9+
var_dump(levenshtein('ax', 'bcx'));
10+
11+
?>
12+
--EXPECT--
13+
int(2)
14+
int(2)
15+
int(2)
16+
int(2)
Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,16 @@
1+
--TEST--
2+
levenshtein() bug 6562
3+
--FILE--
4+
<?php
5+
6+
var_dump(levenshtein('debugg', 'debug'));
7+
var_dump(levenshtein('ddebug', 'debug'));
8+
var_dump(levenshtein('debbbug', 'debug'));
9+
var_dump(levenshtein('debugging', 'debuging'));
10+
11+
?>
12+
--EXPECT--
13+
int(1)
14+
int(1)
15+
int(2)
16+
int(1)
Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,12 @@
1+
--TEST--
2+
levenshtein() bug 7368
3+
--FILE--
4+
<?php
5+
6+
var_dump(levenshtein('13458', '12345'));
7+
var_dump(levenshtein('1345', '1234'));
8+
9+
?>
10+
--EXPECT--
11+
int(2)
12+
int(2)
Lines changed: 31 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,31 @@
1+
--TEST--
2+
levenshtein() error conditions
3+
--FILE--
4+
<?php
5+
6+
echo '--- String 1 ---' . \PHP_EOL;
7+
var_dump(levenshtein('AbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsu', 'A'));
8+
try {
9+
var_dump(levenshtein('AbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuv', 'A'));
10+
} catch (\ValueError $e) {
11+
echo $e->getMessage() . \PHP_EOL;
12+
}
13+
echo '--- String 2 ---' . \PHP_EOL;
14+
var_dump(levenshtein('A', 'AbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsu'));
15+
try {
16+
var_dump(levenshtein('A', 'AbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuv'));
17+
} catch (\ValueError $e) {
18+
echo $e->getMessage() . \PHP_EOL;
19+
}
20+
21+
// TODO ValueError for negative costs?
22+
// var_dump(levenshtein("", "", -1, -1, -1));
23+
24+
?>
25+
--EXPECT--
26+
--- String 1 ---
27+
int(254)
28+
levenshtein(): Argument #1 ($str1) must be less than 256 characters
29+
--- String 2 ---
30+
int(254)
31+
levenshtein(): Argument #2 ($str2) must be less than 256 characters

0 commit comments

Comments
 (0)