@@ -13,6 +13,7 @@ public static void main (String[] args) {
13
13
Levenshtein l = new Levenshtein ();
14
14
15
15
System .out .println (l .distanceAbsolute ("My string" , "My $tring" ));
16
+ System .out .println (l .distanceAbsolute ("My string" , "M string2" ));
16
17
System .out .println (l .distance ("My string" , "My $tring" ));
17
18
System .out .println (l .similarity ("My string" , "My $tring" ));
18
19
}
@@ -24,7 +25,7 @@ public static int Distance(String s1, String s2) {
24
25
25
26
@ Override
26
27
public double distance (String s1 , String s2 ) {
27
- return (( double ) distanceAbsolute (s1 , s2 ) ) / Math .max (s1 .length (), s2 .length ());
28
+ return (double ) distanceAbsolute (s1 , s2 ) / Math .max (s1 .length (), s2 .length ());
28
29
29
30
}
30
31
@@ -35,7 +36,7 @@ public double similarity(String s1, String s2) {
35
36
36
37
/**
37
38
* The Levenshtein distance, or edit distance, between two words is the
38
- * minimum number of single-character edits (i.e. insertions, deletions or
39
+ * minimum number of single-character edits (insertions, deletions or
39
40
* substitutions) required to change one word into the other.
40
41
*
41
42
* http://en.wikipedia.org/wiki/Levenshtein_distance
@@ -45,57 +46,69 @@ public double similarity(String s1, String s2) {
45
46
* It is zero if and only if the strings are equal.
46
47
* If the strings are the same size, the Hamming distance is an upper bound
47
48
* on the Levenshtein distance.
48
- * The Levenshtein distance between two strings is no greater than the sum
49
- * of their Levenshtein distances from a third string (triangle inequality).
49
+ * The Levenshtein distance satisfies the triangle inequality (the distance
50
+ * between two strings is no greater than the sum of their Levenshtein distances from
51
+ * a third string).
52
+ *
53
+ * Implementation uses dynamic programming (Wagner–Fischer algorithm), with
54
+ * only 2 rows of data. The space requirement is thus O(m) and the algorithm
55
+ * runs in O(mn), where n and m are the lengths of s1 and s2.
50
56
*
51
- * @param s0
52
57
* @param s1
58
+ * @param s2
53
59
* @return the minimum number of single-character edits needed to change s1 into s2
54
60
*/
55
- public int distanceAbsolute (String s0 , String s1 ) {
56
- int len0 = s0 . length () + 1 ;
57
- int len1 = s1 . length () + 1 ;
58
-
59
- // the array of distances
60
- int [] cost = new int [ len0 ];
61
- int [] newcost = new int [ len0 ] ;
62
-
63
- // initial cost of skipping prefix in String s0
64
- for ( int i = 0 ; i < len0 ; i ++ ) {
65
- cost [ i ] = i ;
61
+ public int distanceAbsolute (String s1 , String s2 ) {
62
+ if ( s1 . equals ( s2 )){
63
+ return 0 ;
64
+ }
65
+
66
+ if ( s1 . length () == 0 ) {
67
+ return s2 . length () ;
68
+ }
69
+
70
+ if ( s2 . length () == 0 ) {
71
+ return s1 . length () ;
66
72
}
67
73
68
- // dynamicaly computing the array of distances
69
- // transformation cost for each letter in s1
70
- for (int j = 1 ; j < len1 ; j ++) {
71
-
72
- // initial cost of skipping prefix in String s1
73
- newcost [0 ] = j - 1 ;
74
-
75
- // transformation cost for each letter in s0
76
- for (int i = 1 ; i < len0 ; i ++) {
77
-
78
- // matching current letters in both strings
79
- int match = (s0 .charAt (i - 1 ) == s1 .charAt (j - 1 )) ? 0 : 1 ;
74
+ // create two work vectors of integer distances
75
+ int [] v0 = new int [s2 .length () + 1 ];
76
+ int [] v1 = new int [s2 .length () + 1 ];
77
+ int [] vtemp ;
80
78
81
- // computing cost for each transformation
82
- int cost_replace = cost [i - 1 ] + match ;
83
- int cost_insert = cost [i ] + 1 ;
84
- int cost_delete = newcost [i - 1 ] + 1 ;
79
+ // initialize v0 (the previous row of distances)
80
+ // this row is A[0][i]: edit distance for an empty s1
81
+ // the distance is just the number of characters to delete from s2
82
+ for (int i = 0 ; i < v0 .length ; i ++) {
83
+ v0 [i ] = i ;
84
+ }
85
+
86
+ for (int i = 0 ; i < s1 .length (); i ++) {
87
+ // calculate v1 (current row distances) from the previous row v0
88
+ // first element of v1 is A[i+1][0]
89
+ // edit distance is delete (i+1) chars from s1 to match empty s2
90
+ v1 [0 ] = i + 1 ;
85
91
86
- // keep minimum cost
87
- newcost [i ] = Math .min (
88
- Math .min (cost_insert , cost_delete ),
89
- cost_replace );
92
+ // use formula to fill in the rest of the row
93
+ for (int j = 0 ; j < s2 .length (); j ++) {
94
+ int cost = (s1 .charAt (i ) == s2 .charAt (j )) ? 0 : 1 ;
95
+ v1 [j + 1 ] = Math .min (
96
+ v1 [j ] + 1 , // Cost of insertion
97
+ Math .min (
98
+ v0 [j + 1 ] + 1 , // Cost of remove
99
+ v0 [j ] + cost )); // Cost of substitution
90
100
}
91
-
92
- // swap cost/newcost arrays
93
- int [] swap = cost ;
94
- cost = newcost ;
95
- newcost = swap ;
101
+
102
+ // copy v1 (current row) to v0 (previous row) for next iteration
103
+ //System.arraycopy(v1, 0, v0, 0, v0.length);
104
+
105
+ // Flip references to current and previous row
106
+ vtemp = v0 ;
107
+ v0 = v1 ;
108
+ v1 = vtemp ;
109
+
96
110
}
97
111
98
- // the distance is the cost for transforming all letters in both strings
99
- return cost [len0 - 1 ];
112
+ return v0 [s2 .length ()];
100
113
}
101
114
}
0 commit comments