Skip to content

Commit c61fa9d

Browse files
committed
Weighted Levenshtein + refactored Levenshtein
1 parent 9f1da60 commit c61fa9d

File tree

4 files changed

+239
-43
lines changed

4 files changed

+239
-43
lines changed

README.md

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,33 @@ public class MyApp {
4343
}
4444
```
4545

46+
## Weighted Levenshtein
47+
An implementation of Levenshtein that allows different weights to be defined for different character substitutions.
48+
49+
```java
50+
import info.debatty.java.stringsimilarity.*;
51+
52+
public class MyApp {
53+
54+
public static void main(String[] args) {
55+
WeightedLevenshtein wl = new WeightedLevenshtein(
56+
57+
new CharacterSubstitutionInterface() {
58+
public double cost(char c1, char c2) {
59+
60+
// t and r are next to each other,
61+
// let's assign a lower cost to substitution
62+
if (c1 == 't' && c2 == 'r') {
63+
return 0.5;
64+
}
65+
66+
return 1.0;
67+
}
68+
});
69+
System.out.println(wl.distanceAbsolute("String1", "Srring2"));
70+
}
}
71+
```
72+
4673
## Damerau-Levenshtein
4774
Similar to Levenshtein, Damerau-Levenshtein distance is the minimum number of operations needed to transform one string into the other, where an operation is defined as an insertion, deletion, or substitution of a single character, or a **transposition of two adjacent characters**.
4875

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
/*
2+
* The MIT License
3+
*
4+
* Copyright 2015 Thibault Debatty.
5+
*
6+
* Permission is hereby granted, free of charge, to any person obtaining a copy
7+
* of this software and associated documentation files (the "Software"), to deal
8+
* in the Software without restriction, including without limitation the rights
9+
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10+
* copies of the Software, and to permit persons to whom the Software is
11+
* furnished to do so, subject to the following conditions:
12+
*
13+
* The above copyright notice and this permission notice shall be included in
14+
* all copies or substantial portions of the Software.
15+
*
16+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22+
* THE SOFTWARE.
23+
*/
24+
25+
package info.debatty.java.stringsimilarity;
26+
27+
/**
28+
* Used to indicate the cost of character substitution.
29+
*
30+
* Cost should always be in [0.0 .. 1.0]
31+
* For example, in an OCR application, cost('o', 'a') could be 0.4
32+
* In a spell-checking application, cost('u', 'i') could be 0.4 because these are
33+
* next to each other on the keyboard...
34+
* @author Thibault Debatty
35+
*/
36+
public interface CharacterSubstitutionInterface {
37+
public double cost(char c1, char c2);
38+
}

src/main/java/info/debatty/java/stringsimilarity/Levenshtein.java

Lines changed: 56 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ public static void main (String[] args) {
1313
Levenshtein l = new Levenshtein();
1414

1515
System.out.println(l.distanceAbsolute("My string", "My $tring"));
16+
System.out.println(l.distanceAbsolute("My string", "M string2"));
1617
System.out.println(l.distance("My string", "My $tring"));
1718
System.out.println(l.similarity("My string", "My $tring"));
1819
}
@@ -24,7 +25,7 @@ public static int Distance(String s1, String s2) {
2425

2526
@Override
2627
public double distance(String s1, String s2) {
27-
return ((double) distanceAbsolute(s1, s2)) / Math.max(s1.length(), s2.length());
28+
return (double) distanceAbsolute(s1, s2) / Math.max(s1.length(), s2.length());
2829

2930
}
3031

@@ -35,7 +36,7 @@ public double similarity(String s1, String s2) {
3536

3637
/**
3738
* The Levenshtein distance, or edit distance, between two words is the
38-
* minimum number of single-character edits (i.e. insertions, deletions or
39+
* minimum number of single-character edits (insertions, deletions or
3940
* substitutions) required to change one word into the other.
4041
*
4142
* http://en.wikipedia.org/wiki/Levenshtein_distance
@@ -45,57 +46,69 @@ public double similarity(String s1, String s2) {
4546
* It is zero if and only if the strings are equal.
4647
* If the strings are the same size, the Hamming distance is an upper bound
4748
* on the Levenshtein distance.
48-
* The Levenshtein distance between two strings is no greater than the sum
49-
* of their Levenshtein distances from a third string (triangle inequality).
49+
* The Levenshtein distance satisfies the triangle inequality (the distance
50+
* between two strings is no greater than the sum of their Levenshtein distances from
51+
* a third string).
52+
*
53+
* Implementation uses dynamic programming (Wagner–Fischer algorithm), with
54+
* only 2 rows of data. The space requirement is thus O(m) and the algorithm
55+
* runs in O(mn).
5056
*
51-
* @param s0
5257
* @param s1
58+
* @param s2
5359
* @return
5460
*/
55-
public int distanceAbsolute(String s0, String s1) {
56-
int len0 = s0.length() + 1;
57-
int len1 = s1.length() + 1;
58-
59-
// the array of distances
60-
int[] cost = new int[len0];
61-
int[] newcost = new int[len0];
62-
63-
// initial cost of skipping prefix in String s0
64-
for (int i = 0; i < len0; i++) {
65-
cost[i] = i;
61+
public int distanceAbsolute(String s1, String s2) {
62+
if (s1.equals(s2)){
63+
return 0;
64+
}
65+
66+
if (s1.length() == 0) {
67+
return s2.length();
68+
}
69+
70+
if (s2.length() == 0) {
71+
return s1.length();
6672
}
6773

68-
// dynamicaly computing the array of distances
69-
// transformation cost for each letter in s1
70-
for (int j = 1; j < len1; j++) {
71-
72-
// initial cost of skipping prefix in String s1
73-
newcost[0] = j - 1;
74-
75-
// transformation cost for each letter in s0
76-
for (int i = 1; i < len0; i++) {
77-
78-
// matching current letters in both strings
79-
int match = (s0.charAt(i - 1) == s1.charAt(j - 1)) ? 0 : 1;
74+
// create two work vectors of integer distances
75+
int[] v0 = new int[s2.length() + 1];
76+
int[] v1 = new int[s2.length() + 1];
77+
int[] vtemp;
8078

81-
// computing cost for each transformation
82-
int cost_replace = cost[i - 1] + match;
83-
int cost_insert = cost[i] + 1;
84-
int cost_delete = newcost[i - 1] + 1;
79+
// initialize v0 (the previous row of distances)
80+
// this row is A[0][i]: edit distance for an empty s1
81+
// the distance is just the number of characters to delete from s2
82+
for (int i = 0; i < v0.length; i++) {
83+
v0[i] = i;
84+
}
85+
86+
for (int i = 0; i < s1.length(); i++) {
87+
// calculate v1 (current row distances) from the previous row v0
88+
// first element of v1 is A[i+1][0]
89+
// edit distance is to delete (i+1) chars from s1 to match an empty s2
90+
v1[0] = i + 1;
8591

86-
// keep minimum cost
87-
newcost[i] = Math.min(
88-
Math.min(cost_insert, cost_delete),
89-
cost_replace);
92+
// use formula to fill in the rest of the row
93+
for (int j = 0; j < s2.length(); j++) {
94+
int cost = (s1.charAt(i) == s2.charAt(j)) ? 0 : 1;
95+
v1[j + 1] = Math.min(
96+
v1[j] + 1, // Cost of insertion
97+
Math.min(
98+
v0[j + 1] + 1, // Cost of remove
99+
v0[j] + cost)); // Cost of substitution
90100
}
91-
92-
// swap cost/newcost arrays
93-
int[] swap = cost;
94-
cost = newcost;
95-
newcost = swap;
101+
102+
// copy v1 (current row) to v0 (previous row) for next iteration
103+
//System.arraycopy(v1, 0, v0, 0, v0.length);
104+
105+
// Flip references to current and previous row
106+
vtemp = v0;
107+
v0 = v1;
108+
v1 = vtemp;
109+
96110
}
97111

98-
// the distance is the cost for transforming all letters in both strings
99-
return cost[len0 - 1];
112+
return v0[s2.length()];
100113
}
101114
}
Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
/*
2+
* The MIT License
3+
*
4+
* Copyright 2015 Thibault Debatty.
5+
*
6+
* Permission is hereby granted, free of charge, to any person obtaining a copy
7+
* of this software and associated documentation files (the "Software"), to deal
8+
* in the Software without restriction, including without limitation the rights
9+
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10+
* copies of the Software, and to permit persons to whom the Software is
11+
* furnished to do so, subject to the following conditions:
12+
*
13+
* The above copyright notice and this permission notice shall be included in
14+
* all copies or substantial portions of the Software.
15+
*
16+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22+
* THE SOFTWARE.
23+
*/
24+
25+
package info.debatty.java.stringsimilarity;
26+
27+
/**
28+
* Implementation of Levenshtein that allows defining different weights for
29+
* different character substitutions.
30+
*
31+
* @author Thibault Debatty
32+
*/
33+
public class WeightedLevenshtein implements StringSimilarityInterface {
34+
35+
/**
36+
* @param args the command line arguments
37+
*/
38+
public static void main(String[] args) {
39+
WeightedLevenshtein wl = new WeightedLevenshtein(
40+
new CharacterSubstitutionInterface() {
41+
public double cost(char c1, char c2) {
42+
if (c1 == 't' && c2 == 'r') {
43+
return 0.5;
44+
}
45+
return 1.0;
46+
}
47+
});
48+
System.out.println(wl.distanceAbsolute("String1", "Srring2"));
49+
}
50+
51+
private final CharacterSubstitutionInterface charsub;
52+
53+
public WeightedLevenshtein(CharacterSubstitutionInterface charsub) {
54+
this.charsub = charsub;
55+
}
56+
57+
public double distanceAbsolute(String s1, String s2) {
58+
if (s1.equals(s2)){
59+
return 0;
60+
}
61+
62+
if (s1.length() == 0) {
63+
return s2.length();
64+
}
65+
66+
if (s2.length() == 0) {
67+
return s1.length();
68+
}
69+
70+
// create two work vectors of (double) distances
71+
double[] v0 = new double[s2.length() + 1];
72+
double[] v1 = new double[s2.length() + 1];
73+
double[] vtemp;
74+
75+
// initialize v0 (the previous row of distances)
76+
// this row is A[0][i]: edit distance for an empty s1
77+
// the distance is just the number of characters to delete from s2
78+
for (int i = 0; i < v0.length; i++) {
79+
v0[i] = i;
80+
}
81+
82+
for (int i = 0; i < s1.length(); i++) {
83+
// calculate v1 (current row distances) from the previous row v0
84+
// first element of v1 is A[i+1][0]
85+
// edit distance is to delete (i+1) chars from s1 to match an empty s2
86+
v1[0] = i + 1;
87+
88+
// use formula to fill in the rest of the row
89+
for (int j = 0; j < s2.length(); j++) {
90+
double cost = (s1.charAt(i) == s2.charAt(j)) ? 0 : charsub.cost(s1.charAt(i), s2.charAt(j));
91+
v1[j + 1] = Math.min(
92+
v1[j] + 1, // Cost of insertion
93+
Math.min(
94+
v0[j + 1] + 1, // Cost of remove
95+
v0[j] + cost)); // Cost of substitution
96+
}
97+
98+
// copy v1 (current row) to v0 (previous row) for next iteration
99+
//System.arraycopy(v1, 0, v0, 0, v0.length);
100+
101+
// Flip references to current and previous row
102+
vtemp = v0;
103+
v0 = v1;
104+
v1 = vtemp;
105+
106+
}
107+
108+
return v0[s2.length()];
109+
}
110+
111+
public double similarity(String s1, String s2) {
112+
return 1.0 - distance(s1, s2);
113+
}
114+
115+
public double distance(String s1, String s2) {
116+
return (double) distanceAbsolute(s1, s2) / Math.max(s1.length(), s2.length());
117+
}
118+
}

0 commit comments

Comments
 (0)