Skip to content

Commit 34cf398

Browse files
authored
Merge pull request #29 from paulirwin/issue/28
Resolves #28: null and empty value handling
2 parents 471682e + 3406d2c commit 34cf398

34 files changed

+495
-177
lines changed

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,6 @@
11
/nbproject/private/
22
/build/
33
/dist/
4-
/target/
4+
/target/
5+
.idea/
6+
*.iml

src/main/java/info/debatty/java/stringsimilarity/CharacterSubstitutionInterface.java

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -37,9 +37,9 @@
3737
public interface CharacterSubstitutionInterface {
3838
/**
3939
* Indicate the cost of substitution c1 and c2.
40-
* @param c1
41-
* @param c2
42-
* @return
40+
* @param c1 The first character of the substitution.
41+
* @param c2 The second character of the substitution.
42+
* @return The cost in the range [0, 1].
4343
*/
4444
double cost(char c1, char c2);
4545
}

src/main/java/info/debatty/java/stringsimilarity/Cosine.java

Lines changed: 20 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,7 @@
2626
import info.debatty.java.stringsimilarity.interfaces.NormalizedStringSimilarity;
2727
import info.debatty.java.stringsimilarity.interfaces.NormalizedStringDistance;
2828
import java.util.Map;
29+
2930
import net.jcip.annotations.Immutable;
3031

3132
/**
@@ -64,11 +65,23 @@ public Cosine() {
6465

6566
/**
6667
* Compute the cosine similarity between strings.
67-
* @param s1
68-
* @param s2
69-
* @return
68+
* @param s1 The first string to compare.
69+
* @param s2 The second string to compare.
70+
* @return The cosine similarity in the range [0, 1]
71+
* @throws NullPointerException if s1 or s2 is null.
7072
*/
7173
public final double similarity(final String s1, final String s2) {
74+
if (s1 == null) {
75+
throw new NullPointerException("s1 must not be null");
76+
}
77+
78+
if (s2 == null) {
79+
throw new NullPointerException("s2 must not be null");
80+
}
81+
82+
if (s1.equals(s2)) {
83+
return 1;
84+
}
7285

7386
if (s1.length() < getK() || s2.length() < getK()) {
7487
return 0;
@@ -125,9 +138,10 @@ private static double dotProduct(
125138

126139
/**
127140
* Return 1.0 - similarity.
128-
* @param s1
129-
* @param s2
130-
* @return
141+
* @param s1 The first string to compare.
142+
* @param s2 The second string to compare.
143+
* @return 1.0 - the cosine similarity in the range [0, 1]
144+
* @throws NullPointerException if s1 or s2 is null.
131145
*/
132146
public final double distance(final String s1, final String s2) {
133147
return 1.0 - similarity(s1, s2);

src/main/java/info/debatty/java/stringsimilarity/Damerau.java

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525

2626
import info.debatty.java.stringsimilarity.interfaces.MetricStringDistance;
2727
import java.util.HashMap;
28+
2829
import net.jcip.annotations.Immutable;
2930

3031
/**
@@ -49,12 +50,25 @@ public class Damerau implements MetricStringDistance {
4950
* needed to transform one string into the other (insertion, deletion,
5051
* substitution of a single character, or a transposition of two adjacent
5152
* characters).
52-
* @param s1
53-
* @param s2
54-
* @return
53+
* @param s1 The first string to compare.
54+
* @param s2 The second string to compare.
55+
* @return The computed distance.
56+
* @throws NullPointerException if s1 or s2 is null.
5557
*/
5658
public final double distance(final String s1, final String s2) {
5759

60+
if (s1 == null) {
61+
throw new NullPointerException("s1 must not be null");
62+
}
63+
64+
if (s2 == null) {
65+
throw new NullPointerException("s2 must not be null");
66+
}
67+
68+
if (s1.equals(s2)) {
69+
return 0;
70+
}
71+
5872
// INFinite distance is the max possible distance
5973
int inf = s1.length() + s2.length();
6074

src/main/java/info/debatty/java/stringsimilarity/Jaccard.java

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
import java.util.HashSet;
3131
import java.util.Map;
3232
import java.util.Set;
33+
3334
import net.jcip.annotations.Immutable;
3435

3536
/**
@@ -68,12 +69,25 @@ public Jaccard() {
6869
}
6970

7071
/**
71-
* Compute jaccard index: |A inter B| / |A union B|.
72-
* @param s1
73-
* @param s2
74-
* @return
72+
* Compute Jaccard index: |A inter B| / |A union B|.
73+
* @param s1 The first string to compare.
74+
* @param s2 The second string to compare.
75+
* @return The Jaccard index in the range [0, 1]
76+
* @throws NullPointerException if s1 or s2 is null.
7577
*/
7678
public final double similarity(final String s1, final String s2) {
79+
if (s1 == null) {
80+
throw new NullPointerException("s1 must not be null");
81+
}
82+
83+
if (s2 == null) {
84+
throw new NullPointerException("s2 must not be null");
85+
}
86+
87+
if (s1.equals(s2)) {
88+
return 1;
89+
}
90+
7791
Map<String, Integer> profile1 = getProfile(s1);
7892
Map<String, Integer> profile2 = getProfile(s2);
7993

@@ -95,9 +109,10 @@ public final double similarity(final String s1, final String s2) {
95109

96110
/**
97111
* Distance is computed as 1 - similarity.
98-
* @param s1
99-
* @param s2
100-
* @return
112+
* @param s1 The first string to compare.
113+
* @param s2 The second string to compare.
114+
* @return 1 - the Jaccard similarity.
115+
* @throws NullPointerException if s1 or s2 is null.
101116
*/
102117
public final double distance(final String s1, final String s2) {
103118
return 1.0 - similarity(s1, s2);

src/main/java/info/debatty/java/stringsimilarity/JaroWinkler.java

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import info.debatty.java.stringsimilarity.interfaces.NormalizedStringSimilarity;
44
import info.debatty.java.stringsimilarity.interfaces.NormalizedStringDistance;
55
import java.util.Arrays;
6+
67
import net.jcip.annotations.Immutable;
78

89
/**
@@ -54,12 +55,25 @@ public final double getThreshold() {
5455
}
5556

5657
/**
57-
* Compute JW similarity.
58-
* @param s1
59-
* @param s2
60-
* @return
58+
* Compute Jaro-Winkler similarity.
59+
* @param s1 The first string to compare.
60+
* @param s2 The second string to compare.
61+
* @return The Jaro-Winkler similarity in the range [0, 1]
62+
* @throws NullPointerException if s1 or s2 is null.
6163
*/
6264
public final double similarity(final String s1, final String s2) {
65+
if (s1 == null) {
66+
throw new NullPointerException("s1 must not be null");
67+
}
68+
69+
if (s2 == null) {
70+
throw new NullPointerException("s2 must not be null");
71+
}
72+
73+
if (s1.equals(s2)) {
74+
return 1;
75+
}
76+
6377
int[] mtp = matches(s1, s2);
6478
float m = mtp[0];
6579
if (m == 0) {
@@ -78,9 +92,10 @@ public final double similarity(final String s1, final String s2) {
7892

7993
/**
8094
* Return 1 - similarity.
81-
* @param s1
82-
* @param s2
83-
* @return
95+
* @param s1 The first string to compare.
96+
* @param s2 The second string to compare.
97+
* @return 1 - similarity.
98+
* @throws NullPointerException if s1 or s2 is null.
8499
*/
85100
public final double distance(final String s1, final String s2) {
86101
return 1.0 - similarity(s1, s2);

src/main/java/info/debatty/java/stringsimilarity/Levenshtein.java

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,20 @@ public class Levenshtein implements MetricStringDistance {
3333
* only 2 rows of data. The space requirement is thus O(m) and the algorithm
3434
* runs in O(mn).
3535
*
36-
* @param s1
37-
* @param s2
38-
* @return
36+
* @param s1 The first string to compare.
37+
* @param s2 The second string to compare.
38+
* @return The computed Levenshtein distance.
39+
* @throws NullPointerException if s1 or s2 is null.
3940
*/
4041
public final double distance(final String s1, final String s2) {
42+
if (s1 == null) {
43+
throw new NullPointerException("s1 must not be null");
44+
}
45+
46+
if (s2 == null) {
47+
throw new NullPointerException("s2 must not be null");
48+
}
49+
4150
if (s1.equals(s2)) {
4251
return 0;
4352
}

src/main/java/info/debatty/java/stringsimilarity/LongestCommonSubsequence.java

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,24 +30,46 @@ public class LongestCommonSubsequence implements StringDistance {
3030
* Return the LCS distance between strings s1 and s2, computed as |s1| +
3131
* |s2| - 2 * |LCS(s1, s2)|.
3232
*
33-
* @param s1
34-
* @param s2
33+
* @param s1 The first string to compare.
34+
* @param s2 The second string to compare.
3535
* @return the LCS distance between strings s1 and s2, computed as |s1| +
3636
* |s2| - 2 * |LCS(s1, s2)|
37+
* @throws NullPointerException if s1 or s2 is null.
3738
*/
3839
public final double distance(final String s1, final String s2) {
40+
if (s1 == null) {
41+
throw new NullPointerException("s1 must not be null");
42+
}
43+
44+
if (s2 == null) {
45+
throw new NullPointerException("s2 must not be null");
46+
}
47+
48+
if (s1.equals(s2)) {
49+
return 0;
50+
}
51+
3952
return s1.length() + s2.length() - 2 * length(s1, s2);
4053
}
4154

4255
/**
4356
* Return the length of Longest Common Subsequence (LCS) between strings s1
4457
* and s2.
4558
*
46-
* @param s1
47-
* @param s2
59+
* @param s1 The first string to compare.
60+
* @param s2 The second string to compare.
4861
* @return the length of LCS(s1, s2)
62+
* @throws NullPointerException if s1 or s2 is null.
4963
*/
5064
public final int length(final String s1, final String s2) {
65+
if (s1 == null) {
66+
throw new NullPointerException("s1 must not be null");
67+
}
68+
69+
if (s2 == null) {
70+
throw new NullPointerException("s2 must not be null");
71+
}
72+
5173
/* function LCSLength(X[1..m], Y[1..n])
5274
C = array(0..m, 0..n)
5375

src/main/java/info/debatty/java/stringsimilarity/MetricLCS.java

Lines changed: 25 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -31,30 +31,43 @@
3131
/**
3232
* Distance metric based on Longest Common Subsequence, from the notes "An
3333
* LCS-based string metric" by Daniel Bakkelund.
34+
*
3435
* @author Thibault Debatty
3536
*/
3637
@Immutable
3738
public class MetricLCS
38-
implements MetricStringDistance, NormalizedStringDistance {
39+
implements MetricStringDistance, NormalizedStringDistance {
3940

4041
private final LongestCommonSubsequence lcs = new LongestCommonSubsequence();
4142

4243
/**
4344
* Distance metric based on Longest Common Subsequence, computed as
4445
* 1 - |LCS(s1, s2)| / max(|s1|, |s2|).
45-
* @param s1
46-
* @param s2
47-
* @return
46+
*
47+
* @param s1 The first string to compare.
48+
* @param s2 The second string to compare.
49+
* @return The computed distance metric value.
50+
* @throws NullPointerException if s1 or s2 is null.
4851
*/
4952
public final double distance(final String s1, final String s2) {
50-
int mLen = Math.max(s1.length(), s2.length());
51-
if (mLen == 0) {
52-
return 0;
53-
}
54-
return 1.0
55-
- (1.0 * lcs.length(s1, s2))
56-
/ mLen;
53+
if (s1 == null) {
54+
throw new NullPointerException("s1 must not be null");
55+
}
5756

58-
}
57+
if (s2 == null) {
58+
throw new NullPointerException("s2 must not be null");
59+
}
5960

61+
if (s1.equals(s2)) {
62+
return 0;
63+
}
64+
65+
int mLen = Math.max(s1.length(), s2.length());
66+
if (mLen == 0) {
67+
return 0;
68+
}
69+
return 1.0
70+
- (1.0 * lcs.length(s1, s2))
71+
/ mLen;
72+
}
6073
}

src/main/java/info/debatty/java/stringsimilarity/NGram.java

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -37,25 +37,30 @@ public NGram() {
3737

3838
/**
3939
* Compute n-gram distance.
40-
* @param s0
41-
* @param s1
42-
* @return
40+
* @param s0 The first string to compare.
41+
* @param s1 The second string to compare.
42+
* @return The computed n-gram distance in the range [0, 1]
43+
* @throws NullPointerException if s0 or s1 is null.
4344
*/
4445
public final double distance(final String s0, final String s1) {
45-
final char special = '\n';
46-
final int sl = s0.length();
47-
final int tl = s1.length();
46+
if (s0 == null) {
47+
throw new NullPointerException("s0 must not be null");
48+
}
49+
50+
if (s1 == null) {
51+
throw new NullPointerException("s1 must not be null");
52+
}
4853

4954
if (s0.equals(s1)) {
5055
return 0;
5156
}
5257

58+
final char special = '\n';
59+
final int sl = s0.length();
60+
final int tl = s1.length();
61+
5362
if (sl == 0 || tl == 0) {
54-
if (sl == tl) {
55-
return 0;
56-
} else {
57-
return 1;
58-
}
63+
return 1;
5964
}
6065

6166
int cost = 0;

0 commit comments

Comments
 (0)