Skip to content

Commit 1afdc76

Browse files
committed
Added checkstyle test + removed sparse vector classes, which are not used anymore
1 parent 289e082 commit 1afdc76

19 files changed

+106
-1316
lines changed

pom.xml

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
<url>http://www.opensource.org/licenses/mit-license.php</url>
2323
</license>
2424
</licenses>
25-
25+
2626
<developers>
2727
<developer>
2828
<name>Thibault Debatty</name>
@@ -31,7 +31,7 @@
3131
<organizationUrl>http://debatty.info</organizationUrl>
3232
</developer>
3333
</developers>
34-
34+
3535
<scm>
3636
<connection>scm:git:[email protected]:tdebatty/java-string-similarity.git</connection>
3737
<developerConnection>scm:git:[email protected]:tdebatty/java-string-similarity.git</developerConnection>
@@ -156,6 +156,29 @@
156156
</instrumentation>
157157
</configuration>
158158
</plugin>
159+
160+
<plugin>
161+
<groupId>org.apache.maven.plugins</groupId>
162+
<artifactId>maven-checkstyle-plugin</artifactId>
163+
<version>2.16</version>
164+
<executions>
165+
<execution>
166+
<id>validate</id>
167+
<phase>verify</phase>
168+
<configuration>
169+
<configLocation>checkstyle.xml</configLocation>
170+
<cacheFile>target/checkstyle_cache</cacheFile>
171+
<encoding>UTF-8</encoding>
172+
<consoleOutput>true</consoleOutput>
173+
<linkXRef>false</linkXRef>
174+
<excludes>**/examples/**</excludes>
175+
</configuration>
176+
<goals>
177+
<goal>check</goal>
178+
</goals>
179+
</execution>
180+
</executions>
181+
</plugin>
159182
</plugins>
160183
</build>
161184

src/main/java/info/debatty/java/stringsimilarity/Cosine.java

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -56,15 +56,15 @@ public Cosine(final int k) {
5656
* Implements Cosine Similarity between strings. The strings are first
5757
* transformed in vectors of occurrences of k-shingles (sequences of k
5858
* characters). In this n-dimensional space, the similarity between the two
59-
* strings is the cosine of their respective vectors.
60-
* Default k is 3.
59+
* strings is the cosine of their respective vectors. Default k is 3.
6160
*/
6261
public Cosine() {
6362
super();
6463
}
6564

6665
/**
6766
* Compute the cosine similarity between strings.
67+
*
6868
* @param s1 The first string to compare.
6969
* @param s2 The second string to compare.
7070
* @return The cosine similarity in the range [0, 1]
@@ -94,8 +94,6 @@ public final double similarity(final String s1, final String s2) {
9494
/ (norm(profile1) * norm(profile2));
9595
}
9696

97-
98-
9997
/**
10098
* Compute the norm L2 : sqrt(Sum_i( v_i²)).
10199
*
@@ -126,8 +124,8 @@ private static double dotProduct(
126124

127125
double agg = 0;
128126
for (Map.Entry<String, Integer> entry : small_profile.entrySet()) {
129-
Integer i=large_profile.get(entry.getKey());
130-
if (i==null) {
127+
Integer i = large_profile.get(entry.getKey());
128+
if (i == null) {
131129
continue;
132130
}
133131
agg += 1.0 * entry.getValue() * i;
@@ -138,6 +136,7 @@ private static double dotProduct(
138136

139137
/**
140138
* Return 1.0 - similarity.
139+
*
141140
* @param s1 The first string to compare.
142141
* @param s2 The second string to compare.
143142
* @return 1.0 - the cosine similarity in the range [0, 1]
@@ -147,7 +146,13 @@ public final double distance(final String s1, final String s2) {
147146
return 1.0 - similarity(s1, s2);
148147
}
149148

150-
public double similarity(
149+
/**
150+
* {@inheritDoc}
151+
* @param profile1
152+
* @param profile2
153+
* @return
154+
*/
155+
public final double similarity(
151156
final Map<String, Integer> profile1,
152157
final Map<String, Integer> profile2) {
153158

src/main/java/info/debatty/java/stringsimilarity/JaroWinkler.java

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -111,17 +111,19 @@ private int[] matches(final String s1, final String s2) {
111111
min = s1;
112112
}
113113
int range = Math.max(max.length() / 2 - 1, 0);
114-
int[] matchIndexes = new int[min.length()];
115-
Arrays.fill(matchIndexes, -1);
116-
boolean[] matchFlags = new boolean[max.length()];
114+
int[] match_indexes = new int[min.length()];
115+
Arrays.fill(match_indexes, -1);
116+
boolean[] match_flags = new boolean[max.length()];
117117
int matches = 0;
118118
for (int mi = 0; mi < min.length(); mi++) {
119119
char c1 = min.charAt(mi);
120120
for (int xi = Math.max(mi - range, 0),
121-
xn = Math.min(mi + range + 1, max.length()); xi < xn; xi++) {
122-
if (!matchFlags[xi] && c1 == max.charAt(xi)) {
123-
matchIndexes[mi] = xi;
124-
matchFlags[xi] = true;
121+
xn = Math.min(mi + range + 1, max.length());
122+
xi < xn;
123+
xi++) {
124+
if (!match_flags[xi] && c1 == max.charAt(xi)) {
125+
match_indexes[mi] = xi;
126+
match_flags[xi] = true;
125127
matches++;
126128
break;
127129
}
@@ -130,13 +132,13 @@ private int[] matches(final String s1, final String s2) {
130132
char[] ms1 = new char[matches];
131133
char[] ms2 = new char[matches];
132134
for (int i = 0, si = 0; i < min.length(); i++) {
133-
if (matchIndexes[i] != -1) {
135+
if (match_indexes[i] != -1) {
134136
ms1[si] = min.charAt(i);
135137
si++;
136138
}
137139
}
138140
for (int i = 0, si = 0; i < max.length(); i++) {
139-
if (matchFlags[i]) {
141+
if (match_flags[i]) {
140142
ms2[si] = max.charAt(i);
141143
si++;
142144
}

src/main/java/info/debatty/java/stringsimilarity/LongestCommonSubsequence.java

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -87,15 +87,15 @@ public final int length(final String s1, final String s2) {
8787
C[i,j] := max(C[i,j-1], C[i-1,j])
8888
return C[m,n]
8989
*/
90-
int m = s1.length();
91-
int n = s2.length();
90+
int s1_length = s1.length();
91+
int s2_length = s2.length();
9292
char[] x = s1.toCharArray();
9393
char[] y = s2.toCharArray();
9494

95-
int[][] c = new int[m + 1][n + 1];
95+
int[][] c = new int[s1_length + 1][s2_length + 1];
9696

97-
for (int i = 1; i <= m; i++) {
98-
for (int j = 1; j <= n; j++) {
97+
for (int i = 1; i <= s1_length; i++) {
98+
for (int j = 1; j <= s2_length; j++) {
9999
if (x[i - 1] == y[j - 1]) {
100100
c[i][j] = c[i - 1][j - 1] + 1;
101101

@@ -105,6 +105,6 @@ public final int length(final String s1, final String s2) {
105105
}
106106
}
107107

108-
return c[m][n];
108+
return c[s1_length][s2_length];
109109
}
110110
}

src/main/java/info/debatty/java/stringsimilarity/MetricLCS.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -62,12 +62,12 @@ public final double distance(final String s1, final String s2) {
6262
return 0;
6363
}
6464

65-
int mLen = Math.max(s1.length(), s2.length());
66-
if (mLen == 0) {
65+
int m_len = Math.max(s1.length(), s2.length());
66+
if (m_len == 0) {
6767
return 0;
6868
}
6969
return 1.0
7070
- (1.0 * lcs.length(s1, s2))
71-
/ mLen;
71+
/ m_len;
7272
}
7373
}

src/main/java/info/debatty/java/stringsimilarity/NormalizedLevenshtein.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -62,13 +62,13 @@ public final double distance(final String s1, final String s2) {
6262
return 0;
6363
}
6464

65-
int mLen = Math.max(s1.length(), s2.length());
65+
int m_len = Math.max(s1.length(), s2.length());
6666

67-
if (mLen == 0) {
67+
if (m_len == 0) {
6868
return 0;
6969
}
7070

71-
return l.distance(s1, s2) / mLen;
71+
return l.distance(s1, s2) / m_len;
7272
}
7373

7474
/**

src/main/java/info/debatty/java/stringsimilarity/OptimalStringAlignment.java

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ public final class OptimalStringAlignment implements StringDistance {
5151
* @return the OSA distance
5252
* @throws NullPointerException if s1 or s2 is null.
5353
*/
54-
public final double distance(final String s1, final String s2) {
54+
public double distance(final String s1, final String s2) {
5555
if (s1 == null) {
5656
throw new NullPointerException("s1 must not be null");
5757
}
@@ -92,19 +92,21 @@ public final double distance(final String s1, final String s2) {
9292
for (int j = 1; j <= m; j++) {
9393

9494
//if s1[i - 1] = s2[j - 1] then cost = 0, else cost = 1
95-
cost = (s1.charAt(i - 1) == s2.charAt(j - 1)) ? 0 : 1;
95+
cost = 1;
96+
if (s1.charAt(i - 1) == s2.charAt(j - 1)) {
97+
cost = 0;
98+
}
9699

97100
d[i][j] = min(
98101
d[i - 1][j - 1] + cost, // substitution
99-
d[i][j - 1] + 1, // insertion
100-
d[i - 1][j] + 1 // deletion
102+
d[i][j - 1] + 1, // insertion
103+
d[i - 1][j] + 1 // deletion
101104
);
102105

103106
//transposition check
104-
if (i > 1 && j > 1
105-
&& s1.charAt(i - 1) == s2.charAt(j - 2)
106-
&& s1.charAt(i - 2) == s2.charAt(j - 1)
107-
){
107+
if (i > 1 && j > 1
108+
&& s1.charAt(i - 1) == s2.charAt(j - 2)
109+
&& s1.charAt(i - 2) == s2.charAt(j - 1)) {
108110
d[i][j] = Math.min(d[i][j], d[i - 2][j - 2] + cost);
109111
}
110112
}

src/main/java/info/debatty/java/stringsimilarity/ShingleBased.java

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,9 @@
4444
* replaced by a single space, and a k-gram is a sequence of k characters.
4545
*
4646
* Default value of k is 3. A good rule of thumb is to imagine that there are
47-
* only 20 characters and estimate the number of k-shingles as 20^k. For
48-
* small documents like e-mails, k = 5 is a recommended value. For large
49-
* documents, such as research articles, k = 9 is considered a safe choice.
47+
* only 20 characters and estimate the number of k-shingles as 20^k. For small
48+
* documents like e-mails, k = 5 is a recommended value. For large documents,
49+
* such as research articles, k = 9 is considered a safe choice.
5050
*
5151
* @author Thibault Debatty
5252
*/
@@ -93,11 +93,10 @@ public int getK() {
9393
/**
9494
* Compute and return the profile of s, as defined by Ukkonen "Approximate
9595
* string-matching with q-grams and maximal matches".
96-
* https://www.cs.helsinki.fi/u/ukkonen/TCS92.pdf
97-
* The profile is the number of occurrences of k-shingles, and is used to
98-
* compute q-gram similarity, Jaccard index, etc.
99-
* Pay attention: the memory requirement of the profile can be up to
100-
* k * size of the string
96+
* https://www.cs.helsinki.fi/u/ukkonen/TCS92.pdf The profile is the number
97+
* of occurrences of k-shingles, and is used to compute q-gram similarity,
98+
* Jaccard index, etc. Pay attention: the memory requirement of the profile
99+
* can be up to k * size of the string
101100
*
102101
* @param string
103102
* @return the profile of this string, as an unmodifiable Map
@@ -109,7 +108,7 @@ public final Map<String, Integer> getProfile(final String string) {
109108
for (int i = 0; i < (string_no_space.length() - k + 1); i++) {
110109
String shingle = string_no_space.substring(i, i + k);
111110
Integer old = shingles.get(shingle);
112-
if (old!=null) {
111+
if (old != null) {
113112
shingles.put(shingle, old + 1);
114113
} else {
115114
shingles.put(shingle, 1);

src/main/java/info/debatty/java/stringsimilarity/SorensenDice.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -63,15 +63,15 @@ public SorensenDice(final int k) {
6363
* The strings are first converted to boolean sets of k-shingles (sequences
6464
* of k characters), then the similarity is computed as 2 * |A inter B| /
6565
* (|A| + |B|). Attention: Sorensen-Dice distance (and similarity) does not
66-
* satisfy triangle inequality.
67-
* Default k is 3.
66+
* satisfy triangle inequality. Default k is 3.
6867
*/
6968
public SorensenDice() {
7069
super();
7170
}
7271

7372
/**
7473
* Similarity is computed as 2 * |A inter B| / (|A| + |B|).
74+
*
7575
* @param s1 The first string to compare.
7676
* @param s2 The second string to compare.
7777
* @return The computed Sorensen-Dice similarity.
@@ -108,15 +108,15 @@ public final double similarity(final String s1, final String s2) {
108108
return 2.0 * inter / (profile1.size() + profile2.size());
109109
}
110110

111-
112111
/**
113112
* Returns 1 - similarity.
113+
*
114114
* @param s1 The first string to compare.
115115
* @param s2 The second string to compare.
116116
* @return 1.0 - the computed similarity
117117
* @throws NullPointerException if s1 or s2 is null.
118118
*/
119-
public double distance(String s1, String s2) {
119+
public final double distance(final String s1, final String s2) {
120120
return 1 - similarity(s1, s2);
121121
}
122122
}

0 commit comments

Comments
 (0)