Skip to content

implement soundex similarity #335

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: dev
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package org.jembi.jempi.linker.backend;

import com.fasterxml.jackson.core.JsonProcessingException;
import org.apache.commons.codec.language.Soundex;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.similarity.JaccardSimilarity;
import org.apache.commons.text.similarity.JaroWinklerSimilarity;
Expand All @@ -27,6 +28,7 @@ public final class LinkerProbabilistic {
// Shared comparator instances. getSimilarityFunction(...) hands these out for the
// matching SimilarityFunctionName constant, so one instance is reused for every field.
static final JaccardSimilarity JACCARD_SIMILARITY = new JaccardSimilarity();
// JaroSimilarity is the nested class declared further down in this file.
static final JaroSimilarity JARO_SIMILARITY = new JaroSimilarity();
// Also the fallback returned by getSimilarityFunction's default branch.
static final ExactSimilarity EXACT_SIMILARITY = new ExactSimilarity();
// Phonetic comparator: scores on equality of Soundex codes (see SoundexSimilarity).
static final SoundexSimilarity SOUNDEX_SIMILARITY = new SoundexSimilarity();
private static final int METRIC_MIN = 0;
private static final int METRIC_MAX = 1;
private static final int METRIC_SCORE = 2;
Expand All @@ -36,15 +38,15 @@ public final class LinkerProbabilistic {
private static final float MISSING_PENALTY = 0.925F;
static List<ProbabilisticField> currentProbabilisticLinkFields = LINKER_CONFIG.probabilisticLinkFields
.stream()
.map(f -> new ProbabilisticField(getSimilarityFunction(f.similarityScore()), f.comparisonLevels(), f.m(), f.u()))
.map(f -> new ProbabilisticField(getSimilarityFunction(SimilarityFunctionName.valueOf(f.similarityScore())), f.comparisonLevels(), f.m(), f.u()))
.toList();
static List<ProbabilisticField> currentProbabilisticValidateFields = LINKER_CONFIG.probabilisticValidateFields
.stream()
.map(f -> new ProbabilisticField(getSimilarityFunction(f.similarityScore()), f.comparisonLevels(), f.m(), f.u()))
.map(f -> new ProbabilisticField(getSimilarityFunction(SimilarityFunctionName.valueOf(f.similarityScore())), f.comparisonLevels(), f.m(), f.u()))
.toList();
static List<ProbabilisticField> currentProbabilisticMatchFields = LINKER_CONFIG.probabilisticMatchNotificationFields
.stream()
.map(f -> new ProbabilisticField(getSimilarityFunction(f.similarityScore()), f.comparisonLevels(), f.m(), f.u()))
.map(f -> new ProbabilisticField(getSimilarityFunction(SimilarityFunctionName.valueOf(f.similarityScore())), f.comparisonLevels(), f.m(), f.u()))
.toList();

static List<ProbabilisticField> updatedProbabilisticLinkFields = null;
Expand All @@ -65,18 +67,33 @@ static List<ProbabilisticField> toLinkProbabilisticFieldList(
final var list = new ArrayList<ProbabilisticField>();
for (int i = 0; i < mu.size(); i++) {
list.add(new ProbabilisticField(
getSimilarityFunction(probabilisticMetaData.get(i).similarityScore()),
getSimilarityFunction(SimilarityFunctionName.valueOf(probabilisticMetaData.get(i).similarityScore())),
probabilisticMetaData.get(i).comparisonLevels(),
mu.get(i).m(), mu.get(i).u()));
}
return list;
}

static SimilarityScore<Double> getSimilarityFunction(final String func) {
if ("JARO_WINKLER_SIMILARITY".equals(func)) {
return JARO_WINKLER_SIMILARITY;
} else {
return JACCARD_SIMILARITY;
/**
 * Names of the similarity functions that can be selected for a probabilistic field.
 *
 * <p>Callers resolve these from configuration text via {@code valueOf}, so every constant
 * name has to match the configured string exactly; an unknown string makes {@code valueOf}
 * throw {@link IllegalArgumentException}.
 */
public enum SimilarityFunctionName {
    /** Selects the shared Jaro-Winkler comparator. */
    JARO_WINKLER_SIMILARITY,

    /** Selects the shared Jaro comparator. */
    JARO_SIMILARITY,

    /** Selects the shared Jaccard comparator. */
    JACCARD_SIMILARITY,

    /** Selects the shared Soundex (phonetic code equality) comparator. */
    SOUNDEX_SIMILARITY,

    /** Selects the shared exact-match comparator. */
    EXACT_SIMILARITY
}

/**
 * Maps a similarity-function name to the shared comparator instance that implements it.
 *
 * @param func the selected function name; a {@code null} value fails with
 *             {@link NullPointerException} when the switch is evaluated
 * @return the comparator for {@code func}; {@code EXACT_SIMILARITY} is returned both for
 *         its own constant and for any name without a dedicated arm
 */
static SimilarityScore<Double> getSimilarityFunction(final SimilarityFunctionName func) {
    // In each arm the label is the enum constant and the result is the static field
    // of the same name declared on the enclosing class.
    return switch (func) {
        case JARO_WINKLER_SIMILARITY -> JARO_WINKLER_SIMILARITY;
        case JARO_SIMILARITY -> JARO_SIMILARITY;
        case JACCARD_SIMILARITY -> JACCARD_SIMILARITY;
        case SOUNDEX_SIMILARITY -> SOUNDEX_SIMILARITY;
        default -> EXACT_SIMILARITY;
    };
}

Expand Down Expand Up @@ -268,14 +285,33 @@ public Double apply(
if (StringUtils.isEmpty(left) || StringUtils.isEmpty(right)) {
return 0.5;
}
// assert - we have 2 non-empty strings

return StringUtils.equals(left, right)
? 1.0
: 0.0;
}

}

/**
 * Phonetic similarity: two values match when their Soundex codes are equal.
 *
 * <p>Scores are {@code 1.0} (match), {@code 0.0} (no match) or {@code 0.5} when either
 * input is null or empty. When a Soundex code cannot be produced for either value, the
 * raw strings are compared for exact equality instead of failing or reporting a
 * spurious phonetic match.
 */
static class SoundexSimilarity implements SimilarityScore<Double> {

    private final Soundex soundex = new Soundex();

    /**
     * Compares two values by Soundex code.
     *
     * @param left  first value, may be null or empty
     * @param right second value, may be null or empty
     * @return {@code 0.5} if either value is null/empty; otherwise {@code 1.0} when the
     *         values are considered equal and {@code 0.0} when they are not
     */
    @Override
    public Double apply(
            final CharSequence left,
            final CharSequence right) {
        if (StringUtils.isEmpty(left) || StringUtils.isEmpty(right)) {
            return 0.5;
        }

        final String leftCode = encodeOrNull(left);
        final String rightCode = encodeOrNull(right);
        if (leftCode == null || rightCode == null) {
            // No phonetic information for at least one side: an exact comparison is the
            // only meaningful signal left (e.g. purely numeric or non-Latin values).
            return StringUtils.equals(left, right)
                    ? 1.0
                    : 0.0;
        }

        return leftCode.equals(rightCode)
                ? 1.0
                : 0.0;
    }

    /** Returns the Soundex code for {@code value}, or {@code null} when none can be derived. */
    private String encodeOrNull(final CharSequence value) {
        try {
            final String code = soundex.soundex(value.toString());
            // Input without any letters encodes to an empty string; treat it as "no code"
            // so that two unrelated letter-free values do not compare as equal.
            return StringUtils.isEmpty(code) ? null : code;
        } catch (IllegalArgumentException ignored) {
            // Soundex rejects letters outside its A-Z mapping (e.g. accented characters).
            return null;
        }
    }

}

static class JaroSimilarity implements SimilarityScore<Double> {

@Override
Expand Down