Skip to content

Commit 8eafe5b

Browse files
committed
Use Spotlight core tokenizer when collecting ngrams (introduces dependency on Spotlight Core).
1 parent 728653c commit 8eafe5b

File tree

4 files changed

+20
-12
lines changed

4 files changed

+20
-12
lines changed

examples/indexing/names_and_entities.pig

+1-1
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ sfs = UNION ONSCHEMA
118118
STORE sfs INTO '$TEMPORARY_SF_LOCATION/sfs';
119119

120120
-- Define Ngram generator with maximum Ngram length
121-
DEFINE ngramGenerator pignlproc.helpers.RestrictedNGramGenerator('$MAX_NGRAM_LENGTH', '$TEMPORARY_SF_LOCATION/sfs');
121+
DEFINE ngramGenerator pignlproc.helpers.RestrictedNGramGenerator('$MAX_NGRAM_LENGTH', '$TEMPORARY_SF_LOCATION/sfs', '$LOCALE');
122122

123123
EXEC;
124124

examples/indexing/names_and_entities.pig.params

+2-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@ PIGNLPROC_JAR=%PIG_PATH/target/pignlproc-0.1.0-SNAPSHOT.jar
33
# number of reducers
44
DEFAULT_PARALLEL=6
55

6-
LANG=nl
6+
LANG=%LANG
7+
LOCALE=%LOCALE
78

89
INPUT=/user/hadoop/%LANGwiki-latest-pages-articles.xml
910
OUTPUT=hdfs:///user/hadoop/%LANG/names_and_entities

pom.xml

+6
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,12 @@
8989
<version>4.2</version>
9090
<scope>test</scope>
9191
</dependency>
92+
<dependency>
93+
<groupId>org.dbpedia.spotlight</groupId>
94+
<version>0.6</version>
95+
<artifactId>core</artifactId>
96+
</dependency>
97+
9298
</dependencies>
9399

94100
<build>

src/main/java/pignlproc/helpers/RestrictedNGramGenerator.java

+11-10
Original file line numberDiff line numberDiff line change
@@ -18,23 +18,21 @@
1818
* limitations under the License.
1919
*/
2020

21-
import opennlp.tools.tokenize.SimpleTokenizer;
22-
import opennlp.tools.tokenize.Tokenizer;
2321
import opennlp.tools.util.Span;
2422
import org.apache.pig.EvalFunc;
2523
import org.apache.pig.FuncSpec;
2624
import org.apache.pig.data.*;
2725
import org.apache.pig.impl.logicalLayer.FrontendException;
2826
import org.apache.pig.impl.logicalLayer.schema.Schema;
27+
import org.dbpedia.spotlight.db.model.RawTokenizer;
28+
import org.dbpedia.spotlight.db.model.Stemmer;
29+
import org.dbpedia.spotlight.db.tokenize.LanguageIndependentRawTokenizer;
2930

3031
import java.io.BufferedReader;
3132
import java.io.File;
3233
import java.io.FileReader;
3334
import java.io.IOException;
34-
import java.util.ArrayList;
35-
import java.util.HashSet;
36-
import java.util.List;
37-
import java.util.Set;
35+
import java.util.*;
3836

3937

4038
/**
@@ -47,7 +45,7 @@ public class RestrictedNGramGenerator extends EvalFunc<DataBag> {
4745

4846
private int ngramSizeLimit;
4947

50-
private final Tokenizer tokenizer = SimpleTokenizer.INSTANCE;
48+
private RawTokenizer tokenizer;
5149

5250
private final BagFactory bagFactory = DefaultBagFactory.getInstance();
5351
private final TupleFactory tupleFactory = TupleFactory.getInstance();
@@ -56,14 +54,17 @@ public class RestrictedNGramGenerator extends EvalFunc<DataBag> {
5654
private String surfaceFormListFile;
5755

5856

59-
public RestrictedNGramGenerator(int ngramSizeLimit, String surfaceFormListFile) {
57+
public RestrictedNGramGenerator(int ngramSizeLimit, String surfaceFormListFile, String locale) {
6058
this.ngramSizeLimit = ngramSizeLimit;
6159
this.surfaceFormListFile = surfaceFormListFile;
60+
61+
String[] localeA = locale.split("_");
62+
this.tokenizer = new LanguageIndependentRawTokenizer(new Locale(localeA[0], localeA[1]), new Stemmer());
6263
}
6364

6465
// Pig versions < 0.9 seem to only pass strings in constructor
65-
public RestrictedNGramGenerator(String ngramSizeLimit, String surfaceFormListFile) {
66-
this(Integer.valueOf(ngramSizeLimit), surfaceFormListFile);
66+
public RestrictedNGramGenerator(String ngramSizeLimit, String surfaceFormListFile, String locale) {
67+
this(Integer.valueOf(ngramSizeLimit), surfaceFormListFile, locale);
6768
}
6869

6970
public List<String> getCacheFiles() {

0 commit comments

Comments
 (0)