18
18
* limitations under the License.
19
19
*/
20
20
21
- import opennlp .tools .tokenize .SimpleTokenizer ;
22
- import opennlp .tools .tokenize .Tokenizer ;
23
21
import opennlp .tools .util .Span ;
24
22
import org .apache .pig .EvalFunc ;
25
23
import org .apache .pig .FuncSpec ;
26
24
import org .apache .pig .data .*;
27
25
import org .apache .pig .impl .logicalLayer .FrontendException ;
28
26
import org .apache .pig .impl .logicalLayer .schema .Schema ;
27
+ import org .dbpedia .spotlight .db .model .RawTokenizer ;
28
+ import org .dbpedia .spotlight .db .model .Stemmer ;
29
+ import org .dbpedia .spotlight .db .tokenize .LanguageIndependentRawTokenizer ;
29
30
30
31
import java .io .BufferedReader ;
31
32
import java .io .File ;
32
33
import java .io .FileReader ;
33
34
import java .io .IOException ;
34
- import java .util .ArrayList ;
35
- import java .util .HashSet ;
36
- import java .util .List ;
37
- import java .util .Set ;
35
+ import java .util .*;
38
36
39
37
40
38
/**
@@ -47,7 +45,7 @@ public class RestrictedNGramGenerator extends EvalFunc<DataBag> {
47
45
48
46
private int ngramSizeLimit ;
49
47
50
- private final Tokenizer tokenizer = SimpleTokenizer . INSTANCE ;
48
+ private RawTokenizer tokenizer ;
51
49
52
50
private final BagFactory bagFactory = DefaultBagFactory .getInstance ();
53
51
private final TupleFactory tupleFactory = TupleFactory .getInstance ();
@@ -56,14 +54,17 @@ public class RestrictedNGramGenerator extends EvalFunc<DataBag> {
56
54
private String surfaceFormListFile ;
57
55
58
56
59
- public RestrictedNGramGenerator (int ngramSizeLimit , String surfaceFormListFile ) {
57
+ public RestrictedNGramGenerator (int ngramSizeLimit , String surfaceFormListFile , String locale ) {
60
58
this .ngramSizeLimit = ngramSizeLimit ;
61
59
this .surfaceFormListFile = surfaceFormListFile ;
60
+
61
+ String [] localeA = locale .split ("_" );
62
+ this .tokenizer = new LanguageIndependentRawTokenizer (new Locale (localeA [0 ], localeA [1 ]), new Stemmer ());
62
63
}
63
64
64
65
// Pig versions < 0.9 seem to only pass strings in constructor
/**
 * String-argument variant of the constructor: parses {@code ngramSizeLimit} as a decimal
 * integer and delegates to the {@code (int, String, String)} constructor.
 *
 * @param ngramSizeLimit      n-gram size limit in textual form
 * @param surfaceFormListFile passed through unchanged
 * @param locale              passed through unchanged
 * @throws NumberFormatException if {@code ngramSizeLimit} is not a parsable integer
 */
public RestrictedNGramGenerator(String ngramSizeLimit, String surfaceFormListFile, String locale) {
    this(Integer.parseInt(ngramSizeLimit), surfaceFormListFile, locale);
}
68
69
69
70
public List <String > getCacheFiles () {
0 commit comments