Skip to content

Commit ae1b967

Browse files
J38 Stanford NLP
authored and
Stanford NLP
committed
Merge branch 'master' of origin
1 parent c816856 commit ae1b967

File tree

5 files changed

+74
-72
lines changed

5 files changed

+74
-72
lines changed

itest/src/edu/stanford/nlp/pipeline/TokenizerBenchmarkTestCase.java

+64-64
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ public class TokenizerBenchmarkTestCase extends TestCase {
2424
public StanfordCoreNLP pipeline;
2525

2626
/** nested class for holding test example info such as text and gold tokens **/
27-
static class TestExample {
27+
class TestExample {
2828

2929
private String sentenceID;
3030
private String sentenceText;
@@ -53,6 +53,7 @@ public TestExample(List<String> conllLines) {
5353
String[] mwtRange = conllLine.split("\t")[0].split("-");
5454
currMWT = 1 + Integer.parseInt(mwtRange[1]) - Integer.parseInt(mwtRange[0]);
5555
charEnd = charBegin + conllLine.split("\t")[1].length();
56+
continue;
5657
} else {
5758
String tokenText = conllLine.split("\t")[1];
5859
if (currMWT == 0) {
@@ -69,7 +70,7 @@ public TestExample(List<String> conllLines) {
6970
}
7071

7172
/** helper method to build a CoreLabel from String and offsets **/
72-
public static CoreLabel buildCoreLabel(String word, int begin, int end) {
73+
public CoreLabel buildCoreLabel(String word, int begin, int end) {
7374
CoreLabel token = new CoreLabel();
7475
token.setWord(word);
7576
token.setBeginPosition(begin);
@@ -106,70 +107,69 @@ public String systemTokensString() {
106107

107108
/** tokenize text with pipeline, populate systemTokensList **/
108109
public void tokenizeSentenceText() {
109-
// todo [cdm 2019]: Restore all these tests that I deleted so that things build
110-
// systemTokensList = new ArrayList<CoreLabel>();
111-
// CoreLabel currMWTToken = null;
112-
// CoreDocument exampleTokensDoc = new CoreDocument(pipeline.process(sentenceText));
113-
// for (CoreLabel tok : exampleTokensDoc.tokens()) {
114-
// if (containedByMultiWordToken(tok)) {
115-
// if (currMWTToken == null || !isMultiWordTokenOf(tok, currMWTToken)) {
116-
// int charBegin =
117-
// systemTokensList.size() == 0 ?
118-
// 0 : systemTokensList.get(systemTokensList.size()-1).endPosition();
119-
// currMWTToken = placeholderMWTToken(tok, charBegin);
120-
// }
121-
// systemTokensList.add(buildCoreLabel(tok.word(), currMWTToken.beginPosition(), currMWTToken.endPosition()));
122-
// } else {
123-
// currMWTToken = null;
124-
// int charBegin =
125-
// systemTokensList.size() == 0 ?
126-
// 0 : systemTokensList.get(systemTokensList.size()-1).endPosition();
127-
// systemTokensList.add(buildCoreLabel(tok.word(), charBegin, charBegin + tok.word().length()));
128-
// }
129-
// }
110+
systemTokensList = new ArrayList<CoreLabel>();
111+
CoreLabel currMWTToken = null;
112+
CoreDocument exampleTokensDoc = new CoreDocument(pipeline.process(sentenceText));
113+
for (CoreLabel tok : exampleTokensDoc.tokens()) {
114+
if (containedByMultiWordToken(tok)) {
115+
if (currMWTToken == null || !isMultiWordTokenOf(tok, currMWTToken)) {
116+
int charBegin =
117+
systemTokensList.size() == 0 ?
118+
0 : systemTokensList.get(systemTokensList.size()-1).endPosition();
119+
currMWTToken = placeholderMWTToken(tok, charBegin);
120+
}
121+
systemTokensList.add(buildCoreLabel(tok.word(), currMWTToken.beginPosition(), currMWTToken.endPosition()));
122+
} else {
123+
currMWTToken = null;
124+
int charBegin =
125+
systemTokensList.size() == 0 ?
126+
0 : systemTokensList.get(systemTokensList.size()-1).endPosition();
127+
systemTokensList.add(buildCoreLabel(tok.word(), charBegin, charBegin + tok.word().length()));
128+
}
129+
}
130+
}
131+
132+
/** create a placeholder CoreLabel with the info of the original mwt token **/
133+
public CoreLabel placeholderMWTToken(CoreLabel containedToken, int beginPosition) {
134+
CoreLabel placeholderToken = new CoreLabel();
135+
placeholderToken.setWord(containedToken.get(CoreAnnotations.MWTTokenTextAnnotation.class));
136+
placeholderToken.setBeginPosition(beginPosition);
137+
placeholderToken.setEndPosition(beginPosition + placeholderToken.word().length());
138+
placeholderToken.set(CoreAnnotations.MWTTokenCharacterOffsetBeginAnnotation.class,
139+
containedToken.get(CoreAnnotations.MWTTokenCharacterOffsetBeginAnnotation.class));
140+
placeholderToken.set(CoreAnnotations.MWTTokenCharacterOffsetEndAnnotation.class,
141+
containedToken.get(CoreAnnotations.MWTTokenCharacterOffsetEndAnnotation.class));
142+
placeholderToken.setIsMWT(true);
143+
return placeholderToken;
130144
}
131145

132-
// /** create a placeholder CoreLabel with the info of the original mwt token **/
133-
// public CoreLabel placeholderMWTToken(CoreLabel containedToken, int beginPosition) {
134-
// CoreLabel placeholderToken = new CoreLabel();
135-
// placeholderToken.setWord(containedToken.get(CoreAnnotations.MWTTokenTextAnnotation.class));
136-
// placeholderToken.setBeginPosition(beginPosition);
137-
// placeholderToken.setEndPosition(beginPosition + placeholderToken.word().length());
138-
// placeholderToken.set(CoreAnnotations.MWTTokenCharacterOffsetBeginAnnotation.class,
139-
// containedToken.get(CoreAnnotations.MWTTokenCharacterOffsetBeginAnnotation.class));
140-
// placeholderToken.set(CoreAnnotations.MWTTokenCharacterOffsetEndAnnotation.class,
141-
// containedToken.get(CoreAnnotations.MWTTokenCharacterOffsetEndAnnotation.class));
142-
// placeholderToken.setIsMWT(true);
143-
// return placeholderToken;
144-
// }
145-
//
146-
// /** check if a token is split off from a multi word token **/
147-
// public boolean containedByMultiWordToken(CoreLabel tok) {
148-
// if (tok.get(CoreAnnotations.MWTTokenTextAnnotation.class) != null) {
149-
// return true;
150-
// } else {
151-
// return false;
152-
// }
153-
// }
154-
//
155-
// /** check if a token is a split off token of another **/
156-
// public boolean isMultiWordTokenOf(CoreLabel splitToken, CoreLabel multiWordPlaceholderToken) {
157-
// int mwtPlaceholderBegin = multiWordPlaceholderToken.get(
158-
// CoreAnnotations.MWTTokenCharacterOffsetBeginAnnotation.class
159-
// );
160-
// int mwtPlaceholderEnd = multiWordPlaceholderToken.get(
161-
// CoreAnnotations.MWTTokenCharacterOffsetEndAnnotation.class
162-
// );
163-
// if (splitToken.get(CoreAnnotations.MWTTokenTextAnnotation.class).equals(multiWordPlaceholderToken.word())
164-
// && mwtPlaceholderBegin <= splitToken.beginPosition()
165-
// && mwtPlaceholderBegin <= splitToken.endPosition()
166-
// && mwtPlaceholderEnd >= splitToken.beginPosition()
167-
// && mwtPlaceholderEnd >= splitToken.endPosition()) {
168-
// return true;
169-
// } else {
170-
// return false;
171-
// }
172-
// }
146+
/** check if a token is split off from a multi word token **/
147+
public boolean containedByMultiWordToken(CoreLabel tok) {
148+
if (tok.get(CoreAnnotations.MWTTokenTextAnnotation.class) != null) {
149+
return true;
150+
} else {
151+
return false;
152+
}
153+
}
154+
155+
/** check if a token is a split off token of another **/
156+
public boolean isMultiWordTokenOf(CoreLabel splitToken, CoreLabel multiWordPlaceholderToken) {
157+
int mwtPlaceholderBegin = multiWordPlaceholderToken.get(
158+
CoreAnnotations.MWTTokenCharacterOffsetBeginAnnotation.class
159+
);
160+
int mwtPlaceholderEnd = multiWordPlaceholderToken.get(
161+
CoreAnnotations.MWTTokenCharacterOffsetEndAnnotation.class
162+
);
163+
if (splitToken.get(CoreAnnotations.MWTTokenTextAnnotation.class).equals(multiWordPlaceholderToken.word())
164+
&& mwtPlaceholderBegin <= splitToken.beginPosition()
165+
&& mwtPlaceholderBegin <= splitToken.endPosition()
166+
&& mwtPlaceholderEnd >= splitToken.beginPosition()
167+
&& mwtPlaceholderEnd >= splitToken.endPosition()) {
168+
return true;
169+
} else {
170+
return false;
171+
}
172+
}
173173

174174
/** return TP, FP, FN stats for this example **/
175175
public ClassicCounter<String> f1Stats() {

itest/src/edu/stanford/nlp/pipeline/TokenizerGermanBenchmarkITest.java

+1-2
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,7 @@ public void setUp() {
1717

1818
public void testOnDev() {
1919
goldFilePath = "/u/nlp/data/stanford-corenlp-testing/data/tokenize/de_gsd-ud-dev.conllu";
20-
runTest("dev", "de", 0.5);
20+
runTest("dev", "de", 0.95);
2121
}
2222

23-
2423
}

itest/src/edu/stanford/nlp/pipeline/TokenizerSpanishBenchmarkITest.java

+4-1
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,15 @@ public void setUp() {
1010
Properties props = new Properties();
1111
props.put("annotators", "tokenize");
1212
props.put("tokenize.language", "es");
13+
props.put("tokenize.options", "splitAll=false");
14+
props.put("tokenize.mwt.mappingFile",
15+
"/u/nlp/data/stanford-corenlp-testing/resources/es_mwt.tsv");
1316
pipeline = new StanfordCoreNLP(props);
1417
}
1518

1619
public void testOnDev() {
1720
goldFilePath = "/u/nlp/data/stanford-corenlp-testing/data/tokenize/es_ancora-ud-dev.conllu";
18-
runTest("dev", "es", 0.5);
21+
runTest("dev", "es", 0.994);
1922
}
2023

2124
}

src/edu/stanford/nlp/pipeline/MWTAnnotator.java

+4-4
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@ public class MWTAnnotator {
1616

1717
public MWTAnnotator(String name, Properties props) {
1818
String prefix = (name != null && !name.equals("")) ? name+".mwt." : "mwt.";
19-
System.out.println(prefix+"mappingFile");
20-
System.out.println(props.getProperty(prefix+"mappingFile"));
19+
//System.out.println(prefix+"mappingFile");
20+
//System.out.println(props.getProperty(prefix+"mappingFile"));
2121
loadMultiWordTokenMappings(props.getProperty(prefix+"mappingFile"));
2222
}
2323

@@ -39,11 +39,11 @@ public void annotate(Annotation annotation) {
3939
for (CoreLabel token : annotation.get(CoreAnnotations.TokensAnnotation.class)) {
4040
// check if token text is in the mapping
4141
if (multiWordTokenMapping.containsKey(token.word())) {
42-
System.err.println("found match: "+token.word());
42+
//System.err.println("found match: "+token.word());
4343
int numWordsForToken = multiWordTokenMapping.get(token.word()).size();
4444
List<CoreLabel> newTokens = new ArrayList<CoreLabel>();
4545
for (String word : multiWordTokenMapping.get(token.word())) {
46-
System.err.println("splitting into: "+word);
46+
//System.err.println("splitting into: "+word);
4747
CoreLabel newToken = new CoreLabel();
4848
newToken.setWord(word);
4949
newToken.setValue(word);

src/edu/stanford/nlp/pipeline/TokenizerAnnotator.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -223,7 +223,7 @@ else if (LanguageInfo.getLanguageFromString(
223223
}
224224
// set up an MWTAnnotator if a mapping file is provided
225225
if (!props.getProperty("tokenize.mwt.mappingFile", "").equals("")) {
226-
System.out.println("Setting up MWTAnnotator!!");
226+
//System.out.println("Setting up MWTAnnotator!!");
227227
splitMWTTokens = true;
228228
mwtAnnotator = new MWTAnnotator("tokenize", props);
229229
}

0 commit comments

Comments
 (0)