@@ -24,7 +24,7 @@ public class TokenizerBenchmarkTestCase extends TestCase {
24
24
public StanfordCoreNLP pipeline ;
25
25
26
26
/** nested class for holding test example info such as text and gold tokens **/
27
- static class TestExample {
27
+ class TestExample {
28
28
29
29
private String sentenceID ;
30
30
private String sentenceText ;
@@ -53,6 +53,7 @@ public TestExample(List<String> conllLines) {
53
53
String [] mwtRange = conllLine .split ("\t " )[0 ].split ("-" );
54
54
currMWT = 1 + Integer .parseInt (mwtRange [1 ]) - Integer .parseInt (mwtRange [0 ]);
55
55
charEnd = charBegin + conllLine .split ("\t " )[1 ].length ();
56
+ continue ;
56
57
} else {
57
58
String tokenText = conllLine .split ("\t " )[1 ];
58
59
if (currMWT == 0 ) {
@@ -69,7 +70,7 @@ public TestExample(List<String> conllLines) {
69
70
}
70
71
71
72
/** helper method to build a CoreLabel from String and offsets **/
72
- public static CoreLabel buildCoreLabel (String word , int begin , int end ) {
73
+ public CoreLabel buildCoreLabel (String word , int begin , int end ) {
73
74
CoreLabel token = new CoreLabel ();
74
75
token .setWord (word );
75
76
token .setBeginPosition (begin );
@@ -106,70 +107,69 @@ public String systemTokensString() {
106
107
107
108
/** tokenize text with pipeline, populate systemTokensList **/
108
109
public void tokenizeSentenceText () {
109
- // todo [cdm 2019]: Restore all these tests that I deleted so that things build
110
- // systemTokensList = new ArrayList<CoreLabel>();
111
- // CoreLabel currMWTToken = null;
112
- // CoreDocument exampleTokensDoc = new CoreDocument(pipeline.process(sentenceText));
113
- // for (CoreLabel tok : exampleTokensDoc.tokens()) {
114
- // if (containedByMultiWordToken(tok)) {
115
- // if (currMWTToken == null || !isMultiWordTokenOf(tok, currMWTToken)) {
116
- // int charBegin =
117
- // systemTokensList.size() == 0 ?
118
- // 0 : systemTokensList.get(systemTokensList.size()-1).endPosition();
119
- // currMWTToken = placeholderMWTToken(tok, charBegin);
120
- // }
121
- // systemTokensList.add(buildCoreLabel(tok.word(), currMWTToken.beginPosition(), currMWTToken.endPosition()));
122
- // } else {
123
- // currMWTToken = null;
124
- // int charBegin =
125
- // systemTokensList.size() == 0 ?
126
- // 0 : systemTokensList.get(systemTokensList.size()-1).endPosition();
127
- // systemTokensList.add(buildCoreLabel(tok.word(), charBegin, charBegin + tok.word().length()));
128
- // }
129
- // }
110
+ systemTokensList = new ArrayList <CoreLabel >();
111
+ CoreLabel currMWTToken = null ;
112
+ CoreDocument exampleTokensDoc = new CoreDocument (pipeline .process (sentenceText ));
113
+ for (CoreLabel tok : exampleTokensDoc .tokens ()) {
114
+ if (containedByMultiWordToken (tok )) {
115
+ if (currMWTToken == null || !isMultiWordTokenOf (tok , currMWTToken )) {
116
+ int charBegin =
117
+ systemTokensList .size () == 0 ?
118
+ 0 : systemTokensList .get (systemTokensList .size ()-1 ).endPosition ();
119
+ currMWTToken = placeholderMWTToken (tok , charBegin );
120
+ }
121
+ systemTokensList .add (buildCoreLabel (tok .word (), currMWTToken .beginPosition (), currMWTToken .endPosition ()));
122
+ } else {
123
+ currMWTToken = null ;
124
+ int charBegin =
125
+ systemTokensList .size () == 0 ?
126
+ 0 : systemTokensList .get (systemTokensList .size ()-1 ).endPosition ();
127
+ systemTokensList .add (buildCoreLabel (tok .word (), charBegin , charBegin + tok .word ().length ()));
128
+ }
129
+ }
130
+ }
131
+
132
+ /** create a placeholder CoreLabel with the info of the original mwt token **/
133
+ public CoreLabel placeholderMWTToken (CoreLabel containedToken , int beginPosition ) {
134
+ CoreLabel placeholderToken = new CoreLabel ();
135
+ placeholderToken .setWord (containedToken .get (CoreAnnotations .MWTTokenTextAnnotation .class ));
136
+ placeholderToken .setBeginPosition (beginPosition );
137
+ placeholderToken .setEndPosition (beginPosition + placeholderToken .word ().length ());
138
+ placeholderToken .set (CoreAnnotations .MWTTokenCharacterOffsetBeginAnnotation .class ,
139
+ containedToken .get (CoreAnnotations .MWTTokenCharacterOffsetBeginAnnotation .class ));
140
+ placeholderToken .set (CoreAnnotations .MWTTokenCharacterOffsetEndAnnotation .class ,
141
+ containedToken .get (CoreAnnotations .MWTTokenCharacterOffsetEndAnnotation .class ));
142
+ placeholderToken .setIsMWT (true );
143
+ return placeholderToken ;
130
144
}
131
145
132
- // /** create a placeholder CoreLabel with the info of the original mwt token **/
133
- // public CoreLabel placeholderMWTToken(CoreLabel containedToken, int beginPosition) {
134
- // CoreLabel placeholderToken = new CoreLabel();
135
- // placeholderToken.setWord(containedToken.get(CoreAnnotations.MWTTokenTextAnnotation.class));
136
- // placeholderToken.setBeginPosition(beginPosition);
137
- // placeholderToken.setEndPosition(beginPosition + placeholderToken.word().length());
138
- // placeholderToken.set(CoreAnnotations.MWTTokenCharacterOffsetBeginAnnotation.class,
139
- // containedToken.get(CoreAnnotations.MWTTokenCharacterOffsetBeginAnnotation.class));
140
- // placeholderToken.set(CoreAnnotations.MWTTokenCharacterOffsetEndAnnotation.class,
141
- // containedToken.get(CoreAnnotations.MWTTokenCharacterOffsetEndAnnotation.class));
142
- // placeholderToken.setIsMWT(true);
143
- // return placeholderToken;
144
- // }
145
- //
146
- // /** check if a token is split off from a multi word token **/
147
- // public boolean containedByMultiWordToken(CoreLabel tok) {
148
- // if (tok.get(CoreAnnotations.MWTTokenTextAnnotation.class) != null) {
149
- // return true;
150
- // } else {
151
- // return false;
152
- // }
153
- // }
154
- //
155
- // /** check if a token is a split off token of another **/
156
- // public boolean isMultiWordTokenOf(CoreLabel splitToken, CoreLabel multiWordPlaceholderToken) {
157
- // int mwtPlaceholderBegin = multiWordPlaceholderToken.get(
158
- // CoreAnnotations.MWTTokenCharacterOffsetBeginAnnotation.class
159
- // );
160
- // int mwtPlaceholderEnd = multiWordPlaceholderToken.get(
161
- // CoreAnnotations.MWTTokenCharacterOffsetEndAnnotation.class
162
- // );
163
- // if (splitToken.get(CoreAnnotations.MWTTokenTextAnnotation.class).equals(multiWordPlaceholderToken.word())
164
- // && mwtPlaceholderBegin <= splitToken.beginPosition()
165
- // && mwtPlaceholderBegin <= splitToken.endPosition()
166
- // && mwtPlaceholderEnd >= splitToken.beginPosition()
167
- // && mwtPlaceholderEnd >= splitToken.endPosition()) {
168
- // return true;
169
- // } else {
170
- // return false;
171
- // }
172
- // }
146
+ /** check if a token is split off from a multi word token **/
147
+ public boolean containedByMultiWordToken (CoreLabel tok ) {
148
+ if (tok .get (CoreAnnotations .MWTTokenTextAnnotation .class ) != null ) {
149
+ return true ;
150
+ } else {
151
+ return false ;
152
+ }
153
+ }
154
+
155
+ /** check if a token is a split off token of another **/
156
+ public boolean isMultiWordTokenOf (CoreLabel splitToken , CoreLabel multiWordPlaceholderToken ) {
157
+ int mwtPlaceholderBegin = multiWordPlaceholderToken .get (
158
+ CoreAnnotations .MWTTokenCharacterOffsetBeginAnnotation .class
159
+ );
160
+ int mwtPlaceholderEnd = multiWordPlaceholderToken .get (
161
+ CoreAnnotations .MWTTokenCharacterOffsetEndAnnotation .class
162
+ );
163
+ if (splitToken .get (CoreAnnotations .MWTTokenTextAnnotation .class ).equals (multiWordPlaceholderToken .word ())
164
+ && mwtPlaceholderBegin <= splitToken .beginPosition ()
165
+ && mwtPlaceholderBegin <= splitToken .endPosition ()
166
+ && mwtPlaceholderEnd >= splitToken .beginPosition ()
167
+ && mwtPlaceholderEnd >= splitToken .endPosition ()) {
168
+ return true ;
169
+ } else {
170
+ return false ;
171
+ }
172
+ }
173
173
174
174
/** return TP, FP, FN stats for this example **/
175
175
public ClassicCounter <String > f1Stats () {
0 commit comments