Merge branch 'master' of origin

J38 · Stanford NLP · commit ae1b967dc314 · 2019-05-23T16:32:09.000-07:00
diff --git a/itest/src/edu/stanford/nlp/pipeline/TokenizerBenchmarkTestCase.java b/itest/src/edu/stanford/nlp/pipeline/TokenizerBenchmarkTestCase.java
@@ -24,7 +24,7 @@ public class TokenizerBenchmarkTestCase extends TestCase {
     public StanfordCoreNLP pipeline;
 
     /** nested class for holding test example info such as text and gold tokens **/
-    static class TestExample {
+    class TestExample {
 
         private String sentenceID;
         private String sentenceText;
@@ -53,6 +53,7 @@ public TestExample(List<String> conllLines) {
                     String[] mwtRange = conllLine.split("\t")[0].split("-");
                     currMWT = 1 + Integer.parseInt(mwtRange[1]) - Integer.parseInt(mwtRange[0]);
                     charEnd = charBegin + conllLine.split("\t")[1].length();
+                    continue;
                 } else {
                     String tokenText = conllLine.split("\t")[1];
                     if (currMWT == 0) {
@@ -69,7 +70,7 @@ public TestExample(List<String> conllLines) {
         }
 
         /** helper method to build a CoreLabel from String and offsets **/
-        public static CoreLabel buildCoreLabel(String word, int begin, int end) {
+        public CoreLabel buildCoreLabel(String word, int begin, int end) {
             CoreLabel token = new CoreLabel();
             token.setWord(word);
             token.setBeginPosition(begin);
@@ -106,70 +107,69 @@ public String systemTokensString() {
 
         /** tokenize text with pipeline, populate systemTokensList **/
         public void tokenizeSentenceText() {
-// todo [cdm 2019]: Restore all these tests that I deleted so that things build
-//            systemTokensList = new ArrayList<CoreLabel>();
-//            CoreLabel currMWTToken = null;
-//            CoreDocument exampleTokensDoc = new CoreDocument(pipeline.process(sentenceText));
-//            for (CoreLabel tok : exampleTokensDoc.tokens()) {
-//                if (containedByMultiWordToken(tok)) {
-//                    if (currMWTToken == null || !isMultiWordTokenOf(tok, currMWTToken)) {
-//                        int charBegin =
-//                                systemTokensList.size() == 0 ?
-//                                        0 : systemTokensList.get(systemTokensList.size()-1).endPosition();
-//                        currMWTToken = placeholderMWTToken(tok, charBegin);
-//                    }
-//                    systemTokensList.add(buildCoreLabel(tok.word(), currMWTToken.beginPosition(), currMWTToken.endPosition()));
-//                } else {
-//                    currMWTToken = null;
-//                    int charBegin =
-//                            systemTokensList.size() == 0 ?
-//                                    0 : systemTokensList.get(systemTokensList.size()-1).endPosition();
-//                    systemTokensList.add(buildCoreLabel(tok.word(), charBegin, charBegin + tok.word().length()));
-//                }
-//            }
+            systemTokensList = new ArrayList<CoreLabel>();
+            CoreLabel currMWTToken = null;
+            CoreDocument exampleTokensDoc = new CoreDocument(pipeline.process(sentenceText));
+            for (CoreLabel tok : exampleTokensDoc.tokens()) {
+                if (containedByMultiWordToken(tok)) {
+                    if (currMWTToken == null || !isMultiWordTokenOf(tok, currMWTToken)) {
+                        int charBegin =
+                                systemTokensList.size() == 0 ?
+                                        0 : systemTokensList.get(systemTokensList.size()-1).endPosition();
+                        currMWTToken = placeholderMWTToken(tok, charBegin);
+                    }
+                    systemTokensList.add(buildCoreLabel(tok.word(), currMWTToken.beginPosition(), currMWTToken.endPosition()));
+                } else {
+                    currMWTToken = null;
+                    int charBegin =
+                            systemTokensList.size() == 0 ?
+                                    0 : systemTokensList.get(systemTokensList.size()-1).endPosition();
+                    systemTokensList.add(buildCoreLabel(tok.word(), charBegin, charBegin + tok.word().length()));
+                }
+            }
+        }
+
+        /** create a placeholder CoreLabel with the info of the original mwt token **/
+        public CoreLabel placeholderMWTToken(CoreLabel containedToken, int beginPosition) {
+            CoreLabel placeholderToken = new CoreLabel();
+            placeholderToken.setWord(containedToken.get(CoreAnnotations.MWTTokenTextAnnotation.class));
+            placeholderToken.setBeginPosition(beginPosition);
+            placeholderToken.setEndPosition(beginPosition + placeholderToken.word().length());
+            placeholderToken.set(CoreAnnotations.MWTTokenCharacterOffsetBeginAnnotation.class,
+                    containedToken.get(CoreAnnotations.MWTTokenCharacterOffsetBeginAnnotation.class));
+            placeholderToken.set(CoreAnnotations.MWTTokenCharacterOffsetEndAnnotation.class,
+                    containedToken.get(CoreAnnotations.MWTTokenCharacterOffsetEndAnnotation.class));
+            placeholderToken.setIsMWT(true);
+            return placeholderToken;
         }
 
-//        /** create a placeholder CoreLabel with the info of the original mwt token **/
-//        public CoreLabel placeholderMWTToken(CoreLabel containedToken, int beginPosition) {
-//            CoreLabel placeholderToken = new CoreLabel();
-//            placeholderToken.setWord(containedToken.get(CoreAnnotations.MWTTokenTextAnnotation.class));
-//            placeholderToken.setBeginPosition(beginPosition);
-//            placeholderToken.setEndPosition(beginPosition + placeholderToken.word().length());
-//            placeholderToken.set(CoreAnnotations.MWTTokenCharacterOffsetBeginAnnotation.class,
-//                    containedToken.get(CoreAnnotations.MWTTokenCharacterOffsetBeginAnnotation.class));
-//            placeholderToken.set(CoreAnnotations.MWTTokenCharacterOffsetEndAnnotation.class,
-//                    containedToken.get(CoreAnnotations.MWTTokenCharacterOffsetEndAnnotation.class));
-//            placeholderToken.setIsMWT(true);
-//            return placeholderToken;
-//        }
-//
-//        /** check if a token is split off from a multi word token **/
-//        public boolean containedByMultiWordToken(CoreLabel tok) {
-//            if (tok.get(CoreAnnotations.MWTTokenTextAnnotation.class) != null) {
-//                return true;
-//            } else {
-//                return false;
-//            }
-//        }
-//
-//        /** check if a token is a split off token of another **/
-//        public boolean isMultiWordTokenOf(CoreLabel splitToken, CoreLabel multiWordPlaceholderToken) {
-//            int mwtPlaceholderBegin = multiWordPlaceholderToken.get(
-//                    CoreAnnotations.MWTTokenCharacterOffsetBeginAnnotation.class
-//            );
-//            int mwtPlaceholderEnd = multiWordPlaceholderToken.get(
-//                    CoreAnnotations.MWTTokenCharacterOffsetEndAnnotation.class
-//            );
-//            if (splitToken.get(CoreAnnotations.MWTTokenTextAnnotation.class).equals(multiWordPlaceholderToken.word())
-//                    && mwtPlaceholderBegin <= splitToken.beginPosition()
-//                    && mwtPlaceholderBegin <= splitToken.endPosition()
-//                    && mwtPlaceholderEnd >= splitToken.beginPosition()
-//                    && mwtPlaceholderEnd >= splitToken.endPosition()) {
-//                return true;
-//            } else {
-//                return false;
-//            }
-//        }
+        /** check if a token is split off from a multi word token **/
+        public boolean containedByMultiWordToken(CoreLabel tok) {
+            if (tok.get(CoreAnnotations.MWTTokenTextAnnotation.class) != null) {
+                return true;
+            } else {
+                return false;
+            }
+        }
+
+        /** check if a token is a split off token of another **/
+        public boolean isMultiWordTokenOf(CoreLabel splitToken, CoreLabel multiWordPlaceholderToken) {
+            int mwtPlaceholderBegin = multiWordPlaceholderToken.get(
+                    CoreAnnotations.MWTTokenCharacterOffsetBeginAnnotation.class
+            );
+            int mwtPlaceholderEnd = multiWordPlaceholderToken.get(
+                    CoreAnnotations.MWTTokenCharacterOffsetEndAnnotation.class
+            );
+            if (splitToken.get(CoreAnnotations.MWTTokenTextAnnotation.class).equals(multiWordPlaceholderToken.word())
+                    && mwtPlaceholderBegin <= splitToken.beginPosition()
+                    && mwtPlaceholderBegin <= splitToken.endPosition()
+                    && mwtPlaceholderEnd >= splitToken.beginPosition()
+                    && mwtPlaceholderEnd >= splitToken.endPosition()) {
+                return true;
+            } else {
+                return false;
+            }
+        }
 
         /** return TP, FP, FN stats for this example **/
         public ClassicCounter<String> f1Stats() {
diff --git a/itest/src/edu/stanford/nlp/pipeline/TokenizerGermanBenchmarkITest.java b/itest/src/edu/stanford/nlp/pipeline/TokenizerGermanBenchmarkITest.java
@@ -17,8 +17,7 @@ public void setUp() {
 
     public void testOnDev() {
         goldFilePath = "/u/nlp/data/stanford-corenlp-testing/data/tokenize/de_gsd-ud-dev.conllu";
-        runTest("dev", "de", 0.5);
+        runTest("dev", "de", 0.95);
     }
 
-
 }
diff --git a/itest/src/edu/stanford/nlp/pipeline/TokenizerSpanishBenchmarkITest.java b/itest/src/edu/stanford/nlp/pipeline/TokenizerSpanishBenchmarkITest.java
@@ -10,12 +10,15 @@ public void setUp() {
         Properties props = new Properties();
         props.put("annotators", "tokenize");
         props.put("tokenize.language", "es");
+        props.put("tokenize.options", "splitAll=false");
+        props.put("tokenize.mwt.mappingFile", 
+                  "/u/nlp/data/stanford-corenlp-testing/resources/es_mwt.tsv");
         pipeline = new StanfordCoreNLP(props);
     }
 
     public void testOnDev() {
         goldFilePath = "/u/nlp/data/stanford-corenlp-testing/data/tokenize/es_ancora-ud-dev.conllu";
-        runTest("dev", "es", 0.5);
+        runTest("dev", "es", 0.994);
     }
 
 }
diff --git a/src/edu/stanford/nlp/pipeline/MWTAnnotator.java b/src/edu/stanford/nlp/pipeline/MWTAnnotator.java
@@ -16,8 +16,8 @@ public class MWTAnnotator {
 
     public MWTAnnotator(String name, Properties props) {
         String prefix = (name != null && !name.equals("")) ? name+".mwt." : "mwt.";
-        System.out.println(prefix+"mappingFile");
-        System.out.println(props.getProperty(prefix+"mappingFile"));
+        //System.out.println(prefix+"mappingFile");
+        //System.out.println(props.getProperty(prefix+"mappingFile"));
         loadMultiWordTokenMappings(props.getProperty(prefix+"mappingFile"));
     }
 
@@ -39,11 +39,11 @@ public void annotate(Annotation annotation) {
         for (CoreLabel token : annotation.get(CoreAnnotations.TokensAnnotation.class)) {
             // check if token text is in the mapping
             if (multiWordTokenMapping.containsKey(token.word())) {
-                System.err.println("found match: "+token.word());
+                //System.err.println("found match: "+token.word());
                 int numWordsForToken = multiWordTokenMapping.get(token.word()).size();
                 List<CoreLabel> newTokens = new ArrayList<CoreLabel>();
                 for (String word : multiWordTokenMapping.get(token.word())) {
-                    System.err.println("splitting into: "+word);
+                    //System.err.println("splitting into: "+word);
                     CoreLabel newToken = new CoreLabel();
                     newToken.setWord(word);
                     newToken.setValue(word);
diff --git a/src/edu/stanford/nlp/pipeline/TokenizerAnnotator.java b/src/edu/stanford/nlp/pipeline/TokenizerAnnotator.java
@@ -223,7 +223,7 @@ else if (LanguageInfo.getLanguageFromString(
     }
     // set up an MWTAnnotator if a mapping file is provided
     if (!props.getProperty("tokenize.mwt.mappingFile", "").equals("")) {
-      System.out.println("Setting up MWTAnnotator!!");
+      //System.out.println("Setting up MWTAnnotator!!");
       splitMWTTokens = true;
       mwtAnnotator = new MWTAnnotator("tokenize", props);
     }

Original file line number	Diff line number	Diff line change
`@@ -17,8 +17,7 @@ public void setUp() {`
`17`	`17`
`18`	`18`	`public void testOnDev() {`
`19`	`19`	`goldFilePath = "/u/nlp/data/stanford-corenlp-testing/data/tokenize/de_gsd-ud-dev.conllu";`
`20`		`- runTest("dev", "de", 0.5);`
	`20`	`+ runTest("dev", "de", 0.95);`
`21`	`21`	`}`
`22`	`22`
`23`		`-`
`24`	`23`	`}`
Original file line number	Diff line number	Diff line change
`@@ -10,12 +10,15 @@ public void setUp() {`
`10`	`10`	`Properties props = new Properties();`
`11`	`11`	`props.put("annotators", "tokenize");`
`12`	`12`	`props.put("tokenize.language", "es");`
	`13`	`+ props.put("tokenize.options", "splitAll=false");`
	`14`	`+ props.put("tokenize.mwt.mappingFile",`
	`15`	`+ "/u/nlp/data/stanford-corenlp-testing/resources/es_mwt.tsv");`
`13`	`16`	`pipeline = new StanfordCoreNLP(props);`
`14`	`17`	`}`
`15`	`18`
`16`	`19`	`public void testOnDev() {`
`17`	`20`	`goldFilePath = "/u/nlp/data/stanford-corenlp-testing/data/tokenize/es_ancora-ud-dev.conllu";`
`18`		`- runTest("dev", "es", 0.5);`
	`21`	`+ runTest("dev", "es", 0.994);`
`19`	`22`	`}`
`20`	`23`
`21`	`24`	`}`
Original file line number	Diff line number	Diff line change
`@@ -223,7 +223,7 @@ else if (LanguageInfo.getLanguageFromString(`
`223`	`223`	`}`
`224`	`224`	`// set up an MWTAnnotator if a mapping file is provided`
`225`	`225`	`if (!props.getProperty("tokenize.mwt.mappingFile", "").equals("")) {`
`226`		`- System.out.println("Setting up MWTAnnotator!!");`
	`226`	`+ //System.out.println("Setting up MWTAnnotator!!");`
`227`	`227`	`splitMWTTokens = true;`
`228`	`228`	`mwtAnnotator = new MWTAnnotator("tokenize", props);`
`229`	`229`	`}`