first-stage and python: Safer maximum sentence length checks

Turns out max_sentence_length (MAXSENTLEN) is not really the maximum sentence length but two more than it (yikes). See issue #32. first-stage/PARSE/parseIt.C: fix check to avoid segfaults for sentence lengths one fewer than user-requested maximum length. python/bllipparser/RerankingParser.py: fixed similar check as above python/tests/test_reranking_parser.py: added long sentence tests
BLLIP · Apr 20, 2015 · 6f5d9c4 · 6f5d9c4
1 parent cac2a74
commit 6f5d9c4
Show file tree

Hide file tree

Showing 3 changed files with 27 additions and 5 deletions.
diff --git a/first-stage/PARSE/parseIt.C b/first-stage/PARSE/parseIt.C
@@ -202,7 +202,7 @@ mainLoop(void* arg)
       if (len == 0) {
 	break;
       }
-      if (len > params.maxSentLen) 
+      if (len >= params.maxSentLen)
 	{
 	  ECString msg("skipping sentence longer than specified limit of ");
 	  msg += intToString(params.maxSentLen);

diff --git a/python/bllipparser/RerankingParser.py b/python/bllipparser/RerankingParser.py
@@ -559,10 +559,12 @@ def parse(self, sentence, rerank='auto', sentence_id=None):
         rerank = self.check_models_loaded_or_error(rerank)
 
         sentence = Sentence(sentence)
-        if len(sentence) > parser.max_sentence_length:
-            raise ValueError("Sentence is too long (%s tokens, maximum "
-                             "supported: %s)" %
-                             (len(sentence), parser.max_sentence_length))
+        # max_sentence_length is actually 1 longer than the maximum
+        # allowed sentence length
+        if len(sentence) >= parser.max_sentence_length - 1:
+            raise ValueError("Sentence is too long (%s tokens, must be "
+                             "under %s)" %
+                             (len(sentence), parser.max_sentence_length - 1))
 
         try:
             parses = parser.parse(sentence.sentrep)

diff --git a/python/tests/test_reranking_parser.py b/python/tests/test_reranking_parser.py
@@ -501,6 +501,26 @@ def test_reranking_parser_basics():
     >>> constraints = {(1, 5) : 'VP'}
     >>> nbest_list7 = rrp.parse_constrained('British left waffles on Falklands .'.split(), constraints)
     >>> assert str(nbest_list).strip() == str(nbest_list7).strip()
+    >>> rrp.parse('a ' * 398)
+    []
+    >>> rrp.parse('b ' * 399)
+    Traceback (most recent call last):
+      File "/usr/lib64/python2.6/doctest.py", line 1253, in __run
+        compileflags, 1) in test.globs
+      File "<doctest test_reranking_parser_basics[53]>", line 1, in <module>
+        rrp.parse('b ' * 399)
+      File "/home/dmcclosky/local/lib/python2.6/site-packages/bllipparser/RerankingParser.py", line 565, in parse
+        (len(sentence), parser.max_sentence_length - 1))
+    ValueError: Sentence is too long (399 tokens, must be under 399)
+    >>> rrp.parse('c ' * 400)
+    Traceback (most recent call last):
+      File "/usr/lib64/python2.6/doctest.py", line 1253, in __run
+        compileflags, 1) in test.globs
+      File "<doctest test_reranking_parser_basics[53]>", line 1, in <module>
+        rrp.parse('c ' * 400)
+      File "/home/dmcclosky/local/lib/python2.6/site-packages/bllipparser/RerankingParser.py", line 565, in parse
+        (len(sentence), parser.max_sentence_length - 1))
+    ValueError: Sentence is too long (400 tokens, must be under 399)
     """
 
 def stringify_dict(d):