Skip to content

Commit

Permalink
first-stage and python: Safer maximum sentence length checks
Browse files Browse the repository at this point in the history
Turns out max_sentence_length (MAXSENTLEN) is not really the maximum
sentence length but two more than it (yikes). See issue #32.

first-stage/PARSE/parseIt.C: fix check to avoid segfaults for sentence
    lengths one fewer than user-requested maximum length.
python/bllipparser/RerankingParser.py: fixed similar check as above
python/tests/test_reranking_parser.py: added long sentence tests
  • Loading branch information
dmcc committed Apr 20, 2015
1 parent cac2a74 commit 6f5d9c4
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 5 deletions.
2 changes: 1 addition & 1 deletion first-stage/PARSE/parseIt.C
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ mainLoop(void* arg)
if (len == 0) {
break;
}
if (len > params.maxSentLen)
if (len >= params.maxSentLen)
{
ECString msg("skipping sentence longer than specified limit of ");
msg += intToString(params.maxSentLen);
Expand Down
10 changes: 6 additions & 4 deletions python/bllipparser/RerankingParser.py
Original file line number Diff line number Diff line change
Expand Up @@ -559,10 +559,12 @@ def parse(self, sentence, rerank='auto', sentence_id=None):
rerank = self.check_models_loaded_or_error(rerank)

sentence = Sentence(sentence)
if len(sentence) > parser.max_sentence_length:
raise ValueError("Sentence is too long (%s tokens, maximum "
"supported: %s)" %
(len(sentence), parser.max_sentence_length))
# max_sentence_length is actually 1 longer than the maximum
# allowed sentence length
if len(sentence) >= parser.max_sentence_length - 1:
raise ValueError("Sentence is too long (%s tokens, must be "
"under %s)" %
(len(sentence), parser.max_sentence_length - 1))

try:
parses = parser.parse(sentence.sentrep)
Expand Down
20 changes: 20 additions & 0 deletions python/tests/test_reranking_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -501,6 +501,26 @@ def test_reranking_parser_basics():
>>> constraints = {(1, 5) : 'VP'}
>>> nbest_list7 = rrp.parse_constrained('British left waffles on Falklands .'.split(), constraints)
>>> assert str(nbest_list).strip() == str(nbest_list7).strip()
>>> rrp.parse('a ' * 398)
[]
>>> rrp.parse('b ' * 399)
Traceback (most recent call last):
File "/usr/lib64/python2.6/doctest.py", line 1253, in __run
compileflags, 1) in test.globs
File "<doctest test_reranking_parser_basics[53]>", line 1, in <module>
rrp.parse('b ' * 399)
File "/home/dmcclosky/local/lib/python2.6/site-packages/bllipparser/RerankingParser.py", line 565, in parse
(len(sentence), parser.max_sentence_length - 1))
ValueError: Sentence is too long (399 tokens, must be under 399)
>>> rrp.parse('c ' * 400)
Traceback (most recent call last):
File "/usr/lib64/python2.6/doctest.py", line 1253, in __run
compileflags, 1) in test.globs
File "<doctest test_reranking_parser_basics[53]>", line 1, in <module>
rrp.parse('c ' * 400)
File "/home/dmcclosky/local/lib/python2.6/site-packages/bllipparser/RerankingParser.py", line 565, in parse
(len(sentence), parser.max_sentence_length - 1))
ValueError: Sentence is too long (400 tokens, must be under 399)
"""

def stringify_dict(d):
Expand Down

0 comments on commit 6f5d9c4

Please sign in to comment.