20 | 20 |
21 | 21 | import collections |
22 | 22 | import math |
| 23 | +import re |
| 24 | +import sys |
| 25 | +import unicodedata |
23 | 26 |
24 | 27 | # Dependency imports |
25 | 28 |
26 | 29 | import numpy as np |
| 30 | +import six |
27 | 31 | # pylint: disable=redefined-builtin |
28 | 32 | from six.moves import xrange |
29 | 33 | from six.moves import zip |
@@ -93,9 +97,15 @@ def compute_bleu(reference_corpus, |
93 | 97 | for ngram in translation_ngram_counts: |
94 | 98 | possible_matches_by_order[len(ngram)-1] += translation_ngram_counts[ngram] |
95 | 99 | precisions = [0] * max_order |
| 100 | + smooth = 1.0 |
96 | 101 | for i in xrange(0, max_order): |
97 | 102 | if possible_matches_by_order[i] > 0: |
98 | 103 | precisions[i] = matches_by_order[i] / possible_matches_by_order[i] |
| 104 | + if matches_by_order[i] > 0: |
| 105 | + precisions[i] = matches_by_order[i] / possible_matches_by_order[i] |
| 106 | + else: |
| 107 | + smooth *= 2 |
| 108 | + precisions[i] = 1.0 / (smooth * possible_matches_by_order[i]) |
99 | 109 | else: |
100 | 110 | precisions[i] = 0.0 |
101 | 111 |
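The smoothing added in the hunk above replaces a hard zero for an n-gram order with no matches by 1/(2^k * possible_matches), halving the substituted count each time another order comes up empty, so the geometric mean of the precisions stays positive. A minimal standalone sketch of the same scheme (the helper name smoothed_precisions is hypothetical, not part of this file):

    def smoothed_precisions(matches, possible):
      """Per-order n-gram precisions with 1/(2^k * possible) smoothing."""
      precisions = [0.0] * len(matches)
      smooth = 1.0
      for i in range(len(matches)):
        if possible[i] > 0:
          if matches[i] > 0:
            precisions[i] = float(matches[i]) / possible[i]
          else:
            smooth *= 2  # halve the substituted count for each empty order
            precisions[i] = 1.0 / (smooth * possible[i])
      return precisions

    # With no 4-gram matches, the last precision becomes 1/(2*5) = 0.1
    # instead of 0.0:
    print(smoothed_precisions([5, 3, 1, 0], [8, 7, 6, 5]))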
@@ -131,3 +141,59 @@ def bleu_score(predictions, labels, **unused_kwargs): |
131 | 141 |
132 | 142 | bleu = tf.py_func(compute_bleu, (labels, outputs), tf.float32) |
133 | 143 | return bleu, tf.constant(1.0) |
| 144 | + |
| 145 | + |
| 146 | +class UnicodeRegex(object): |
| 147 | + """Ad-hoc hack to recognize all punctuation and symbols.""" |
| 148 | + |
 | 149 | +  # Class attributes, built once at import time, so bleu_tokenize below |
 | 150 | +  # can use them directly without instantiating the class. |
 | 151 | +  def _property_chars(prefix): |
 | 152 | +    return ''.join(six.unichr(x) for x in range(sys.maxunicode) |
 | 153 | +                   if unicodedata.category(six.unichr(x)).startswith(prefix)) |
 | 154 | +  punctuation = _property_chars('P') |
 | 155 | +  nondigit_punct_re = re.compile(r'([^\d])([' + punctuation + r'])') |
 | 156 | +  punct_nondigit_re = re.compile(r'([' + punctuation + r'])([^\d])') |
 | 157 | +  symbol_re = re.compile('([' + _property_chars('S') + '])') |
| 157 | + |
| 158 | + |
| 159 | +def bleu_tokenize(string): |
| 160 | + r"""Tokenize a string following the official BLEU implementation. |
| 161 | +
 | 162 | +  See |
 | 163 | +  https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/mteval-v14.pl#L954-L983 |
 | 164 | +  In our case, the input string is expected to be just one line |
 | 165 | +  and no de-escaping of HTML entities is needed. |
 | 166 | +  So we just tokenize on punctuation and symbols, |
 | 167 | +  except when a punctuation mark is both preceded and followed by a digit |
 | 168 | +  (e.g. a comma/dot used as a thousands/decimal separator). |
| 169 | +
 | 170 | +  Note that a number (e.g. a year) followed by a dot at the end of a sentence |
 | 171 | +  is NOT tokenized, |
 | 172 | +  i.e. the dot stays with the number, because `s/(\p{P})(\P{N})/ $1 $2/g` |
 | 173 | +  does not match this case (unless we add a space after each sentence). |
| 174 | + However, this error is already in the original mteval-v14.pl |
| 175 | + and we want to be consistent with it. |
| 176 | +
| 177 | + Args: |
| 178 | + string: the input string |
| 179 | +
| 180 | + Returns: |
| 181 | + a list of tokens |
| 182 | + """ |
| 183 | + string = UnicodeRegex.nondigit_punct_re.sub(r'\1 \2 ', string) |
| 184 | + string = UnicodeRegex.punct_nondigit_re.sub(r' \1 \2', string) |
| 185 | + string = UnicodeRegex.symbol_re.sub(r' \1 ', string) |
| 186 | + return string.split() |
| 187 | + |
| 188 | + |
| 189 | +def bleu_wrapper(ref_filename, hyp_filename, case_sensitive=False): |
| 190 | + """Compute BLEU for two files (reference and hypothesis translation).""" |
| 191 | + ref_lines = open(ref_filename).read().splitlines() |
| 192 | + hyp_lines = open(hyp_filename).read().splitlines() |
 | 193 | +  assert len(ref_lines) == len(hyp_lines), 'ref and hyp line counts differ' |
| 194 | + if not case_sensitive: |
| 195 | + ref_lines = [x.lower() for x in ref_lines] |
| 196 | + hyp_lines = [x.lower() for x in hyp_lines] |
| 197 | + ref_tokens = [bleu_tokenize(x) for x in ref_lines] |
| 198 | + hyp_tokens = [bleu_tokenize(x) for x in hyp_lines] |
| 199 | + return compute_bleu(ref_tokens, hyp_tokens) |
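For a quick sanity check of the tokenizer and the wrapper, a hypothetical usage sketch (the file names are made up for illustration):

    print(bleu_tokenize('Hello, world! It cost 1,000.50 euros.'))
    # -> ['Hello', ',', 'world', '!', 'It', 'cost', '1,000.50', 'euros', '.']
    # The separators inside 1,000.50 stay attached because each is both
    # preceded and followed by a digit.

    bleu = bleu_wrapper('newstest2014.ref.de', 'model_output.de')
    print('BLEU = %f' % bleu)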