@@ -36,10 +36,10 @@
 
 
 # Conversion between Unicode and UTF-8, if required (on Python2)
-_native_to_unicode = (lambda s: s.decode("utf-8")) if PY2 else (lambda s: s)
+native_to_unicode = (lambda s: s.decode("utf-8")) if PY2 else (lambda s: s)
 
 
-_unicode_to_native = (lambda s: s.encode("utf-8")) if PY2 else (lambda s: s)
+unicode_to_native = (lambda s: s.encode("utf-8")) if PY2 else (lambda s: s)
 
 
 # Reserved tokens for things like padding and EOS symbols.
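Note: the hunk above drops the leading underscore, turning the two conversion helpers into public module-level functions. A minimal standalone sketch of the same Python 2/3 dispatch idiom (the PY2 flag here is defined locally for illustration; the module computes it elsewhere):

import sys

PY2 = sys.version_info[0] == 2

# On Python 2 a native str is bytes and must be decoded to unicode;
# on Python 3 a native str is already unicode, so both helpers are identity.
native_to_unicode = (lambda s: s.decode("utf-8")) if PY2 else (lambda s: s)
unicode_to_native = (lambda s: s.encode("utf-8")) if PY2 else (lambda s: s)

assert unicode_to_native(native_to_unicode("hello")) == "hello"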
@@ -220,7 +220,7 @@ def encode(self, raw_text):
       a list of integers in the range [0, vocab_size)
     """
     return self._tokens_to_subtokens(self._tokenizer.encode(
-        _native_to_unicode(raw_text)))
+        native_to_unicode(raw_text)))
 
   def decode(self, subtokens):
     """Converts a sequence of subtoken ids to a native string.
@@ -230,7 +230,7 @@ def decode(self, subtokens):
     Returns:
       a native string
     """
-    return _unicode_to_native(self._tokenizer.decode(
+    return unicode_to_native(self._tokenizer.decode(
         self._subtokens_to_tokens(subtokens)))
 
   @property
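Together, the two hunks above make encode() and decode() exact inverses across the native/unicode boundary. A hypothetical round-trip (the class name and vocab filename are assumptions for illustration, not taken from this diff):

enc = SubwordTextEncoder("vocab.subwords")  # assumed constructor
ids = enc.encode("the quick brown fox")     # list of ints in [0, vocab_size)
assert enc.decode(ids) == "the quick brown fox"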
@@ -335,6 +335,9 @@ def bisect(min_val, max_val):
       else:
         other_subtokenizer = bisect(min_val, present_count - 1)
 
+      if other_subtokenizer is None:
+        return subtokenizer
+
       if (abs(other_subtokenizer.vocab_size - target_size) <
           abs(subtokenizer.vocab_size - target_size)):
         return other_subtokenizer
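The added guard matters because the recursive bisect() returns None once its search range is empty; without the check, other_subtokenizer.vocab_size would raise an AttributeError. A self-contained sketch of the same binary-search-with-fallback pattern (build and target_size are stand-ins, not the library's API):

def bisect_build(build, target_size, min_val, max_val):
  """Binary-search a count threshold, keeping whichever result lands closer."""
  if min_val > max_val:
    return None                       # search space exhausted
  mid = (min_val + max_val) // 2
  candidate = build(mid)              # build(threshold) -> object with .vocab_size
  if candidate.vocab_size > target_size:
    other = bisect_build(build, target_size, mid + 1, max_val)
  else:
    other = bisect_build(build, target_size, min_val, mid - 1)
  if other is None:                   # the fix above: fall back to what we have
    return candidate
  if (abs(other.vocab_size - target_size) <
      abs(candidate.vocab_size - target_size)):
    return other
  return candidate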
@@ -449,13 +452,13 @@ def _load_from_file(self, filename):
     subtoken_strings = []
     with tf.gfile.Open(filename) as f:
       for line in f:
-        subtoken_strings.append(_native_to_unicode(line.strip()[1:-1]))
+        subtoken_strings.append(native_to_unicode(line.strip()[1:-1]))
     self._init_from_list(subtoken_strings)
 
   def store_to_file(self, filename):
     with tf.gfile.Open(filename, "w") as f:
       for subtoken_string in self._all_subtoken_strings:
-        f.write("'" + _unicode_to_native(subtoken_string) + "'\n")
+        f.write("'" + unicode_to_native(subtoken_string) + "'\n")
 
   def _escape_token(self, token):
     r"""Escape away underscores and OOV characters and append '_'.
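For context, these two methods agree on a simple vocab file format: one subtoken per line, wrapped in single quotes, which is why _load_from_file strips the quotes with line.strip()[1:-1]. A minimal round-trip sketch using plain file I/O in place of tf.gfile (illustrative only):

def store_subtokens(filename, subtoken_strings):
  with open(filename, "w", encoding="utf-8") as f:
    for s in subtoken_strings:
      f.write("'" + s + "'\n")        # each subtoken wrapped in quotes

def load_subtokens(filename):
  with open(filename, encoding="utf-8") as f:
    return [line.strip()[1:-1] for line in f]  # drop newline, then the quotes

store_subtokens("vocab.txt", ["the_", "ing", "a"])
assert load_subtokens("vocab.txt") == ["the_", "ing", "a"]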
@@ -524,7 +527,7 @@ def get_token_counts(cls, text_filepattern, corpus_max_lines):
       with tf.gfile.Open(text_filename) as f:
         for line in f:
           # The tokenizer updates token_counts in encode()
-          tok.encode(_native_to_unicode(line.strip()))
+          tok.encode(native_to_unicode(line.strip()))
           lines_read += 1
           if corpus_max_lines > 0 and lines_read > corpus_max_lines:
             return tok.token_counts
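The corpus_max_lines cap in this last hunk bounds how much of the corpus is read. A standalone sketch of the same counting loop, with a plain Counter and whitespace split standing in for the real tokenizer (names are illustrative):

from collections import Counter

def count_tokens(filenames, corpus_max_lines):
  token_counts = Counter()
  lines_read = 0
  for filename in filenames:
    with open(filename, encoding="utf-8") as f:
      for line in f:
        token_counts.update(line.strip().split())  # stand-in tokenizer
        lines_read += 1
        if corpus_max_lines > 0 and lines_read > corpus_max_lines:
          return token_counts
  return token_counts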