Skip to content
This repository was archived by the owner on Jul 7, 2023. It is now read-only.

Commit 9f59a50

Browse files
committed
adjust vocab with random lines
1 parent fc35144 commit 9f59a50

File tree

2 files changed

+13
-7
lines changed

2 files changed

+13
-7
lines changed

tensor2tensor/data_generators/generator_utils.py

Lines changed: 12 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -338,13 +338,19 @@ def generate():
338338

339339
# Use Tokenizer to count the word occurrences.
340340
with tf.gfile.GFile(filepath, mode="r") as source_file:
341-
file_byte_budget = 3.5e5 if filepath.endswith("en") else 7e5
341+
file_byte_budget = 1e6 if filepath.endswith("en") else 1e6
342+
counter = 0
343+
countermax = int(source_file.size() / 1e6)
342344
for line in source_file:
343-
if file_byte_budget <= 0:
344-
break
345-
line = line.strip()
346-
file_byte_budget -= len(line)
347-
yield line
345+
if counter < countermax:
346+
counter += 1
347+
else:
348+
if file_byte_budget <= 0:
349+
break
350+
line = line.strip()
351+
file_byte_budget -= len(line)
352+
counter = 0
353+
yield line
348354

349355
return get_or_generate_vocab_inner(data_dir, vocab_filename, vocab_size,
350356
generate())

tensor2tensor/data_generators/translate_enfr.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -41,7 +41,7 @@
4141
_ENFR_TRAIN_DATASETS = [
4242
[
4343
"https://s3.amazonaws.com/opennmt-trainingdata/baseline-1M-enfr.tgz",
44-
("baseline-1M-enfr/baseline-1M_train.en", "baseline-1M-enfr/baseline-1M_train.en")
44+
("baseline-1M-enfr/baseline-1M_train.en", "baseline-1M-enfr/baseline-1M_train.fr")
4545
],
4646
# [
4747
# "http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz",

0 commit comments

Comments
 (0)