tensorflow
diff --git a/‎.gitignore‎
Lines changed: 1 addition & 1 deletion b/‎.gitignore‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎setup.py‎
Lines changed: 1 addition & 1 deletion b/‎setup.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎tensor2tensor/bin/make_tf_configs.py‎
Lines changed: 2 additions & 1 deletion b/‎tensor2tensor/bin/make_tf_configs.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎tensor2tensor/data_generators/algorithmic_math.py‎
Lines changed: 7 additions & 7 deletions b/‎tensor2tensor/data_generators/algorithmic_math.py‎
Lines changed: 7 additions & 7 deletions
diff --git a/‎tensor2tensor/data_generators/generator_utils.py‎
100755 → 100644
Lines changed: 1 addition & 1 deletion b/‎tensor2tensor/data_generators/generator_utils.py‎
100755 → 100644
Lines changed: 1 addition & 1 deletion
diff --git a/‎tensor2tensor/data_generators/wmt.py‎
100755 → 100644
Lines changed: 2 additions & 0 deletions b/‎tensor2tensor/data_generators/wmt.py‎
100755 → 100644
Lines changed: 2 additions & 0 deletions
diff --git a/‎tensor2tensor/data_generators/wmt_test.py‎
100755 → 100644
Lines changed: 5 additions & 6 deletions b/‎tensor2tensor/data_generators/wmt_test.py‎
100755 → 100644
Lines changed: 5 additions & 6 deletions
diff --git a/‎tensor2tensor/docs/distributed_training.md‎
Lines changed: 12 additions & 1 deletion b/‎tensor2tensor/docs/distributed_training.md‎
Lines changed: 12 additions & 1 deletion
diff --git a/‎tensor2tensor/models/attention_lm.py‎
Lines changed: 5 additions & 9 deletions b/‎tensor2tensor/models/attention_lm.py‎
Lines changed: 5 additions & 9 deletions
diff --git a/‎tensor2tensor/models/attention_lm_moe.py‎
Lines changed: 8 additions & 10 deletions b/‎tensor2tensor/models/attention_lm_moe.py‎
Lines changed: 8 additions & 10 deletions
@@ -4,6 +4,6 @@
 # Python egg metadata, regenerated from source files by setuptools.
 /*.egg-info
 
-# PyPI distribution artifacts
+# PyPI distribution artifacts
 build/
 dist/
@@ -5,7 +5,7 @@
 
 setup(
     name='tensor2tensor',
-    version='1.0.7',
+    version='1.0.8',
     description='Tensor2Tensor',
     author='Google Inc.',
     author_email='[email protected]',
 
@@ -55,7 +55,7 @@ def main(_):
     for idx, job in enumerate(jobs):
       if task_type == "worker":
         cmd_line_flags = " ".join([
-            "--master=%s" % job,
+            "--master=grpc://%s" % job,
             "--ps_replicas=%d" % len(ps),
             "--worker_replicas=%d" % len(workers),
             "--worker_gpu=1",
@@ -66,6 +66,7 @@ def main(_):
         ])
       else:
         cmd_line_flags = " ".join([
+            "--master=grpc://%s" % job,
             "--schedule=run_std_server",
         ])
 
 
@@ -570,16 +570,16 @@ def calculus_integrate(alphabet_size=26,
 
   functions = {"log": "L"}
   alg_cfg = math_dataset_init(alphabet_size, digits=5, functions=functions)
-  nbr_case=0
+  nbr_case = 0
   while nbr_case < nbr_cases:
     try:
       sample, target = generate_calculus_integrate_sample(
-        alg_cfg.vlist,
-        list(alg_cfg.ops.values()), min_depth, max_depth, alg_cfg.functions)
+          alg_cfg.vlist,
+          list(alg_cfg.ops.values()), min_depth, max_depth, alg_cfg.functions)
       yield {
-        "inputs": alg_cfg.int_encoder(sample),
-        "targets": alg_cfg.int_encoder(target)
+          "inputs": alg_cfg.int_encoder(sample),
+          "targets": alg_cfg.int_encoder(target)
       }
-    except:
+    except:  # pylint:disable=bare-except
       continue
-    nbr_case = nbr_case + 1
+    nbr_case += 1
@@ -27,7 +27,7 @@
 
 import six
 from six.moves import xrange  # pylint: disable=redefined-builtin
-import six.moves.urllib_request as urllib # Imports urllib on Python2, urllib.request on Python3
+import six.moves.urllib_request as urllib  # Imports urllib on Python2, urllib.request on Python3
 
 from tensor2tensor.data_generators.text_encoder import SubwordTextEncoder
 from tensor2tensor.data_generators.tokenizer import Tokenizer
 
@@ -28,6 +28,7 @@
 
 import tensorflow as tf
 
+
 # End-of-sentence marker (should correspond to the position of EOS in the
 # RESERVED_TOKENS list in text_encoder.py)
 EOS = 1
@@ -44,6 +45,7 @@ def character_generator(source_path, target_path, character_vocab, eos=None):
   Args:
     source_path: path to the file with source sentences.
     target_path: path to the file with target sentences.
+    character_vocab: a TextEncoder to encode the characters.
     eos: integer to append at the end of each sequence (default: None).
 
   Yields:
 
@@ -25,8 +25,8 @@
 # Dependency imports
 
 import six
-from tensor2tensor.data_generators import wmt
 from tensor2tensor.data_generators import text_encoder
+from tensor2tensor.data_generators import wmt
 
 import tensorflow as tf
 
@@ -40,7 +40,7 @@ def testCharacterGenerator(self):
     if six.PY2:
       enc_f = lambda s: s
     else:
-      enc_f = lambda s: s.encode('utf-8')
+      enc_f = lambda s: s.encode("utf-8")
     with io.open(tmp_file_path + ".src", "wb") as src_file:
       src_file.write(enc_f("source1\n"))
       src_file.write(enc_f("source2\n"))
@@ -51,16 +51,15 @@ def testCharacterGenerator(self):
     # Call character generator on the generated files.
     results_src, results_tgt = [], []
     character_vocab = text_encoder.ByteTextEncoder()
-    for dictionary in wmt.character_generator(tmp_file_path + ".src",
-                                              tmp_file_path + ".tgt",
-                                              character_vocab):
+    for dictionary in wmt.character_generator(
+        tmp_file_path + ".src", tmp_file_path + ".tgt", character_vocab):
       self.assertEqual(sorted(list(dictionary)), ["inputs", "targets"])
       results_src.append(dictionary["inputs"])
       results_tgt.append(dictionary["targets"])
 
     # Check that the results match the files.
     # First check that the results match the encoded original strings;
-    # this is a comparison of integer arrays
+    # this is a comparison of integer arrays.
     self.assertEqual(len(results_src), 2)
     self.assertEqual(results_src[0],
                      character_vocab.encode("source1"))
 
@@ -35,7 +35,7 @@ os.environ['TF_CONFIG'] = json.dumps({
 The following T2T command-line flags must also be set on the workers for
 distributed training:
 
-- `--master=$ADDRESS`
+- `--master=grpc://$ADDRESS`
 - `--worker_replicas=$NUM_WORKERS`
 - `--worker_gpu=$NUM_GPUS_PER_WORKER`
 - `--worker_id=$WORKER_ID`
@@ -55,6 +55,17 @@ Parameter servers only need `--schedule=run_std_server`.
 generates the `TF_CONFIG` json strings and the above-mentioned command-line
 flags for the workers and parameter servers.
 
+Given a set of worker and parameter server addresses, the script outputs, for
+each job, a line with the `TF_CONFIG` environment variable and the command-line
+flags necessary for distributed training. For each job, invoke `t2t-trainer`
+with the `TF_CONFIG` value and the flags that the script printed for that job.
+
+For example:
+
+```
+TF_CONFIG=$JOB_TF_CONFIG t2t-trainer $JOB_FLAGS --model=transformer ...
+```
+
 ## Command-line flags for eval jobs
 
 Eval jobs should set the following flags and do not need the `TF_CONFIG`
 
@@ -24,8 +24,6 @@
 from __future__ import division
 from __future__ import print_function
 
-import copy
-
 # Dependency imports
 
 from six.moves import xrange  # pylint: disable=redefined-builtin
@@ -43,13 +41,9 @@
 class AttentionLM(t2t_model.T2TModel):
   """Attention net.  See file docstring."""
 
-  def model_fn_body(self, features, train):
+  def model_fn_body(self, features):
     # Remove dropout if not training
-    hparams = copy.copy(self._hparams)
-    if not train:
-      hparams.attention_dropout = 0.
-      hparams.relu_dropout = 0.
-      hparams.residual_dropout = 0.
+    hparams = self._hparams
     targets = features["targets"]
     targets = tf.squeeze(targets, 2)
 
@@ -162,8 +156,10 @@ def attention_lm_base():
   hparams.add_hparam("num_heads", 8)
   hparams.add_hparam("attention_key_channels", 0)
   hparams.add_hparam("attention_value_channels", 0)
+  # All hyperparameters ending in "dropout" are automatically set to 0.0
+  # when not in training mode.
   hparams.add_hparam("attention_dropout", 0.0)
   hparams.add_hparam("relu_dropout", 0.0)
-  hparams.add_hparam("pos", "timing")  # timing, none
   hparams.add_hparam("residual_dropout", 0.1)
+  hparams.add_hparam("pos", "timing")  # timing, none
   return hparams
@@ -24,8 +24,6 @@
 from __future__ import division
 from __future__ import print_function
 
-import copy
-
 # Dependency imports
 
 from six.moves import xrange  # pylint: disable=redefined-builtin
@@ -43,13 +41,9 @@
 class AttentionLmMoe(t2t_model.T2TModel):
   """Attention net.  See file docstring."""
 
-  def model_fn_body_sharded(self, sharded_features, train):
+  def model_fn_body_sharded(self, sharded_features):
     # Remove dropout if not training
-    hparams = copy.copy(self._hparams)
-    if not train:
-      hparams.attention_dropout = 0.
-      hparams.relu_dropout = 0.
-      hparams.residual_dropout = 0.
+    hparams = self._hparams
     dp = self._data_parallelism
     targets = sharded_features["targets"]
     targets = dp(tf.squeeze, targets, 2)
@@ -81,7 +75,9 @@ def residual_fn(x, y):
         with tf.variable_scope("ffn"):
           if str(layer) in hparams.moe_layers.split(","):
             y, loss = common_layers.moe_layer(
-                dp, self._ps_devices, x, train, hparams.hidden_size,
+                dp, self._ps_devices, x,
+                hparams.mode == tf.contrib.learn.ModeKeys.TRAIN,
+                hparams.hidden_size,
                 hparams.moe_hidden_size, hparams.moe_n1, hparams.moe_n2,
                 hparams.moe_loss_coef)
             extra_loss += loss
@@ -162,10 +158,12 @@ def attention_lm_moe_base():
   hparams.add_hparam("num_heads", 8)
   hparams.add_hparam("attention_key_channels", 0)
   hparams.add_hparam("attention_value_channels", 0)
+  # All hyperparameters ending in "dropout" are automatically set to 0.0
+  # when not in training mode.
   hparams.add_hparam("attention_dropout", 0.0)
   hparams.add_hparam("relu_dropout", 0.0)
-  hparams.add_hparam("pos", "timing")  # timing, none
   hparams.add_hparam("residual_dropout", 0.1)
+  hparams.add_hparam("pos", "timing")  # timing, none
   return hparams