 class Transformer(t2t_model.T2TModel):
   """Attention net. See file docstring."""

+  def __init__(self, *args, **kwargs):
+    super(Transformer, self).__init__(*args, **kwargs)
+    self.attention_weights = dict()  # For visualizing attention heads.
+
   def encode(self, inputs, target_space, hparams, features=None):
     """Encode transformer inputs.

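The attention_weights dict is populated while the graph is built, so a caller can fetch the captured tensors alongside the model outputs. A minimal sketch of the read side, assuming the Transformer graph has already been constructed through the usual T2TModel path (only attention_weights itself comes from this patch; the helper below is hypothetical):

def fetch_attention_weights(session, model):
  """Returns the captured attention weights as numpy arrays (sketch only).

  Keys are variable-scope strings such as
  ".../encoder/layer_0/self_attention/..."; values are typically of shape
  [batch, num_heads, query_length, memory_length].
  """
  # tf.Session.run accepts a dict of fetches and returns a same-keyed dict.
  return session.run(model.attention_weights)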
@@ -73,7 +77,8 @@ def encode(self, inputs, target_space, hparams, features=None):

     encoder_output = transformer_encoder(
         encoder_input, self_attention_bias,
-        hparams, nonpadding=_features_to_nonpadding(features, "inputs"))
+        hparams, nonpadding=_features_to_nonpadding(features, "inputs"),
+        save_weights_to=self.attention_weights)

     return encoder_output, encoder_decoder_attention_bias

@@ -114,7 +119,8 @@ def decode(self,
         encoder_decoder_attention_bias,
         hparams,
         cache=cache,
-        nonpadding=nonpadding)
+        nonpadding=nonpadding,
+        save_weights_to=self.attention_weights)

     if hparams.use_tpu and hparams.mode == tf.estimator.ModeKeys.TRAIN:
       # TPU does not react kindly to extra dimensions.
@@ -507,7 +513,8 @@ def transformer_encoder(encoder_input,
                         encoder_self_attention_bias,
                         hparams,
                         name="encoder",
-                        nonpadding=None):
+                        nonpadding=None,
+                        save_weights_to=None):
   """A stack of transformer layers.

   Args:
@@ -522,6 +529,9 @@ def transformer_encoder(encoder_input,
       encoder_self_attention_bias. The knowledge about padding is used
       for pad_remover(efficiency) and to mask out padding in convolutional
       layers.
+    save_weights_to: an optional dictionary to capture attention weights
+      for visualization; the weights tensor will be appended there under
+      a string key created from the variable scope (including name).

   Returns:
     y: a Tensor
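The common_attention side of this change is not shown in the diff; presumably the attention op records its post-softmax weights under the current variable scope, roughly as in the sketch below (an assumption about code outside this hunk, not part of the patch):

# Inside the attention op (sketch; the actual change lives in common_attention):
weights = tf.nn.softmax(logits, name="attention_weights")
if save_weights_to is not None:
  # Key by the enclosing variable scope so individual encoder/decoder layers
  # can be told apart when visualizing.
  save_weights_to[tf.get_variable_scope().name] = weights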
@@ -551,6 +561,7 @@ def transformer_encoder(encoder_input,
               hparams.num_heads,
               hparams.attention_dropout,
               attention_type=hparams.self_attention_type,
+              save_weights_to=save_weights_to,
               max_relative_position=hparams.max_relative_position)
           x = common_layers.layer_postprocess(x, y, hparams)
         with tf.variable_scope("ffn"):
@@ -571,7 +582,8 @@ def transformer_decoder(decoder_input,
                         hparams,
                         cache=None,
                         name="decoder",
-                        nonpadding=None):
+                        nonpadding=None,
+                        save_weights_to=None):
   """A stack of transformer layers.

   Args:
@@ -590,6 +602,9 @@ def transformer_decoder(decoder_input,
       to mask out padding in convolutional layers. We generally only
       need this mask for "packed" datasets, because for ordinary datasets,
       no padding is ever followed by nonpadding.
+    save_weights_to: an optional dictionary to capture attention weights
+      for visualization; the weights tensor will be appended there under
+      a string key created from the variable scope (including name).

   Returns:
     y: a Tensor
@@ -612,6 +627,7 @@ def transformer_decoder(decoder_input,
               hparams.num_heads,
               hparams.attention_dropout,
               attention_type=hparams.self_attention_type,
+              save_weights_to=save_weights_to,
               max_relative_position=hparams.max_relative_position,
               cache=layer_cache)
           x = common_layers.layer_postprocess(x, y, hparams)
@@ -624,7 +640,8 @@ def transformer_decoder(decoder_input,
                 hparams.attention_key_channels or hparams.hidden_size,
                 hparams.attention_value_channels or hparams.hidden_size,
                 hparams.hidden_size, hparams.num_heads,
-                hparams.attention_dropout)
+                hparams.attention_dropout,
+                save_weights_to=save_weights_to)
             x = common_layers.layer_postprocess(x, y, hparams)
         with tf.variable_scope("ffn"):
           y = transformer_ffn_layer(
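The new keyword also works when the encoder or decoder stack is used directly, outside the Transformer class. A hedged usage sketch (construction of the inputs, bias and hparams is assumed):

attention_weights = {}  # Filled in while the stack below is built.
encoder_output = transformer_encoder(
    encoder_input, encoder_self_attention_bias, hparams,
    save_weights_to=attention_weights)
# attention_weights now maps scope names such as "encoder/layer_0/self_attention"
# to the corresponding attention weight tensors.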