This repository was archived by the owner on Jul 7, 2023. It is now read-only.

Commit 2adf3ae

Add shakeshake_type hparam: batch, image, equal
1 parent 8287094 commit 2adf3ae

2 files changed: +39 −15 lines changed


tensor2tensor/models/common_layers.py

Lines changed: 19 additions & 5 deletions
@@ -60,7 +60,13 @@ def inverse_exp_decay(max_step, min_value=0.01):
 
 def shakeshake2_py(x, y, equal=False, individual=False):
   """The shake-shake sum of 2 tensors, python version."""
-  alpha = 0.5 if equal else tf.random_uniform([])
+  if equal:
+    alpha = 0.5
+  elif individual:
+    alpha = tf.random_uniform([tf.shape(x)[0], 1, 1, 1])
+  else:
+    alpha = tf.random_uniform([])
+
   return alpha * x + (1.0 - alpha) * y
 
 
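For the new 'image' case, the per-example alpha must be shaped to broadcast over the height, width, and channel dimensions of the activations. A minimal numpy sketch of the three alpha modes, assuming NHWC inputs (all names below are illustrative, not part of the diff):

import numpy as np

batch, h, w, c = 4, 2, 2, 3
x = np.ones((batch, h, w, c))
y = np.zeros((batch, h, w, c))

alpha_equal = 0.5                                       # 'equal': fixed 0.5 mix
alpha_batch = np.random.uniform()                       # 'batch': one scalar for the whole batch
alpha_image = np.random.uniform(size=(batch, 1, 1, 1))  # 'image': one alpha per example

out = alpha_image * x + (1.0 - alpha_image) * y
print(out[:, 0, 0, 0])  # four distinct values, one alpha per image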

@@ -72,6 +78,14 @@ def shakeshake2_grad(x1, x2, dy):
   return dx
 
 
+@function.Defun()
+def shakeshake2_indiv_grad(x1, x2, dy):
+  """Overriding gradient for shake-shake of 2 tensors."""
+  y = shakeshake2_py(x1, x2, individual=True)
+  dx = tf.gradients(ys=[y], xs=[x1, x2], grad_ys=[dy])
+  return dx
+
+
 @function.Defun()
 def shakeshake2_equal_grad(x1, x2, dy):
   """Overriding gradient for shake-shake of 2 tensors."""
@@ -85,10 +99,10 @@ def shakeshake2(x1, x2):
   """The shake-shake function with a different alpha for forward/backward."""
   return shakeshake2_py(x1, x2)
 
-@function.Defun(grad_func=shakeshake2_grad)
-def shakeshake2_eqforward(x1, x2):
-  """The shake-shake function with a different alpha for forward/backward."""
-  return shakeshake2_py(x1, x2, equal=True)
+
+@function.Defun(grad_func=shakeshake2_indiv_grad)
+def shakeshake2_indiv(x1, x2):
+  return shakeshake2_py(x1, x2, individual=True)
 
 
 @function.Defun(grad_func=shakeshake2_equal_grad)
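One practical caveat with these Defun-wrapped ops: the returned tensor loses its static shape, which is why the caller in shake_shake.py below restores it with set_shape(). A hypothetical call site:

x1 = tf.random_normal([8, 32, 32, 16])
x2 = tf.random_normal([8, 32, 32, 16])

out = shakeshake2_indiv(x1, x2)      # forward: one alpha per image
out.set_shape(x1.get_shape())        # Defun output comes back with unknown shape
grads = tf.gradients(out, [x1, x2])  # backward: an independently drawn alpha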

tensor2tensor/models/shake_shake.py

Lines changed: 20 additions & 10 deletions
@@ -36,7 +36,7 @@ def downsampling_residual_branch(x, conv_filters):
   return tf.concat([x1, x2], axis=3)
 
 
-def shake_shake_block(x, conv_filters, stride, mode):
+def shake_shake_block(x, conv_filters, stride, hparams):
   with tf.variable_scope('branch_1'):
     branch1 = shake_shake_block_branch(x, conv_filters, stride)
   with tf.variable_scope('branch_2'):
@@ -47,21 +47,28 @@ def shake_shake_block(x, conv_filters, stride, hparams):
     skip = downsampling_residual_branch(x, conv_filters)
 
   # TODO(rshin): Use different alpha for each image in batch.
-  if mode == tf.contrib.learn.ModeKeys.TRAIN:
-    shaken = common_layers.shakeshake2(branch1, branch2)
+  if hparams.mode == tf.contrib.learn.ModeKeys.TRAIN:
+    if hparams.shakeshake_type == 'batch':
+      shaken = common_layers.shakeshake2(branch1, branch2)
+    elif hparams.shakeshake_type == 'image':
+      shaken = common_layers.shakeshake2_indiv(branch1, branch2)
+    elif hparams.shakeshake_type == 'equal':
+      shaken = common_layers.shakeshake2_py(branch1, branch2, equal=True)
+    else:
+      raise ValueError('Invalid shakeshake_type: {!r}'.format(hparams.shakeshake_type))
   else:
-    shaken = common_layers.shakeshake2_eqforward(branch1, branch2)
+    shaken = common_layers.shakeshake2_py(branch1, branch2, equal=True)
   shaken.set_shape(branch1.get_shape())
 
   return skip + shaken
 
 
-def shake_shake_stage(x, num_blocks, conv_filters, initial_stride, mode):
+def shake_shake_stage(x, num_blocks, conv_filters, initial_stride, hparams):
   with tf.variable_scope('block_0'):
-    x = shake_shake_block(x, conv_filters, initial_stride, mode)
+    x = shake_shake_block(x, conv_filters, initial_stride, hparams)
   for i in xrange(1, num_blocks):
     with tf.variable_scope('block_{}'.format(i)):
-      x = shake_shake_block(x, conv_filters, 1, mode)
+      x = shake_shake_block(x, conv_filters, 1, hparams)
   return x
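Note that shakeshake_type only affects training: outside TRAIN mode the block always falls back to the deterministic 0.5 average, matching standard Shake-Shake inference. The dispatch above can be summarized as (hypothetical helper, not in the commit):

def _mix_fn(hparams):
  # Eval/predict: always the deterministic average.
  if hparams.mode != tf.contrib.learn.ModeKeys.TRAIN:
    return lambda a, b: common_layers.shakeshake2_py(a, b, equal=True)
  # Train: pick the stochastic variant named by the hparam.
  return {
      'batch': common_layers.shakeshake2,
      'image': common_layers.shakeshake2_indiv,
      'equal': lambda a, b: common_layers.shakeshake2_py(a, b, equal=True),
  }[hparams.shakeshake_type]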

@@ -76,6 +83,7 @@ class ShakeShake(t2t_model.T2TModel):
 
   def model_fn_body(self, features):
     hparams = self._hparams
+    print(hparams.learning_rate)
 
     inputs = features["inputs"]
     assert (hparams.num_hidden_layers - 2) % 6 == 0
@@ -87,13 +95,14 @@ def model_fn_body(self, features):
     x = inputs
     mode = hparams.mode
     with tf.variable_scope('shake_shake_stage_1'):
-      x = shake_shake_stage(x, blocks_per_stage, hparams.base_filters, 1, mode)
+      x = shake_shake_stage(x, blocks_per_stage, hparams.base_filters, 1,
+                            hparams)
     with tf.variable_scope('shake_shake_stage_2'):
       x = shake_shake_stage(x, blocks_per_stage, hparams.base_filters * 2, 2,
-                            mode)
+                            hparams)
     with tf.variable_scope('shake_shake_stage_3'):
       x = shake_shake_stage(x, blocks_per_stage, hparams.base_filters * 4, 2,
-                            mode)
+                            hparams)
 
     # For canonical Shake-Shake, we should perform 8x8 average pooling and then
     # have a fully-connected layer (which produces the logits for each class).
@@ -130,4 +139,5 @@ def shakeshake_cifar10():
   hparams.optimizer = "Momentum"
   hparams.optimizer_momentum_momentum = 0.9
   hparams.add_hparam('base_filters', 16)
+  hparams.add_hparam('shakeshake_type', 'batch')
   return hparams
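To try the other modes, the new hparam can be overridden on the constructed hparams object (a minimal sketch, assuming the usual HParams attribute access):

hparams = shakeshake_cifar10()
hparams.shakeshake_type = 'image'  # one of: 'batch', 'image', 'equal'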
