2323import pprint
2424import tempfile
2525
26+
2627import tensorflow as tf
2728import tensorflow_transform as tft
2829from apache_beam .io import textio
REVIEW_WEIGHT = 'review_weight'
LABEL_COLUMN = 'label'

# Schema of the raw input data: a free-form text review plus an int64 label,
# each a scalar ("fixed") column.  Shared by the shuffle and transform
# pipelines so both code Example protos the same way.
_RAW_COLUMN_DTYPES = {
    REVIEW_COLUMN: tf.string,
    LABEL_COLUMN: tf.int64,
}
RAW_DATA_METADATA = dataset_metadata.DatasetMetadata(dataset_schema.Schema({
    column: dataset_schema.ColumnSchema(
        dtype, [], dataset_schema.FixedColumnRepresentation())
    for column, dtype in _RAW_COLUMN_DTYPES.items()
}))

# Characters treated as word delimiters when tokenizing reviews.
DELIMITERS = '.,!?() '
5361
5462
@@ -99,13 +107,13 @@ def ReadAndShuffleData(pcoll, filepatterns):
99107 lambda p : {REVIEW_COLUMN : p [0 ], LABEL_COLUMN : p [1 ]})
100108
101109
def read_and_shuffle_data(
    train_neg_filepattern, train_pos_filepattern, test_neg_filepattern,
    test_pos_filepattern, shuffled_train_filebase, shuffled_test_filebase):
  """Read and shuffle the data and write out as a TFRecord of Example protos.

  Read in the data from the positive and negative examples on disk for both
  the train and test sets, shuffle each set, and write each out in TFRecord
  format coded with the raw-data schema (RAW_DATA_METADATA).  No other
  preprocessing happens here; tokenization and vocabulary mapping are done
  later by transform_data.

  Args:
    train_neg_filepattern: Filepattern for training data negative examples
    train_pos_filepattern: Filepattern for training data positive examples
    test_neg_filepattern: Filepattern for test data negative examples
    test_pos_filepattern: Filepattern for test data positive examples
    shuffled_train_filebase: Base filename for shuffled training data shards
    shuffled_test_filebase: Base filename for shuffled test data shards
  """
  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    _ = (
        pipeline
        | 'ReadAndShuffleTrain' >> ReadAndShuffleData(
            (train_neg_filepattern, train_pos_filepattern))
        | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
            shuffled_train_filebase,
            coder=example_proto_coder.ExampleProtoCoder(
                RAW_DATA_METADATA.schema)))
    _ = (
        pipeline
        | 'ReadAndShuffleTest' >> ReadAndShuffleData(
            (test_neg_filepattern, test_pos_filepattern))
        | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
            shuffled_test_filebase,
            coder=example_proto_coder.ExampleProtoCoder(
                RAW_DATA_METADATA.schema)))
    # pylint: enable=no-value-for-parameter
148+
149+ def transform_data (shuffled_train_filepattern , shuffled_test_filepattern ,
150+ transformed_train_filebase , transformed_test_filebase ,
151+ transformed_metadata_dir ):
152+ """Transform the data and write out as a TFRecord of Example protos.
153+
154+ Read in the data from the positive and negative examples on disk, and
155+ transform it using a preprocessing pipeline that removes punctuation,
156+ tokenizes and maps tokens to int64 values indices.
157+
158+ Args:
159+ shuffled_train_filepattern: Base filename for shuffled training data shards
160+ shuffled_test_filepattern: Base filename for shuffled test data shards
117161 transformed_train_filebase: Base filename for transformed training data
118162 shards
119163 transformed_test_filebase: Base filename for transformed test data shards
@@ -123,19 +167,19 @@ def transform_data(train_neg_filepattern, train_pos_filepattern,
123167
124168 with beam .Pipeline () as pipeline :
125169 with beam_impl .Context (temp_dir = tempfile .mkdtemp ()):
126- # pylint: disable=no-value-for-parameter
127- train_data = pipeline | 'ReadTrain' >> ReadAndShuffleData (
128- ( train_neg_filepattern , train_pos_filepattern ))
129- # pylint: disable=no-value-for-parameter
130- test_data = pipeline | 'ReadTest' >> ReadAndShuffleData (
131- ( test_neg_filepattern , test_pos_filepattern ))
132-
133- metadata = dataset_metadata . DatasetMetadata ( dataset_schema . Schema ({
134- REVIEW_COLUMN : dataset_schema . ColumnSchema (
135- tf . string , [], dataset_schema . FixedColumnRepresentation ()),
136- LABEL_COLUMN : dataset_schema . ColumnSchema (
137- tf . int64 , [], dataset_schema . FixedColumnRepresentation ()),
138- } ))
170+ train_data = (
171+ pipeline |
172+ 'ReadTrain' >> tfrecordio . ReadFromTFRecord (
173+ shuffled_train_filepattern ,
174+ coder = example_proto_coder . ExampleProtoCoder (
175+ RAW_DATA_METADATA . schema ) ))
176+
177+ test_data = (
178+ pipeline |
179+ 'ReadTest' >> tfrecordio . ReadFromTFRecord (
180+ shuffled_test_filepattern ,
181+ coder = example_proto_coder . ExampleProtoCoder (
182+ RAW_DATA_METADATA . schema ) ))
139183
140184 def preprocessing_fn (inputs ):
141185 """Preprocess input columns into transformed columns."""
@@ -153,12 +197,12 @@ def preprocessing_fn(inputs):
153197 }
154198
155199 (transformed_train_data , transformed_metadata ), transform_fn = (
156- (train_data , metadata )
200+ (train_data , RAW_DATA_METADATA )
157201 | 'AnalyzeAndTransform' >> beam_impl .AnalyzeAndTransformDataset (
158202 preprocessing_fn ))
159203
160204 transformed_test_data , _ = (
161- ((test_data , metadata ), transform_fn )
205+ ((test_data , RAW_DATA_METADATA ), transform_fn )
162206 | 'Transform' >> beam_impl .TransformDataset ())
163207
164208 _ = (
@@ -183,7 +227,9 @@ def preprocessing_fn(inputs):
183227
184228
185229def train_and_evaluate (transformed_train_filepattern ,
186- transformed_test_filepattern , transformed_metadata_dir ):
230+ transformed_test_filepattern , transformed_metadata_dir ,
231+ num_train_instances = NUM_TRAIN_INSTANCES ,
232+ num_test_instances = NUM_TEST_INSTANCES ):
187233 """Train the model on training data and evaluate on evaluation data.
188234
189235 Args:
@@ -192,6 +238,8 @@ def train_and_evaluate(transformed_train_filepattern,
192238 transformed_test_filepattern: Base filename for transformed evaluation data
193239 shards
194240 transformed_metadata_dir: Directory containing transformed data metadata
241+ num_train_instances: Number of instances in train set
242+ num_test_instances: Number of instances in test set
195243
196244 Returns:
197245 The results from the estimator's 'evaluate' method
@@ -219,7 +267,7 @@ def train_and_evaluate(transformed_train_filepattern,
219267 # Estimate the model using the default optimizer.
220268 estimator .fit (
221269 input_fn = train_input_fn ,
222- max_steps = TRAIN_NUM_EPOCHS * NUM_TRAIN_INSTANCES / TRAIN_BATCH_SIZE )
270+ max_steps = TRAIN_NUM_EPOCHS * num_train_instances / TRAIN_BATCH_SIZE )
223271
224272 # Evaluate model on eval dataset.
225273 eval_input_fn = input_fn_maker .build_training_input_fn (
@@ -228,7 +276,7 @@ def train_and_evaluate(transformed_train_filepattern,
228276 training_batch_size = 1 ,
229277 label_keys = [LABEL_COLUMN ])
230278
231- return estimator .evaluate (input_fn = eval_input_fn , steps = NUM_TEST_INSTANCES )
279+ return estimator .evaluate (input_fn = eval_input_fn , steps = num_test_instances )
232280
233281
234282def main ():
@@ -248,14 +296,19 @@ def main():
248296 train_pos_filepattern = os .path .join (args .input_data_dir , 'train/pos/*' )
249297 test_neg_filepattern = os .path .join (args .input_data_dir , 'test/neg/*' )
250298 test_pos_filepattern = os .path .join (args .input_data_dir , 'test/pos/*' )
299+ shuffled_train_filebase = os .path .join (transformed_data_dir , 'train_shuffled' )
300+ shuffled_test_filebase = os .path .join (transformed_data_dir , 'test_shuffled' )
251301 transformed_train_filebase = os .path .join (transformed_data_dir ,
252302 'train_transformed' )
253303 transformed_test_filebase = os .path .join (transformed_data_dir ,
254304 'test_transformed' )
255305 transformed_metadata_dir = os .path .join (transformed_data_dir , 'metadata' )
256306
257- transform_data (train_neg_filepattern , train_pos_filepattern ,
258- test_neg_filepattern , test_pos_filepattern ,
307+ read_and_shuffle_data (train_neg_filepattern , train_pos_filepattern ,
308+ test_neg_filepattern , test_pos_filepattern ,
309+ shuffled_train_filebase , shuffled_test_filebase )
310+
311+ transform_data (shuffled_train_filebase + '*' , shuffled_test_filebase + '*' ,
259312 transformed_train_filebase , transformed_test_filebase ,
260313 transformed_metadata_dir )
261314
0 commit comments