diff --git a/tensorflow/BUILD b/tensorflow/BUILD index 37a04a2f3c3..5b9517a0e55 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -129,8 +129,6 @@ filegroup( "//tensorflow/contrib/tensorboard:all_files", "//tensorflow/contrib/testing:all_files", "//tensorflow/contrib/tfprof/python/tools/tfprof:all_files", - "//tensorflow/contrib/tfprof/tools/tfprof:all_files", - "//tensorflow/contrib/tfprof/tools/tfprof/internal:all_files", "//tensorflow/contrib/training:all_files", "//tensorflow/contrib/util:all_files", "//tensorflow/core:all_files", @@ -188,6 +186,8 @@ filegroup( "//tensorflow/tools/proto_text:all_files", "//tensorflow/tools/quantization:all_files", "//tensorflow/tools/test:all_files", + "//tensorflow/tools/tfprof:all_files", + "//tensorflow/tools/tfprof/internal:all_files", "//tensorflow/user_ops:all_files", "//third_party/hadoop:all_files", ], diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD index 1bcbba22675..57579065923 100644 --- a/tensorflow/cc/BUILD +++ b/tensorflow/cc/BUILD @@ -430,6 +430,7 @@ tf_cc_test( "//tensorflow/core:core_cpu", "//tensorflow/core:framework", "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", "//tensorflow/core:tensorflow", "//tensorflow/core:test", diff --git a/tensorflow/cc/saved_model/BUILD b/tensorflow/cc/saved_model/BUILD index eeedaaff27b..90c87210b18 100644 --- a/tensorflow/cc/saved_model/BUILD +++ b/tensorflow/cc/saved_model/BUILD @@ -34,6 +34,7 @@ cc_library( ":constants", "//tensorflow/core:core_cpu", "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", "//tensorflow/core:tensorflow", "//tensorflow/core/util/tensor_bundle:naming", @@ -63,7 +64,6 @@ tf_cc_test( filegroup( name = "saved_model_half_plus_two", srcs = glob([ - "testdata/half_plus_two/**", "testdata/half_plus_two_pbtxt/**", "testdata/half_plus_two_sharded/**", ]), diff --git a/tensorflow/cc/saved_model/constants.h b/tensorflow/cc/saved_model/constants.h index 
f67c56ba1ca..654e7651702 100644 --- a/tensorflow/cc/saved_model/constants.h +++ b/tensorflow/cc/saved_model/constants.h @@ -30,6 +30,9 @@ constexpr char kSavedModelFilenamePb[] = "saved_model.pb"; // SavedModel text format proto filename. constexpr char kSavedModelFilenamePbTxt[] = "saved_model.pbtxt"; +// SavedModel legacy init op key. +constexpr char kSavedModelLegacyInitOpKey[] = "legacy_init_op"; + // Directory in which to save the SavedModel variables. constexpr char kSavedModelVariablesDirectory[] = "variables"; diff --git a/tensorflow/cc/saved_model/loader.cc b/tensorflow/cc/saved_model/loader.cc index 1f952293550..c654d56e8a1 100644 --- a/tensorflow/cc/saved_model/loader.cc +++ b/tensorflow/cc/saved_model/loader.cc @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/lib/monitoring/counter.h" #include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/protobuf_internal.h" #include "tensorflow/core/protobuf/saved_model.pb.h" #include "tensorflow/core/public/session.h" #include "tensorflow/core/public/session_options.h" @@ -83,10 +84,32 @@ Status LoadMetaGraphIntoSession(const MetaGraphDef& meta_graph_def, return (*session)->Create(meta_graph_def.graph_def()); } -Status Restore(const RunOptions& run_options, const string& export_dir, - const StringPiece restore_op_name, - const StringPiece variable_filename_const_op_name, - Session* session) { +Tensor CreateStringTensor(const string& value) { + Tensor tensor(DT_STRING, TensorShape({})); + tensor.scalar()() = value; + return tensor; +} + +void AddAssetsTensorsToInputs(const StringPiece export_dir, + const std::vector& asset_file_defs, + std::vector>* inputs) { + if (asset_file_defs.empty()) { + return; + } + for (auto& asset_file_def : asset_file_defs) { + Tensor assets_file_path_tensor = CreateStringTensor(io::JoinPath( + export_dir, kSavedModelAssetsDirectory, asset_file_def.filename())); + inputs->push_back( + 
{asset_file_def.tensor_info().name(), assets_file_path_tensor}); + } +} + +Status RunRestore(const RunOptions& run_options, const string& export_dir, + const StringPiece restore_op_name, + const StringPiece variable_filename_const_op_name, + const std::vector& asset_file_defs, + Session* session) { + LOG(INFO) << "Restoring SavedModel bundle."; // Find path to variables to be restored in export directory. const string variables_directory = io::JoinPath(export_dir, kSavedModelVariablesDirectory); @@ -109,11 +132,54 @@ Status Restore(const RunOptions& run_options, const string& export_dir, std::vector> inputs = { {variable_filename_const_op_name.ToString(), variables_path_tensor}}; + AddAssetsTensorsToInputs(export_dir, asset_file_defs, &inputs); + RunMetadata run_metadata; return session->Run(run_options, inputs, {}, {restore_op_name.ToString()}, nullptr /* outputs */, &run_metadata); } +Status RunLegacyInitOp(const RunOptions& run_options, const string& export_dir, + const MetaGraphDef& meta_graph_def, + const std::vector& asset_file_defs, + Session* session) { + LOG(INFO) << "Running LegacyInitOp on SavedModel bundle."; + const auto& collection_def_map = meta_graph_def.collection_def(); + const auto init_op_it = collection_def_map.find(kSavedModelLegacyInitOpKey); + if (init_op_it != collection_def_map.end()) { + if (init_op_it->second.node_list().value_size() != 1) { + return errors::FailedPrecondition(strings::StrCat( + "Expected exactly one serving init op in : ", export_dir)); + } + std::vector> inputs; + AddAssetsTensorsToInputs(export_dir, asset_file_defs, &inputs); + RunMetadata run_metadata; + const StringPiece legacy_init_op_name = + init_op_it->second.node_list().value(0); + return session->Run(run_options, inputs, {}, + {legacy_init_op_name.ToString()}, nullptr /* outputs */, + &run_metadata); + } + return Status::OK(); +} + +Status GetAssetFileDefs(const MetaGraphDef& meta_graph_def, + std::vector* asset_file_defs) { + const auto& collection_def_map = 
meta_graph_def.collection_def(); + const auto assets_it = collection_def_map.find(kSavedModelAssetsKey); + if (assets_it == collection_def_map.end()) { + return Status::OK(); + } + const auto& any_assets = assets_it->second.any_list().value(); + for (const auto& any_asset : any_assets) { + AssetFileDef asset_file_def; + TF_RETURN_IF_ERROR( + ParseAny(any_asset, &asset_file_def, "tensorflow.AssetFileDef")); + asset_file_defs->push_back(asset_file_def); + } + return Status::OK(); +} + Status LoadSavedModelInternal(const SessionOptions& session_options, const RunOptions& run_options, const string& export_dir, @@ -134,12 +200,19 @@ Status LoadSavedModelInternal(const SessionOptions& session_options, TF_RETURN_IF_ERROR(LoadMetaGraphIntoSession( bundle->meta_graph_def, session_options, &bundle->session)); + std::vector asset_file_defs; TF_RETURN_IF_ERROR( - Restore(run_options, export_dir, - bundle->meta_graph_def.saver_def().restore_op_name(), - bundle->meta_graph_def.saver_def().filename_tensor_name(), - bundle->session.get())); - + GetAssetFileDefs(bundle->meta_graph_def, &asset_file_defs)); + TF_RETURN_IF_ERROR( + RunRestore(run_options, export_dir, + bundle->meta_graph_def.saver_def().restore_op_name(), + bundle->meta_graph_def.saver_def().filename_tensor_name(), + asset_file_defs, bundle->session.get())); + // TODO(sukritiramesh): Add support for a single main op to run upon load, + // which will supersede the legacy_init_op and separate RunRestore. + TF_RETURN_IF_ERROR(RunLegacyInitOp(run_options, export_dir, + bundle->meta_graph_def, asset_file_defs, + bundle->session.get())); return Status::OK(); } diff --git a/tensorflow/cc/saved_model/loader_test.cc b/tensorflow/cc/saved_model/loader_test.cc index a7e4d6cfde8..55a22e4e817 100644 --- a/tensorflow/cc/saved_model/loader_test.cc +++ b/tensorflow/cc/saved_model/loader_test.cc @@ -29,7 +29,6 @@ limitations under the License. 
namespace tensorflow { namespace { -constexpr char kTestDataPb[] = "cc/saved_model/testdata/half_plus_two"; constexpr char kTestDataPbTxt[] = "cc/saved_model/testdata/half_plus_two_pbtxt"; constexpr char kTestDataSharded[] = "cc/saved_model/testdata/half_plus_two_sharded"; @@ -45,12 +44,26 @@ class LoaderTest : public ::testing::Test { return example.SerializeAsString(); } + void ValidateAssets(const string& export_dir, + const SavedModelBundle& bundle) { + const string asset_directory = + io::JoinPath(export_dir, kSavedModelAssetsDirectory); + const string asset_filename = "foo.txt"; + const string asset_filepath = io::JoinPath(asset_directory, asset_filename); + EXPECT_TRUE(Env::Default()->FileExists(asset_filepath)); + + std::vector path_outputs; + TF_ASSERT_OK( + bundle.session->Run({}, {"filename_tensor:0"}, {}, &path_outputs)); + ASSERT_EQ(1, path_outputs.size()); + + test::ExpectTensorEqual( + test::AsTensor({"foo.txt"}, TensorShape({})), path_outputs[0]); + } + void CheckSavedModelBundle(const string& export_dir, const SavedModelBundle& bundle) { - const string asset_path = - io::JoinPath(export_dir, kSavedModelAssetsDirectory, "foo.txt"); - EXPECT_TRUE(Env::Default()->FileExists(asset_path)); - + ValidateAssets(export_dir, bundle); // Retrieve the regression signature from meta graph def. 
const auto signature_def_map = bundle.meta_graph_def.signature_def(); const auto signature_def = signature_def_map.at(kRegressMethodName); @@ -151,18 +164,6 @@ TEST_F(LoaderTest, PbtxtFormat) { CheckSavedModelBundle(export_dir, bundle); } -TEST_F(LoaderTest, SingleShardVariables) { - SavedModelBundle bundle; - SessionOptions session_options; - RunOptions run_options; - - const string export_dir = - io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataPb); - TF_ASSERT_OK(LoadSavedModel(session_options, run_options, export_dir, - {kSavedModelTagServe}, &bundle)); - CheckSavedModelBundle(export_dir, bundle); -} - TEST_F(LoaderTest, InvalidExportPath) { SavedModelBundle bundle; RunOptions run_options; diff --git a/tensorflow/cc/saved_model/testdata/half_plus_two/assets/foo.txt b/tensorflow/cc/saved_model/testdata/half_plus_two/assets/foo.txt deleted file mode 100644 index f9ff0366880..00000000000 --- a/tensorflow/cc/saved_model/testdata/half_plus_two/assets/foo.txt +++ /dev/null @@ -1 +0,0 @@ -asset-file-contents \ No newline at end of file diff --git a/tensorflow/cc/saved_model/testdata/half_plus_two/saved_model.pb b/tensorflow/cc/saved_model/testdata/half_plus_two/saved_model.pb deleted file mode 100644 index e894f9b1011..00000000000 Binary files a/tensorflow/cc/saved_model/testdata/half_plus_two/saved_model.pb and /dev/null differ diff --git a/tensorflow/cc/saved_model/testdata/half_plus_two/variables/variables.data-00000-of-00001 b/tensorflow/cc/saved_model/testdata/half_plus_two/variables/variables.data-00000-of-00001 deleted file mode 100644 index 20bc7d454dd..00000000000 Binary files a/tensorflow/cc/saved_model/testdata/half_plus_two/variables/variables.data-00000-of-00001 and /dev/null differ diff --git a/tensorflow/cc/saved_model/testdata/half_plus_two/variables/variables.index b/tensorflow/cc/saved_model/testdata/half_plus_two/variables/variables.index deleted file mode 100644 index e7df518f5b5..00000000000 Binary files 
a/tensorflow/cc/saved_model/testdata/half_plus_two/variables/variables.index and /dev/null differ diff --git a/tensorflow/cc/saved_model/testdata/half_plus_two_pbtxt/saved_model.pbtxt b/tensorflow/cc/saved_model/testdata/half_plus_two_pbtxt/saved_model.pbtxt index 2e714d262db..693262eb4d7 100644 --- a/tensorflow/cc/saved_model/testdata/half_plus_two_pbtxt/saved_model.pbtxt +++ b/tensorflow/cc/saved_model/testdata/half_plus_two_pbtxt/saved_model.pbtxt @@ -102,6 +102,24 @@ meta_graphs { type: "type" } } + op { + name: "MergeV2Checkpoints" + input_arg { + name: "checkpoint_prefixes" + type: DT_STRING + } + input_arg { + name: "destination_prefix" + type: DT_STRING + } + attr { + name: "delete_old_dirs" + type: "bool" + default_value { + b: true + } + } + } op { name: "Mul" input_arg { @@ -140,6 +158,35 @@ meta_graphs { op { name: "NoOp" } + op { + name: "Pack" + input_arg { + name: "values" + type_attr: "T" + number_attr: "N" + } + output_arg { + name: "output" + type_attr: "T" + } + attr { + name: "N" + type: "int" + has_minimum: true + minimum: 1 + } + attr { + name: "T" + type: "type" + } + attr { + name: "axis" + type: "int" + default_value { + i: 0 + } + } + } op { name: "ParseExample" input_arg { @@ -267,9 +314,9 @@ meta_graphs { } } op { - name: "SaveSlices" + name: "SaveV2" input_arg { - name: "filename" + name: "prefix" type: DT_STRING } input_arg { @@ -277,15 +324,15 @@ meta_graphs { type: DT_STRING } input_arg { - name: "shapes_and_slices" + name: "shape_and_slices" type: DT_STRING } input_arg { - name: "data" - type_list_attr: "T" + name: "tensors" + type_list_attr: "dtypes" } attr { - name: "T" + name: "dtypes" type: "list(type)" has_minimum: true minimum: 1 @@ -311,19 +358,29 @@ meta_graphs { } } op { - name: "ShardedFilespec" + name: "StringJoin" input_arg { - name: "basename" + name: "inputs" type: DT_STRING - } - input_arg { - name: "num_shards" - type: DT_INT32 + number_attr: "N" } output_arg { - name: "filename" + name: "output" type: DT_STRING } + 
attr { + name: "N" + type: "int" + has_minimum: true + minimum: 1 + } + attr { + name: "separator" + type: "string" + default_value { + s: "" + } + } } op { name: "Variable" @@ -899,6 +956,244 @@ meta_graphs { } } } + node { + name: "Const" + op: "Const" + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + attr { + key: "dtype" + value { + type: DT_STRING + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_STRING + tensor_shape { + } + string_val: "/tmp/original/export/assets/foo.txt" + } + } + } + } + node { + name: "filename_tensor/initial_value" + op: "Const" + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + attr { + key: "dtype" + value { + type: DT_STRING + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_STRING + tensor_shape { + } + string_val: "foo.txt" + } + } + } + } + node { + name: "filename_tensor" + op: "Variable" + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + attr { + key: "container" + value { + s: "" + } + } + attr { + key: "dtype" + value { + type: DT_STRING + } + } + attr { + key: "shape" + value { + shape { + } + } + } + attr { + key: "shared_name" + value { + s: "" + } + } + } + node { + name: "filename_tensor/Assign" + op: "Assign" + input: "filename_tensor" + input: "filename_tensor/initial_value" + attr { + key: "T" + value { + type: DT_STRING + } + } + attr { + key: "_class" + value { + list { + s: "loc:@filename_tensor" + } + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + attr { + key: "use_locking" + value { + b: true + } + } + attr { + key: "validate_shape" + value { + b: true + } + } + } + node { + name: "filename_tensor/read" + op: "Identity" + input: "filename_tensor" + attr { + key: "T" + value { + type: DT_STRING + } + } + attr { + key: "_class" + value { + list { + s: "loc:@filename_tensor" + } + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } 
+ } + node { + name: "Assign/value" + op: "Const" + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + attr { + key: "dtype" + value { + type: DT_STRING + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_STRING + tensor_shape { + } + string_val: "foo.txt" + } + } + } + } + node { + name: "Assign" + op: "Assign" + input: "filename_tensor" + input: "Assign/value" + attr { + key: "T" + value { + type: DT_STRING + } + } + attr { + key: "_class" + value { + list { + s: "loc:@filename_tensor" + } + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + attr { + key: "use_locking" + value { + b: false + } + } + attr { + key: "validate_shape" + value { + b: true + } + } + } node { name: "Identity" op: "Identity" @@ -931,6 +1226,11 @@ meta_graphs { input: "^a/Assign" input: "^b/Assign" } + node { + name: "group_deps" + op: "NoOp" + input: "^Assign" + } node { name: "save/Const" op: "Const" @@ -961,6 +1261,63 @@ meta_graphs { } } } + node { + name: "save/StringJoin/inputs_1" + op: "Const" + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + attr { + key: "dtype" + value { + type: DT_STRING + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_STRING + tensor_shape { + } + string_val: "_temp_ff2bd25218b646ea9ed224eecdce5e79/part" + } + } + } + } + node { + name: "save/StringJoin" + op: "StringJoin" + input: "save/Const" + input: "save/StringJoin/inputs_1" + attr { + key: "N" + value { + i: 2 + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + attr { + key: "separator" + value { + s: "" + } + } + } node { name: "save/num_shards" op: "Const" @@ -1024,7 +1381,7 @@ meta_graphs { node { name: "save/ShardedFilename" op: "ShardedFilename" - input: "save/Const" + input: "save/StringJoin" input: "save/ShardedFilename/shard" input: "save/num_shards" attr { @@ -1038,7 +1395,7 @@ meta_graphs { } } node { - name: "save/save/tensor_names" + name: 
"save/SaveV2/tensor_names" op: "Const" attr { key: "_output_shapes" @@ -1075,7 +1432,7 @@ meta_graphs { } } node { - name: "save/save/shapes_and_slices" + name: "save/SaveV2/shape_and_slices" op: "Const" attr { key: "_output_shapes" @@ -1112,15 +1469,15 @@ meta_graphs { } } node { - name: "save/save" - op: "SaveSlices" + name: "save/SaveV2" + op: "SaveV2" input: "save/ShardedFilename" - input: "save/save/tensor_names" - input: "save/save/shapes_and_slices" + input: "save/SaveV2/tensor_names" + input: "save/SaveV2/shape_and_slices" input: "a" input: "b" attr { - key: "T" + key: "dtypes" value { list { type: DT_FLOAT @@ -1133,7 +1490,7 @@ meta_graphs { name: "save/control_dependency" op: "Identity" input: "save/ShardedFilename" - input: "^save/save" + input: "^save/SaveV2" attr { key: "T" value { @@ -1159,11 +1516,65 @@ meta_graphs { } } node { - name: "save/ShardedFilespec" - op: "ShardedFilespec" - input: "save/Const" - input: "save/num_shards" + name: "save/MergeV2Checkpoints/checkpoint_prefixes" + op: "Pack" + input: "save/ShardedFilename" input: "^save/control_dependency" + attr { + key: "N" + value { + i: 1 + } + } + attr { + key: "T" + value { + type: DT_STRING + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 1 + } + } + } + } + } + attr { + key: "axis" + value { + i: 0 + } + } + } + node { + name: "save/MergeV2Checkpoints" + op: "MergeV2Checkpoints" + input: "save/MergeV2Checkpoints/checkpoint_prefixes" + input: "save/Const" + attr { + key: "delete_old_dirs" + value { + b: true + } + } + } + node { + name: "save/Identity" + op: "Identity" + input: "save/Const" + input: "^save/control_dependency" + input: "^save/MergeV2Checkpoints" + attr { + key: "T" + value { + type: DT_STRING + } + } attr { key: "_output_shapes" value { @@ -1467,12 +1878,39 @@ meta_graphs { } saver_def { filename_tensor_name: "save/Const:0" - save_tensor_name: "save/ShardedFilespec:0" + save_tensor_name: "save/Identity:0" restore_op_name: 
"save/restore_all" max_to_keep: 5 sharded: true keep_checkpoint_every_n_hours: 10000.0 - version: V1 + version: V2 + } + collection_def { + key: "asset_filepaths" + value { + node_list { + value: "Const:0" + } + } + } + collection_def { + key: "legacy_init_op" + value { + node_list { + value: "group_deps" + } + } + } + collection_def { + key: "saved_model_assets" + value { + any_list { + value { + type_url: "type.googleapis.com/tensorflow.AssetFileDef" + value: "\n\t\n\007Const:0\022\007foo.txt" + } + } + } } collection_def { key: "trainable_variables" diff --git a/tensorflow/cc/saved_model/testdata/half_plus_two_sharded/saved_model.pb b/tensorflow/cc/saved_model/testdata/half_plus_two_sharded/saved_model.pb index e894f9b1011..0df49f21685 100644 Binary files a/tensorflow/cc/saved_model/testdata/half_plus_two_sharded/saved_model.pb and b/tensorflow/cc/saved_model/testdata/half_plus_two_sharded/saved_model.pb differ diff --git a/tensorflow/cc/training/queue_runner.cc b/tensorflow/cc/training/queue_runner.cc index 81f49c5dcfc..79d306f3676 100644 --- a/tensorflow/cc/training/queue_runner.cc +++ b/tensorflow/cc/training/queue_runner.cc @@ -54,7 +54,8 @@ Status QueueRunner::Init(const QueueRunnerDef& queue_runner_def) { } QueueRunner::~QueueRunner() { - should_stop_ = true; + // Cannot run Stop() here because the session might already be closed or + // destroyed. 
Join(); } @@ -72,6 +73,15 @@ Status QueueRunner::Start(Session* sess) { return Status::OK(); } +Status QueueRunner::Stop(Session* sess) { + should_stop_ = true; + if (cancel_op_name_.empty()) { + return Status::OK(); + } else { + return sess->Run({}, {}, {cancel_op_name_}, nullptr); + } +} + Status QueueRunner::Join() { thread_pool_.reset(); started_ = false; @@ -80,9 +90,8 @@ Status QueueRunner::Join() { void QueueRunner::Run(Session* sess, const string& enqueue_op) { bool decremented = false; - while (!should_stop_) { - std::vector outputs; - auto status = sess->Run({}, {}, {enqueue_op}, &outputs); + while (!should_stop_.load()) { + auto status = sess->Run({}, {}, {enqueue_op}, nullptr); if (status.ok()) { continue; } else if (queue_closed_exception_types_.count( @@ -94,19 +103,25 @@ void QueueRunner::Run(Session* sess, const string& enqueue_op) { // If all enqueue ops have finished, run the close op. if (runs_ == 0 && !close_op_name_.empty()) { - std::vector outputs; - auto s = sess->Run({}, {}, {close_op_name_}, &outputs); - if (!s.ok()) { - status_ = status; + auto s = sess->Run({}, {}, {close_op_name_}, nullptr); + if (!s.ok() && status_.ok() && + queue_closed_exception_types_.count(static_cast(s.code())) == + 0) { + status_ = s; } } } else { - mutex_lock l(mu_); - should_stop_ = true; - // Only record the first failure status. - if (status_.ok()) { - status_ = status; + { + mutex_lock l(mu_); + should_stop_ = true; + // Only record the first failure status. + if (status_.ok()) { + status_ = status; + } } + // Stop the queue runner immediately to propagate the error to + // subsequent queues. + Stop(sess); } } diff --git a/tensorflow/cc/training/queue_runner.h b/tensorflow/cc/training/queue_runner.h index 7eeab8bd45a..c3fe4026efe 100644 --- a/tensorflow/cc/training/queue_runner.h +++ b/tensorflow/cc/training/queue_runner.h @@ -20,6 +20,7 @@ limitations under the License. 
#include #include #include + #include "tensorflow/core/lib/core/error_codes.pb.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/threadpool.h" @@ -49,6 +50,9 @@ class QueueRunner { // Starts the queue runner with the given session. Status Start(Session* sess); + // Requests to stop and runs the cancel op. + Status Stop(Session* sess); + // Joins all the threads. Returns okay if all threads run successfully; // otherwise returns the first captured failure status. Status Join(); @@ -60,14 +64,14 @@ class QueueRunner { string queue_name_; std::vector enqueue_op_names_; string close_op_name_; - // The cancel op is not being called currently. string cancel_op_name_; // code::Code casted to int to avoid a hash function. std::unordered_set queue_closed_exception_types_; std::unique_ptr thread_pool_; - bool should_stop_; + std::atomic should_stop_; std::atomic started_; + condition_variable wait_to_close_; mutex mu_; // TODO(yuefengz): implement c++ coordinator. int runs_ = 0; diff --git a/tensorflow/cc/training/queue_runner_test.cc b/tensorflow/cc/training/queue_runner_test.cc index 8719677274a..29165778c5c 100644 --- a/tensorflow/cc/training/queue_runner_test.cc +++ b/tensorflow/cc/training/queue_runner_test.cc @@ -14,8 +14,10 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/cc/training/queue_runner.h" + #include #include + #include "tensorflow/cc/framework/scope.h" #include "tensorflow/cc/ops/standard_ops.h" #include "tensorflow/core/framework/graph.pb.h" @@ -23,39 +25,42 @@ limitations under the License. 
#include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/lib/core/error_codes.pb.h" +#include "tensorflow/core/lib/core/notification.h" #include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/protobuf/queue_runner.pb.h" #include "tensorflow/core/public/session.h" +namespace tensorflow { namespace { -using ::tensorflow::DataType; -using ::tensorflow::error::Code; -using ::tensorflow::GraphDef; -using ::tensorflow::ops::Assign; -using ::tensorflow::ops::Const; -using ::tensorflow::ops::CountUpTo; -using ::tensorflow::ops::FIFOQueue; -using ::tensorflow::ops::InputList; -using ::tensorflow::ops::QueueClose; -using ::tensorflow::ops::QueueDequeue; -using ::tensorflow::ops::QueueEnqueue; -using ::tensorflow::ops::Square; -using ::tensorflow::ops::Variable; -using ::tensorflow::QueueRunner; -using ::tensorflow::QueueRunnerDef; -using ::tensorflow::Scope; -using ::tensorflow::Session; -using ::tensorflow::SessionOptions; -using ::tensorflow::Tensor; -using ::tensorflow::TensorShape; +using error::Code; +using ops::Assign; +using ops::Const; +using ops::CountUpTo; +using ops::FIFOQueue; +using ops::QueueClose; +using ops::QueueDequeue; +using ops::QueueEnqueue; +using ops::Square; +using ops::Variable; constexpr char kAssignOpName[] = "assign"; +constexpr char kCancelOp0[] = "cancel0"; +constexpr char kCancelOp1[] = "cancel1"; +constexpr char kCloseOp0[] = "close0"; +constexpr char kCloseOp1[] = "close1"; constexpr char kCountUpToOpName[] = "count"; +constexpr char kDequeueOp0[] = "dequeue0"; +constexpr char kDequeueOp1[] = "dequeue1"; +constexpr char kEnqueueOp0[] = "enqueue0"; +constexpr char kEnqueueOp1[] = "enqueue1"; constexpr char kIllegalOpName1[] = "would fail"; constexpr char kIllegalOpName2[] = "fail again"; constexpr char kQueueName[] = "unit_test"; +constexpr char kQueueName0[] = 
"q0"; +constexpr char kQueueName1[] = "q1"; constexpr char kSquareOpName[] = "square"; constexpr char kVarOpName[] = "var"; @@ -75,7 +80,7 @@ GraphDef BuildSimpleGraph() { QueueRunnerDef BuildQueueRunnerDef( const std::string& queue_name, const std::vector& enqueue_ops, - const std::string& close_op, + const std::string& close_op, const std::string& cancel_op, const std::vector& queue_closed_error_codes) { QueueRunnerDef queue_runner_def; *queue_runner_def.mutable_queue_name() = kQueueName; @@ -83,6 +88,7 @@ QueueRunnerDef BuildQueueRunnerDef( *queue_runner_def.mutable_enqueue_op_name()->Add() = enqueue_op; } *queue_runner_def.mutable_close_op_name() = close_op; + *queue_runner_def.mutable_cancel_op_name() = cancel_op; for (const auto& error_code : queue_closed_error_codes) { *queue_runner_def.mutable_queue_closed_exception_types()->Add() = error_code; @@ -96,8 +102,7 @@ std::unique_ptr BuildSessionAndInitVariable( std::unique_ptr session(NewSession(options)); TF_CHECK_OK(session->Create(graph_def)); - std::vector nothing; - TF_CHECK_OK(session->Run({}, {}, {kAssignOpName}, ¬hing)); + TF_CHECK_OK(session->Run({}, {}, {kAssignOpName}, nullptr)); return session; } @@ -106,7 +111,7 @@ TEST(QueueRunnerTest, BasicTest) { auto session = BuildSessionAndInitVariable(graph_def); QueueRunnerDef queue_runner_def = BuildQueueRunnerDef( - kQueueName, {kCountUpToOpName, kCountUpToOpName}, kSquareOpName, {}); + kQueueName, {kCountUpToOpName, kCountUpToOpName}, kSquareOpName, "", {}); QueueRunner qr(queue_runner_def); qr.Start(session.get()); @@ -123,7 +128,7 @@ TEST(QueueRunnerTest, QueueClosedCode) { auto session = BuildSessionAndInitVariable(graph_def); QueueRunnerDef queue_runner_def = - BuildQueueRunnerDef(kQueueName, {kCountUpToOpName}, kSquareOpName, + BuildQueueRunnerDef(kQueueName, {kCountUpToOpName}, kSquareOpName, "", {Code::OUT_OF_RANGE, Code::CANCELLED}); QueueRunner qr(queue_runner_def); @@ -141,60 +146,167 @@ TEST(QueueRunnerDef, CatchErrorInJoin) { auto session = 
BuildSessionAndInitVariable(graph_def); QueueRunnerDef queue_runner_def = BuildQueueRunnerDef( - kQueueName, {kIllegalOpName1, kIllegalOpName2}, kCountUpToOpName, {}); + kQueueName, {kIllegalOpName1, kIllegalOpName2}, kCountUpToOpName, "", {}); QueueRunner qr(queue_runner_def); qr.Start(session.get()); EXPECT_EQ(qr.Join().code(), Code::NOT_FOUND); } -TEST(QueueRunnerTest, RealEnqueueDequeue) { +GraphDef BuildDoubleQueueGraph() { Scope root = Scope::NewRootScope(); - auto q0 = FIFOQueue(root.WithOpName("q0"), {DataType::DT_INT32}); + auto q0 = FIFOQueue(root.WithOpName(kQueueName0), {DataType::DT_INT32}); auto ten = Const(root, 10); - auto enqueue0 = QueueEnqueue(root.WithOpName("enqueue0"), q0, {ten}); - auto close0 = QueueClose(root.WithOpName("close0"), q0); - auto q1 = FIFOQueue(root.WithOpName("q1"), {DataType::DT_INT32}); + auto enqueue0 = QueueEnqueue(root.WithOpName(kEnqueueOp0), q0, {ten}); + auto close0 = QueueClose(root.WithOpName(kCloseOp0), q0); + auto cancel0 = QueueClose(root.WithOpName(kCancelOp0), q0, + QueueClose::CancelPendingEnqueues(true)); + auto q1 = FIFOQueue(root.WithOpName(kQueueName1), {DataType::DT_INT32}); auto dequeue0 = - QueueDequeue(root.WithOpName("dequeue0"), q0, {DataType::DT_INT32}); - auto enqueue1 = QueueEnqueue(root.WithOpName("enqueue1"), q1, {dequeue0[0]}); + QueueDequeue(root.WithOpName(kDequeueOp0), q0, {DataType::DT_INT32}); + auto enqueue1 = QueueEnqueue(root.WithOpName(kEnqueueOp1), q1, {dequeue0[0]}); auto dequeue1 = - QueueDequeue(root.WithOpName("dequeue1"), q1, {DataType::DT_INT32}); - auto close1 = QueueClose(root.WithOpName("close1"), q1); + QueueDequeue(root.WithOpName(kDequeueOp1), q1, {DataType::DT_INT32}); + auto close1 = QueueClose(root.WithOpName(kCloseOp1), q1); + auto cancel1 = QueueClose(root.WithOpName(kCancelOp1), q1, + QueueClose::CancelPendingEnqueues(true)); GraphDef graph_def; TF_EXPECT_OK(root.ToGraphDef(&graph_def)); + return graph_def; +} + +TEST(QueueRunnerTest, RealEnqueueDequeue) { + auto 
graph_def = BuildDoubleQueueGraph(); SessionOptions options; std::unique_ptr session(NewSession(options)); TF_CHECK_OK(session->Create(graph_def)); QueueRunnerDef queue_runner_def = - BuildQueueRunnerDef(kQueueName, {"enqueue1"}, "close1", {}); + BuildQueueRunnerDef(kQueueName, {kEnqueueOp1}, kCloseOp1, "", {}); QueueRunner qr; qr.Init(queue_runner_def); TF_CHECK_OK(qr.Start(session.get())); - std::vector outputs; - TF_EXPECT_OK(session->Run({}, {}, {"enqueue0"}, &outputs)); - TF_EXPECT_OK(session->Run({}, {}, {"enqueue0"}, &outputs)); - TF_EXPECT_OK(session->Run({}, {}, {"close0"}, &outputs)); + TF_EXPECT_OK(session->Run({}, {}, {kEnqueueOp0}, nullptr)); + TF_EXPECT_OK(session->Run({}, {}, {kEnqueueOp0}, nullptr)); + // Closing queue 0 would also close the queue runner. + TF_EXPECT_OK(session->Run({}, {}, {kCloseOp0}, nullptr)); TF_EXPECT_OK(qr.Join()); std::vector dq1; - TF_EXPECT_OK(session->Run({}, {"dequeue1"}, {}, &dq1)); + TF_EXPECT_OK(session->Run({}, {kDequeueOp1}, {}, &dq1)); EXPECT_EQ(*dq1[0].scalar().data(), 10); std::vector dq2; - TF_EXPECT_OK(session->Run({}, {"dequeue1"}, {}, &dq2)); + TF_EXPECT_OK(session->Run({}, {kDequeueOp1}, {}, &dq2)); EXPECT_EQ(*dq2[0].scalar().data(), 10); - EXPECT_EQ(session->Run({}, {"dequeue1"}, {}, &dq1).code(), + EXPECT_EQ(session->Run({}, {kDequeueOp1}, {}, nullptr).code(), Code::OUT_OF_RANGE); } +void JoinThread(QueueRunner* queue_runner, bool* join_succeeded, + Notification* join_done) { + EXPECT_EQ(queue_runner->Join().code(), Code::CANCELLED); + *join_succeeded = true; + join_done->Notify(); +} + +TEST(QueueRunnerTest, SessionCloseCancelPendingEnqueue) { + auto graph_def = BuildDoubleQueueGraph(); + + SessionOptions options; + std::unique_ptr session(NewSession(options)); + TF_CHECK_OK(session->Create(graph_def)); + + QueueRunnerDef queue_runner_def = BuildQueueRunnerDef( + kQueueName1, {kEnqueueOp1}, kCloseOp1, kCancelOp1, {}); + QueueRunner qr; + qr.Init(queue_runner_def); + TF_CHECK_OK(qr.Start(session.get())); + 
+ TF_EXPECT_OK(session->Run({}, {}, {kEnqueueOp0}, nullptr)); + + std::vector dq1; + TF_EXPECT_OK(session->Run({}, {kDequeueOp1}, {}, &dq1)); + EXPECT_EQ(*dq1[0].scalar().data(), 10); + + // The expected behavior is the QueueRunner::Join() call is blocked until + // Session::Close() is called. + bool join_succeeded = false; + Notification join_done; + Env::Default()->SchedClosure( + std::bind(&JoinThread, &qr, &join_succeeded, &join_done)); + + Env::Default()->SleepForMicroseconds(10000000); + EXPECT_EQ(join_succeeded, false); + + // Closing the session is required to cancel pending enqueue nodes. + TF_EXPECT_OK(session->Close()); + + join_done.WaitForNotification(); + EXPECT_EQ(join_succeeded, true); +} + +TEST(QueueRunnerTest, Stop) { + auto graph_def = BuildDoubleQueueGraph(); + + SessionOptions options; + std::unique_ptr session(NewSession(options)); + TF_CHECK_OK(session->Create(graph_def)); + + QueueRunnerDef queue_runner_def = BuildQueueRunnerDef( + kQueueName1, {kEnqueueOp1}, kCloseOp1, kCancelOp1, {}); + QueueRunner qr; + qr.Init(queue_runner_def); + TF_CHECK_OK(qr.Start(session.get())); + + TF_EXPECT_OK(qr.Stop(session.get())); + + TF_EXPECT_OK(session->Run({}, {}, {kEnqueueOp0}, nullptr)); + + EXPECT_EQ(session->Run({}, {kDequeueOp1}, {}, nullptr).code(), + Code::OUT_OF_RANGE); + + // qr is already stopped + TF_EXPECT_OK(qr.Join()); +} + +TEST(QueueRunnerTest, StopTwoQueues) { + auto graph_def = BuildDoubleQueueGraph(); + + SessionOptions options; + std::unique_ptr session(NewSession(options)); + TF_CHECK_OK(session->Create(graph_def)); + + QueueRunnerDef queue_runner0 = + BuildQueueRunnerDef(kQueueName0, {kEnqueueOp0}, kCloseOp0, kCancelOp0, + {Code::OUT_OF_RANGE, Code::CANCELLED}); + QueueRunnerDef queue_runner1 = + BuildQueueRunnerDef(kQueueName1, {kEnqueueOp1}, kCloseOp1, kCancelOp1, + {Code::OUT_OF_RANGE, Code::CANCELLED}); + QueueRunner qr0; + qr0.Init(queue_runner0); + TF_CHECK_OK(qr0.Start(session.get())); + QueueRunner qr1; + 
qr1.Init(queue_runner1); + TF_CHECK_OK(qr1.Start(session.get())); + + std::vector dq; + TF_EXPECT_OK(session->Run({}, {kDequeueOp1}, {}, &dq)); + EXPECT_EQ(*dq[0].scalar().data(), 10); + + TF_EXPECT_OK(qr0.Stop(session.get())); + TF_EXPECT_OK(qr1.Stop(session.get())); + + TF_EXPECT_OK(qr0.Join()); + TF_EXPECT_OK(qr1.Join()); +} + TEST(QueueRunnerTest, EmptyEnqueueOps) { QueueRunnerDef queue_runner_def = - BuildQueueRunnerDef(kQueueName, {}, kCountUpToOpName, {}); + BuildQueueRunnerDef(kQueueName, {}, kCountUpToOpName, "", {}); QueueRunner qr; EXPECT_EQ(qr.Init(queue_runner_def).code(), Code::INVALID_ARGUMENT); @@ -203,8 +315,8 @@ TEST(QueueRunnerTest, EmptyEnqueueOps) { TEST(QueueRunnerTest, InitAfterStart) { GraphDef graph_def = BuildSimpleGraph(); auto session = BuildSessionAndInitVariable(graph_def); - QueueRunnerDef queue_runner_def = - BuildQueueRunnerDef(kQueueName, {kCountUpToOpName}, kCountUpToOpName, {}); + QueueRunnerDef queue_runner_def = BuildQueueRunnerDef( + kQueueName, {kCountUpToOpName}, kCountUpToOpName, "", {}); QueueRunner qr; TF_EXPECT_OK(qr.Init(queue_runner_def)); @@ -213,3 +325,4 @@ TEST(QueueRunnerTest, InitAfterStart) { } } // namespace +} // namespace tensorflow diff --git a/tensorflow/contrib/cmake/external/gemmlowp.cmake b/tensorflow/contrib/cmake/external/gemmlowp.cmake index 11868d44dd6..024c064cf43 100644 --- a/tensorflow/contrib/cmake/external/gemmlowp.cmake +++ b/tensorflow/contrib/cmake/external/gemmlowp.cmake @@ -1,7 +1,7 @@ include (ExternalProject) -set(gemmlowp_URL http://github.com/google/gemmlowp/archive/c0bacf11fb509a2cbe15a97362a2df067ffd57a2.tar.gz) -set(gemmlowp_HASH SHA256=dc64a38f9927db18748d9024987c9b102115e25bc2be4b76aa8e422b8f83d882) +set(gemmlowp_URL http://github.com/google/gemmlowp/archive/a6f29d8ac48d63293f845f2253eccbf86bc28321.tar.gz) +set(gemmlowp_HASH SHA256=75d40ea8e68b0d1644f052fffe8f14a410b2a73d40ccb859a95c0578d194ec26) set(gemmlowp_BUILD ${CMAKE_BINARY_DIR}/gemmlowp/src/gemmlowp) set(gemmlowp_INCLUDE_DIR 
${CMAKE_BINARY_DIR}/gemmlowp/src/gemmlowp) diff --git a/tensorflow/contrib/cmake/patches/gif/unistd.h b/tensorflow/contrib/cmake/patches/gif/unistd.h index e69de29bb2d..cd52ce31d4d 100644 --- a/tensorflow/contrib/cmake/patches/gif/unistd.h +++ b/tensorflow/contrib/cmake/patches/gif/unistd.h @@ -0,0 +1,14 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ diff --git a/tensorflow/contrib/cmake/tf_core_kernels.cmake b/tensorflow/contrib/cmake/tf_core_kernels.cmake index 91ad74d9e76..8b3a2d75f48 100644 --- a/tensorflow/contrib/cmake/tf_core_kernels.cmake +++ b/tensorflow/contrib/cmake/tf_core_kernels.cmake @@ -89,6 +89,7 @@ if(WIN32) "${tensorflow_source_dir}/tensorflow/core/kernels/fact_op.cc" "${tensorflow_source_dir}/tensorflow/core/kernels/immutable_constant_op.cc" "${tensorflow_source_dir}/tensorflow/core/kernels/immutable_constant_op.h" + "${tensorflow_source_dir}/tensorflow/core/kernels/meta_support.*" "${tensorflow_source_dir}/tensorflow/core/kernels/sparse_matmul_op.cc" "${tensorflow_source_dir}/tensorflow/core/kernels/sparse_matmul_op.h" "${tensorflow_source_dir}/tensorflow/core/kernels/*quantiz*.h" diff --git a/tensorflow/contrib/cmake/tf_tools.cmake b/tensorflow/contrib/cmake/tf_tools.cmake index 91776fd5c82..4b3b93f890f 100644 --- a/tensorflow/contrib/cmake/tf_tools.cmake +++ b/tensorflow/contrib/cmake/tf_tools.cmake @@ -13,7 +13,10 @@ 
add_executable(${proto_text} $ ) -target_link_libraries(${proto_text} PUBLIC ${tensorflow_EXTERNAL_LIBRARIES}) +target_link_libraries(${proto_text} PUBLIC + ${tensorflow_EXTERNAL_LIBRARIES} + tf_protos_cc +) add_dependencies(${proto_text} tf_core_lib diff --git a/tensorflow/contrib/distributions/BUILD b/tensorflow/contrib/distributions/BUILD index 6df6dd5c248..850cbf8d26a 100644 --- a/tensorflow/contrib/distributions/BUILD +++ b/tensorflow/contrib/distributions/BUILD @@ -36,7 +36,7 @@ cuda_py_tests( cuda_py_tests( name = "operator_pd_cholesky_test", - size = "small", + size = "medium", srcs = ["python/kernel_tests/operator_pd_cholesky_test.py"], additional_deps = [ ":distributions_py", @@ -60,7 +60,7 @@ cuda_py_tests( cuda_py_tests( name = "operator_pd_full_test", - size = "small", + size = "medium", srcs = ["python/kernel_tests/operator_pd_full_test.py"], additional_deps = [ ":distributions_py", @@ -72,7 +72,7 @@ cuda_py_tests( cuda_py_tests( name = "operator_pd_identity_test", - size = "small", + size = "medium", srcs = ["python/kernel_tests/operator_pd_identity_test.py"], additional_deps = [ ":distributions_py", diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijector_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijector_test.py index f42406e90bc..7356511a127 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/bijector_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/bijector_test.py @@ -614,6 +614,67 @@ class SigmoidCenteredBijectorTest(tf.test.TestCase): atol=0., rtol=1e-7) +class CholeskyOuterProductBijectorTest(tf.test.TestCase): + """Tests the correctness of the Y = X * X^T transformation.""" + + def testBijectorMatrix(self): + with self.test_session(): + bijector = bijectors.CholeskyOuterProduct(event_ndims=2, + validate_args=True) + self.assertEqual("cholesky_outer_product", bijector.name) + x = [[[1., 0], + [2, 1]], + [[math.sqrt(2.), 0], + [math.sqrt(8.), 1]]] + y = np.matmul(x, 
np.transpose(x, axes=(0, 2, 1))) + # Fairly easy to compute differentials since we have 2x2. + dx_dy = [[[2.*1, 0, 0], + [2, 1, 0], + [0, 2*2, 2*1]], + [[2*math.sqrt(2.), 0, 0], + [math.sqrt(8.), math.sqrt(2.), 0], + [0, 2*math.sqrt(8.), 2*1]]] + ildj = -np.sum( + np.log(np.asarray(dx_dy).diagonal(offset=0, axis1=1, axis2=2)), + axis=1) + self.assertAllEqual((2, 2, 2), bijector.forward(x).get_shape()) + self.assertAllEqual((2, 2, 2), bijector.inverse(y).get_shape()) + self.assertAllClose(y, bijector.forward(x).eval()) + self.assertAllClose(x, bijector.inverse(y).eval()) + self.assertAllClose(ildj, + bijector.inverse_log_det_jacobian(y).eval(), + atol=0., rtol=1e-7) + self.assertAllClose(-bijector.inverse_log_det_jacobian(y).eval(), + bijector.forward_log_det_jacobian(x).eval(), + atol=0., rtol=1e-7) + + def testBijectorScalar(self): + with self.test_session(): + bijector = bijectors.CholeskyOuterProduct(event_ndims=0, + validate_args=True) + self.assertEqual("cholesky_outer_product", bijector.name) + x = [[[1., 5], + [2, 1]], + [[math.sqrt(2.), 3], + [math.sqrt(8.), 1]]] + y = np.square(x) + ildj = -math.log(2.) 
- np.log(x) + self.assertAllClose(y, bijector.forward(x).eval()) + self.assertAllClose(x, bijector.inverse(y).eval()) + self.assertAllClose(ildj, + bijector.inverse_log_det_jacobian(y).eval(), + atol=0., rtol=1e-7) + self.assertAllClose(-bijector.inverse_log_det_jacobian(y).eval(), + bijector.forward_log_det_jacobian(x).eval(), + atol=0., rtol=1e-7) + + def testScalarCongruency(self): + with self.test_session(): + bijector = bijectors.CholeskyOuterProduct(event_ndims=0, + validate_args=True) + assert_scalar_congruency(bijector, lower_x=1e-3, upper_x=1.5, rtol=0.05) + + class ChainBijectorTest(tf.test.TestCase): """Tests the correctness of the Y = Chain(bij1, bij2, bij3) transformation.""" diff --git a/tensorflow/contrib/distributions/python/kernel_tests/distribution_test.py b/tensorflow/contrib/distributions/python/kernel_tests/distribution_test.py index e02b6439186..f4da88e5350 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/distribution_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/distribution_test.py @@ -41,11 +41,34 @@ class DistributionTest(tf.test.TestCase): for cls in classes: for sample_shape in sample_shapes: param_shapes = cls.param_shapes(sample_shape) - print(param_shapes) params = dict([(name, tf.random_normal(shape)) for name, shape in param_shapes.items()]) dist = cls(**params) self.assertAllEqual(sample_shape, tf.shape(dist.sample()).eval()) + dist_copy = dist.copy() + self.assertAllEqual(sample_shape, + tf.shape(dist_copy.sample()).eval()) + self.assertEqual(dist.parameters, dist_copy.parameters) + + def testCopyExtraArgs(self): + with self.test_session(): + # Note: we cannot easily test all distributions since each requires + # different initialization arguments. We therefore spot test a few. 
+ normal = dists.Normal(mu=1., sigma=2., validate_args=True) + self.assertEqual(normal.parameters, normal.copy().parameters) + wishart = dists.WishartFull(df=2, scale=[[1., 2], [2, 5]], + validate_args=True) + self.assertEqual(wishart.parameters, wishart.copy().parameters) + + def testCopyOverride(self): + with self.test_session(): + normal = dists.Normal(mu=1., sigma=2., validate_args=True) + normal_copy = normal.copy(validate_args=False) + base_params = normal.parameters.copy() + copy_params = normal.copy(validate_args=False).parameters.copy() + self.assertNotEqual(base_params.pop("validate_args"), + copy_params.pop("validate_args")) + self.assertEqual(base_params, copy_params) if __name__ == '__main__': diff --git a/tensorflow/contrib/distributions/python/ops/bijector.py b/tensorflow/contrib/distributions/python/ops/bijector.py index 054facb9a24..2472c12d3f3 100644 --- a/tensorflow/contrib/distributions/python/ops/bijector.py +++ b/tensorflow/contrib/distributions/python/ops/bijector.py @@ -14,7 +14,7 @@ # ============================================================================== r"""Bijector Ops. -An API for reversible (bijective) transformations of random variables. +An API for invertible, differentiable transformations of random variables. ## Background @@ -31,6 +31,7 @@ To apply a `Bijector`, use `distributions.TransformedDistribution`. 
@@Bijector @@Chain +@@CholeskyOuterProduct @@Exp @@Identity @@Inline @@ -46,7 +47,9 @@ from __future__ import division from __future__ import print_function import abc +import collections import contextlib +import math import re import numpy as np import six @@ -58,18 +61,112 @@ from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops +from tensorflow.python.ops import check_ops +from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import linalg_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn_ops +__all__ = [ + "Bijector", + "Chain", + "CholeskyOuterProduct", + "Exp", + "Identity", + "Inline", + "Invert", + "ScaleAndShift", + "SigmoidCentered", + "SoftmaxCentered", + "Softplus", +] + + +class _Mapping(collections.namedtuple("_Mapping", + ["x", "y", "ildj", "condition_kwargs"])): + """Helper class to make it easier to manage caching in `Bijector`.""" + + def __new__(cls, x=None, y=None, ildj=None, condition_kwargs=None): + """Custom __new__ so namedtuple items have defaults. + + Args: + x: `Tensor`. Forward. + y: `Tensor`. Inverse. + ildj: `Tensor`. Inverse log det Jacobian. + condition_kwargs: Python dictionary. Extra args supplied to + forward/inverse/etc functions. + + Returns: + mapping: New instance of _Mapping. + """ + return super(_Mapping, cls).__new__(cls, x, y, ildj, condition_kwargs) + + @property + def x_key(self): + """Returns key used for caching Y=g(X).""" + return (self.x,) + self._deep_tuple(tuple(sorted( + self.condition_kwargs.items()))) + + @property + def y_key(self): + """Returns key used for caching X=g^{-1}(Y).""" + return (self.y,) + self._deep_tuple(tuple(sorted( + self.condition_kwargs.items()))) + + def merge(self, x=None, y=None, ildj=None, + condition_kwargs=None, mapping=None): + """Returns new _Mapping with args merged with self. 
+ + Args: + x: `Tensor`. Forward. + y: `Tensor`. Inverse. + ildj: `Tensor`. Inverse log det Jacobian. + condition_kwargs: Python dictionary. Extra args supplied to + forward/inverse/etc functions. + mapping: Instance of _Mapping to merge. Can only be specified if no other + arg is specified. + + Returns: + mapping: New instance of `_Mapping` which has inputs merged with self. + + Raises: + ValueError: if mapping and any other arg is not `None`. + """ + if mapping is None: + mapping = _Mapping(x=x, y=y, ildj=ildj, + condition_kwargs=condition_kwargs) + elif not all([arg is None for arg in [x, y, ildj, condition_kwargs]]): + raise ValueError("Cannot specify mapping and individual args.") + return _Mapping( + x=self._merge(self.x, mapping.x), + y=self._merge(self.y, mapping.y), + ildj=self._merge(self.ildj, mapping.ildj), + condition_kwargs=self._merge(self.condition_kwargs, + mapping.condition_kwargs)) + + def _merge(self, old, new): + """Helper to merge which handles merging one value.""" + if old is None: + return new + elif new is not None and old != new: + raise ValueError("Incompatible values: %s != %s" % (old, new)) + return old + + def _deep_tuple(self, x): + """Converts lists of lists to tuples of tuples.""" + return (tuple(map(self._deep_tuple, x)) + if isinstance(x, (list, tuple)) else x) + @six.add_metaclass(abc.ABCMeta) class Bijector(object): - """Interface for transforming a `Distribution` via `TransformedDistribution`. + """Interface for transforming a `Distribution` sample. - A `Bijector` implements a bijective, differentiable function by transforming - an input `Tensor`. The output `Tensor` shape is constrained by the input - `sample`, `batch`, and `event` shape. A `Bijector` is characterized by three + A `Bijector` implements a + [diffeomorphism](https://en.wikipedia.org/wiki/Diffeomorphism), i.e., a + bijective, differentiable function. 
A `Bijector` is used by + `TransformedDistribution` but can be generally used for transforming a + `Distribution` generated `Tensor`. A `Bijector` is characterized by three operations: 1. Forward Evaluation @@ -210,7 +307,8 @@ class Bijector(object): - The inverse `log o det o Jacobian` can be implemented as the negative of the forward `log o det o Jacobian`. This is useful if the `inverse` is implemented as a cache or the inverse Jacobian is computationally more - expensive. The following demonstrates the suggested implementation. + expensive (e.g., `CholeskyOuterProduct` `Bijector`). The following + demonstrates the suggested implementation. ```python def _inverse_and_log_det_jacobian(self, y): @@ -300,6 +398,11 @@ class Bijector(object): self._is_constant_jacobian = is_constant_jacobian self._validate_args = validate_args self._dtype = dtype + self._from_y = {} + self._from_x = {} + # Using abbreviation ildj for "inverse log det Jacobian." + # This variable is not `None` iff is_constant_jacobian is `True`. 
+ self._constant_ildj = None if name: self._name = name else: @@ -368,7 +471,12 @@ class Bijector(object): with self._name_scope(name, [x]): x = ops.convert_to_tensor(x, name="x") self._maybe_assert_dtype(x) - return self._forward(x, **condition_kwargs) + mapping = self._lookup(x=x, condition_kwargs=condition_kwargs) + if mapping.y is not None: + return mapping.y + mapping = mapping.merge(y=self._forward(x, **condition_kwargs)) + self._cache(mapping) + return mapping.y def _inverse(self, y): raise NotImplementedError("inverse is not implemented") @@ -393,16 +501,28 @@ class Bijector(object): with self._name_scope(name, [y]): y = ops.convert_to_tensor(y, name="y") self._maybe_assert_dtype(y) + mapping = self._lookup(y=y, condition_kwargs=condition_kwargs) + if mapping.x is not None: + return mapping.x + ildj = None try: - return self._inverse(y, **condition_kwargs) + x = self._inverse(y, **condition_kwargs) except NotImplementedError as original_error: # Since _inverse was not implemented, try to see if it's implemented # by the _inverse_and_inverse_log_det_jacobian member. try: - return self._inverse_and_inverse_log_det_jacobian( - y, **condition_kwargs)[0] + x, ildj = self._inverse_and_inverse_log_det_jacobian( + y, **condition_kwargs) + if self._constant_ildj is not None: + ildj = self._constant_ildj # Use the "global" result. + elif self.is_constant_jacobian: + self._constant_ildj = ildj except NotImplementedError: raise original_error + x = x if mapping.x is None else mapping.x + mapping = mapping.merge(x=x, ildj=ildj) + self._cache(mapping) + return mapping.x def _inverse_log_det_jacobian(self, y): raise NotImplementedError("inverse_log_det_jacobian is not implemented.") @@ -430,18 +550,32 @@ class Bijector(object): `_inverse_and_inverse_log_det_jacobian` are implemented. 
""" with self._name_scope(name, [y]): + if self._constant_ildj is not None: + return self._constant_ildj y = ops.convert_to_tensor(y, name="y") self._maybe_assert_dtype(y) + mapping = self._lookup(y=y, condition_kwargs=condition_kwargs) + if mapping.ildj is not None: + return mapping.ildj try: - return self._inverse_log_det_jacobian(y, **condition_kwargs) + x = mapping.x + ildj = self._inverse_log_det_jacobian(y, **condition_kwargs) except NotImplementedError as original_error: # Since _inverse_log_det_jacobian was not implemented, try to see if # it's implemented by the _inverse_and_inverse_log_det_jacobian member. try: - return self._inverse_and_inverse_log_det_jacobian( - y, **condition_kwargs)[1] + x, ildj = self._inverse_and_inverse_log_det_jacobian( + y, **condition_kwargs) + if mapping.x is not None: + x = mapping.x except NotImplementedError: raise original_error + if self.is_constant_jacobian: + self._constant_ildj = ildj + x = x if mapping.x is None else mapping.x + mapping = mapping.merge(x=x, ildj=ildj) + self._cache(mapping) + return mapping.ildj def _inverse_and_inverse_log_det_jacobian(self, y): raise NotImplementedError( @@ -473,18 +607,30 @@ class Bijector(object): with self._name_scope(name, [y]): y = ops.convert_to_tensor(y, name="y") self._maybe_assert_dtype(y) + mapping = self._lookup(y=y, condition_kwargs=condition_kwargs) + if mapping.x is not None and mapping.ildj is not None: + return mapping.x, mapping.ildj try: - return self._inverse_and_inverse_log_det_jacobian( + x, ildj = self._inverse_and_inverse_log_det_jacobian( y, **condition_kwargs) except NotImplementedError as original_error: # Since _inverse_and_inverse_log_det_jacobian was not implemented, try # to see if we can separately use _inverse and # _inverse_log_det_jacobian members. 
try: - return (self._inverse(y, **condition_kwargs), - self._inverse_log_det_jacobian(y, **condition_kwargs)) + x = self._inverse(y, **condition_kwargs) + if self._constant_ildj is None: + ildj = self._inverse_log_det_jacobian(y, **condition_kwargs) except NotImplementedError: raise original_error + if self._constant_ildj is not None: + ildj = self._constant_ildj # Ignore any ildj we may/not have. + elif self.is_constant_jacobian: + self._constant_ildj = ildj + x = x if mapping.x is None else mapping.x + mapping = mapping.merge(x=x, ildj=ildj) + self._cache(mapping) + return mapping.x, mapping.ildj def _forward_log_det_jacobian(self, x): raise NotImplementedError( @@ -509,16 +655,29 @@ class Bijector(object): nor {`_inverse`, `_inverse_log_det_jacobian`} are implemented. """ with self._name_scope(name, [x]): + if self._constant_ildj is not None: + # Need "-1. *" to avoid invalid-unary-operand-type linter warning. + return -1. * self._constant_ildj x = ops.convert_to_tensor(x, name="x") self._maybe_assert_dtype(x) + mapping = self._lookup(x=x, condition_kwargs=condition_kwargs) + if mapping.ildj is not None: + return -mapping.ildj + y = None try: - return self._forward_log_det_jacobian(x, **condition_kwargs) + ildj = -self._forward_log_det_jacobian(x, **condition_kwargs) except NotImplementedError as original_error: try: - y = self.inverse(x, **condition_kwargs) - return -self.inverse_log_det_jacobian(y, **condition_kwargs) + y = self.inverse(x, **condition_kwargs) if y is None else y + ildj = self.inverse_log_det_jacobian(y, **condition_kwargs) except NotImplementedError: raise original_error + if self.is_constant_jacobian: + self._constant_ildj = ildj + y = y if mapping.y is None else mapping.y + mapping = mapping.merge(y=y, ildj=ildj) + self._cache(mapping) + return -mapping.ildj @contextlib.contextmanager def _name_scope(self, name=None, values=None): @@ -534,6 +693,31 @@ class Bijector(object): raise TypeError("Input had dtype %s but expected %s." 
% (self.dtype, x.dtype)) + def _cache(self, mapping): + """Helper which stores mapping info in forward/inverse dicts.""" + if self._constant_ildj is not None: + # Fold in ildj if known constant Jacobian. + mapping = mapping.merge(ildj=self._constant_ildj) + # Merging from lookup is an added check that we're not overwriting anything + # which is not None. + mapping = mapping.merge(mapping=self._lookup( + mapping.x, mapping.y, mapping.condition_kwargs)) + if mapping.x is None or mapping.y is None: + ValueError("Caching expects both (x,y) to be known, i.e., not None.") + self._from_x[mapping.x_key] = mapping + self._from_y[mapping.y_key] = mapping + + def _lookup(self, x=None, y=None, condition_kwargs=None): + """Helper which retrieves mapping info from forward/inverse dicts.""" + mapping = _Mapping(x=x, y=y, condition_kwargs=condition_kwargs) + # Since _cache requires both x,y to be set, we only need to do one cache + # lookup since the mapping is always in both or neither. + if mapping.x is not None: + return self._from_x.get(mapping.x_key, mapping) + if mapping.y is not None: + return self._from_y.get(mapping.y_key, mapping) + return mapping + class Inline(Bijector): # pylint: disable=line-too-long @@ -547,7 +731,7 @@ class Inline(Bijector): inverse_fn=tf.log, inverse_log_det_jacobian_fn=( lambda y: -tf.reduce_sum(tf.log(y), reduction_indices=-1)), - name="Exp") + name="exp") ``` The above example is equivalent to the `Bijector` `Exp(event_ndims=1)`. @@ -573,8 +757,8 @@ class Inline(Bijector): log o det o jacobian of the forward transformation. is_constant_jacobian: `Boolean` indicating that the Jacobian is constant for all input arguments. - validate_args: `Boolean` indicated whether arguments should be checked for - correctness. + validate_args: `Boolean` indicating whether arguments should be checked + for correctness. name: `String`, name given to ops managed by this object. 
""" super(Inline, self).__init__( @@ -643,8 +827,8 @@ class Invert(Bijector): Args: bijector: Bijector instance. - validate_args: `Boolean` indicated whether arguments should be checked for - correctness. + validate_args: `Boolean` indicating whether arguments should be checked + for correctness. name: `String`, name given to ops managed by this object. """ @@ -713,8 +897,8 @@ class Chain(Bijector): Args: bijectors: Python list of bijector instances. An empty list makes this bijector equivalent to the `Identity` bijector. - validate_args: `Boolean` indicated whether arguments should be checked for - correctness. + validate_args: `Boolean` indicating whether arguments should be checked + for correctness. name: `String`, name given to ops managed by this object. Default: E.g., `Chain([Exp(), Softplus()]).name == "chain_of_exp_of_softplus"`. @@ -794,12 +978,9 @@ class Identity(Bijector): def __init__(self, validate_args=False, name="identity"): super(Identity, self).__init__( - batch_ndims=0, - event_ndims=0, is_constant_jacobian=True, validate_args=validate_args, name=name) - self._is_constant_jacobian = True def _forward(self, x): return x @@ -841,8 +1022,8 @@ class Exp(Bijector): Args: event_ndims: Scalar `int32` `Tensor` indicating the number of dimensions associated with a particular draw from the distribution. - validate_args: `Boolean` indicated whether arguments should be checked for - correctness. + validate_args: `Boolean` indicating whether arguments should be checked + for correctness. name: `String` name given to ops managed by this object. """ @@ -923,8 +1104,8 @@ class ScaleAndShift(Bijector): scale: `Tensor` used to scale input, i.e., `Y = g(X) = scale * X + shift`. event_ndims: Scalar `int32` `Tensor` indicating the number of dimensions associated with a particular draw from the distribution. - validate_args: `Boolean` indicated whether arguments should be checked for - correctness. 
+ validate_args: `Boolean` indicating whether arguments should be checked + for correctness. name: `String` name given to ops managed by this object. """ @@ -1271,3 +1452,150 @@ class SigmoidCentered(SoftmaxCentered): def __init__(self, validate_args=False, name="sigmoid_centered"): super(SigmoidCentered, self).__init__( validate_args=validate_args, name=name) + + +class CholeskyOuterProduct(Bijector): + # pylint: disable=line-too-long + """Bijector which computes Y = g(X) = X X^T where X is a lower-triangular, positive-diagonal matrix. + + `event_ndims` must be 0 or 2, i.e., scalar or matrix. + + Note: the upper-triangular part of X is ignored (whether or not its zero). + + Examples: + + ```python + bijector.CholeskyOuterProduct(event_ndims=2).forward(x=[[1., 0], [2, 1]]) + # Result: [[1, 1], [1, 5]], i.e., x x^T + + bijector.SoftmaxCentered(event_ndims=2).inverse(y=[[1., 1], [1, 5]]) + # Result: [[1, 0], [2, 1]], i.e., chol(y). + ``` + + """ + # pylint: enable=line-too-long + + def __init__(self, event_ndims=2, validate_args=False, + name="cholesky_outer_product"): + """Instantiates the `CholeskyOuterProduct` bijector. + + Args: + event_ndims: `constant` `int32` scalar `Tensor` indicating the number of + dimensions associated with a particular draw from the distribution. Must + be 0 or 2. + validate_args: `Boolean` indicating whether arguments should be checked + for correctness. + name: `String` name given to ops managed by this object. + + Raises: + ValueError: if event_ndims is neither 0 or 2. 
+ """ + self._parameters = {} + self._name = name + with self._name_scope("init", values=[event_ndims]): + event_ndims = ops.convert_to_tensor(event_ndims, name="event_ndims") + event_ndims = tensor_util.constant_value(event_ndims) + if event_ndims is None or event_ndims not in [0, 2]: + raise ValueError("`event_ndims` must be a TF constant which is 0 or 2") + self._static_event_ndims = event_ndims + super(CholeskyOuterProduct, self).__init__( + validate_args=validate_args, + name=name) + + def _forward(self, x): + if self._static_event_ndims == 0: + return math_ops.square(x) + if self.validate_args: + is_matrix = check_ops.assert_rank_at_least(x, 2) + shape = array_ops.shape(x) + is_square = check_ops.assert_equal(shape[-2], shape[-1]) + x = control_flow_ops.with_dependencies([is_matrix, is_square], x) + # For safety, explicitly zero-out the upper triangular part. + x = array_ops.matrix_band_part(x, -1, 0) + return math_ops.batch_matmul(x, x, adj_y=True) + + def _inverse_and_inverse_log_det_jacobian(self, y): + x = (math_ops.sqrt(y) if self._static_event_ndims == 0 + else linalg_ops.cholesky(y)) + return x, -self._forward_log_det_jacobian(x) + + def _forward_log_det_jacobian(self, x): + # Let Y be a symmetric, positive definite matrix and write: + # Y = X X^T + # where X is lower-triangular. + # + # Observe that, + # dY[i,j]/dX[a,b] + # = d/dX[a,b] { X[i,:] X[j,:] } + # = sum_{d=1}^p { I[i=a] I[d=b] X[j,d] + I[j=a] I[d=b] X[i,d] } + # + # To compute the Jacobian dX/dY we must represent X,Y as vectors. Since Y is + # symmetric and X is lower-triangular, we need vectors of dimension: + # d = p (p + 1) / 2 + # where X, Y are p x p matrices, p > 0. We use a row-major mapping, i.e., + # k = { i (i + 1) / 2 + j i>=j + # { undef ij thus i,j!=a. + # + # Since the Jacobian is lower-triangular, we need only compute the product + # of diagonal elements: + # d vec[Y] / d vec[X] @[k(i,j), k(i,j)] + # = X[j,j] + I[i=j] X[i,j] + # = 2 X[j,j]. 
+ # Since there is a 2 X[j,j] term for every lower-triangular element of X we + # conclude: + # |Jac(d vec[Y]/d vec[X])| = 2^p prod_{j=0}^{p-1} X[j,j]^{p-j}. + if self._static_event_ndims == 0: + if self.validate_args: + is_positive = check_ops.assert_positive( + x, message="All elements must be positive.") + x = control_flow_ops.with_dependencies([is_positive], x) + return math.log(2.) + math_ops.log(x) + + diag = array_ops.matrix_diag_part(x) + if self.validate_args: + is_matrix = check_ops.assert_rank_at_least( + x, 2, message="Input must be a (batch of) matrix.") + shape = array_ops.shape(x) + is_square = check_ops.assert_equal( + shape[-2], shape[-1], + message="Input must be a (batch of) square matrix.") + # Assuming lower-triangular means we only need check diag>0. + is_positive_definite = check_ops.assert_positive( + diag, message="Input must be positive definite.") + x = control_flow_ops.with_dependencies( + [is_matrix, is_square, is_positive_definite], x) + + # Create a column vector equal to: [p, p-1, ..., 2, 1]^T. + if x.get_shape().ndims is None or x.get_shape()[-1].value is None: + p = array_ops.shape(x)[-1] + else: + p = x.get_shape()[-1].value + exponents = array_ops.expand_dims( + math_ops.linspace(math_ops.cast(p, dtype=x.dtype), 1., p), + dim=1) + + sum_weighted_log_diag = array_ops.squeeze( + math_ops.batch_matmul(math_ops.log(diag), exponents), + squeeze_dims=-1) + fldj = p * math.log(2.) 
+ sum_weighted_log_diag + + if x.get_shape().ndims is not None: + fldj.set_shape(x.get_shape()[:-2]) + + return fldj diff --git a/tensorflow/contrib/distributions/python/ops/distribution.py b/tensorflow/contrib/distributions/python/ops/distribution.py index 2bfd272e71d..5a3583c22a3 100644 --- a/tensorflow/contrib/distributions/python/ops/distribution.py +++ b/tensorflow/contrib/distributions/python/ops/distribution.py @@ -327,12 +327,13 @@ class Distribution(_BaseDistribution): for i, t in enumerate(graph_parents): if t is None or not contrib_framework.is_tensor(t): raise ValueError("Graph parent item %d is not a Tensor; %s." % (i, t)) + parameters = parameters or {} self._dtype = dtype self._is_continuous = is_continuous self._is_reparameterized = is_reparameterized self._allow_nan_stats = allow_nan_stats self._validate_args = validate_args - self._parameters = parameters or {} + self._parameters = parameters self._graph_parents = graph_parents self._name = name or type(self).__name__ @@ -434,6 +435,27 @@ class Distribution(_BaseDistribution): """Python boolean indicated possibly expensive checks are enabled.""" return self._validate_args + def copy(self, **override_parameters_kwargs): + """Creates a deep copy of the distribution. + + Note: the copy distribution may continue to depend on the original + intialization arguments. + + Args: + **override_parameters_kwargs: String/value dictionary of initialization + arguments to override with new values. + + Returns: + distribution: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + """ + parameters = dict(self.parameters, **override_parameters_kwargs) + # Python3 leaks "__class__" into `locals()` so we remove if present. + # TODO(b/32376812): Remove this pop. 
+ parameters.pop("__class__", None) + return type(self)(**parameters) + def _batch_shape(self): raise NotImplementedError("batch_shape is not implemented") diff --git a/tensorflow/contrib/distributions/python/ops/transformed_distribution.py b/tensorflow/contrib/distributions/python/ops/transformed_distribution.py index 9a4af741a4d..47f9f36aec5 100644 --- a/tensorflow/contrib/distributions/python/ops/transformed_distribution.py +++ b/tensorflow/contrib/distributions/python/ops/transformed_distribution.py @@ -19,7 +19,6 @@ from __future__ import print_function from tensorflow.contrib.distributions.python.ops import distribution as distributions from tensorflow.contrib.distributions.python.ops import distribution_util -from tensorflow.python.framework import ops from tensorflow.python.ops import math_ops @@ -160,7 +159,6 @@ class TransformedDistribution(distributions.Distribution): name = name or bijector.name + distribution.name self._distribution = distribution self._bijector = bijector - self._inverse_cache = {} super(TransformedDistribution, self).__init__( dtype=self._distribution.dtype, is_continuous=self._distribution.is_continuous, @@ -202,9 +200,7 @@ class TransformedDistribution(distributions.Distribution): **distribution_kwargs) # Recall that a bijector is named for its forward transform, i.e., # `Y = g(X)`, - y = self.bijector.forward(x, **bijector_kwargs) - self._inverse_cache[y] = x - return y + return self.bijector.forward(x, **bijector_kwargs) @distribution_util.AppendDocstring( """Implements `(log o p o g^{-1})(y) + (log o det o J o g^{-1})(y)`, @@ -216,11 +212,9 @@ class TransformedDistribution(distributions.Distribution): def _log_prob(self, y, bijector_kwargs=None, distribution_kwargs=None): bijector_kwargs = bijector_kwargs or {} distribution_kwargs = distribution_kwargs or {} - x = self._inverse_possibly_from_cache(y, bijector_kwargs) - inverse_log_det_jacobian = self.bijector.inverse_log_det_jacobian( + x, ildj = 
self.bijector.inverse_and_inverse_log_det_jacobian( y, **bijector_kwargs) - return (self.distribution.log_prob(x, **distribution_kwargs) + - inverse_log_det_jacobian) + return ildj + self.distribution.log_prob(x, **distribution_kwargs) @distribution_util.AppendDocstring( """Implements `p(g^{-1}(y)) det|J(g^{-1}(y))|`, where `g^{-1}` is the @@ -232,18 +226,16 @@ class TransformedDistribution(distributions.Distribution): def _prob(self, y, bijector_kwargs=None, distribution_kwargs=None): bijector_kwargs = bijector_kwargs or {} distribution_kwargs = distribution_kwargs or {} - x = self._inverse_possibly_from_cache(y, bijector_kwargs) - inverse_det_jacobian = math_ops.exp(self.bijector.inverse_log_det_jacobian( - y, **bijector_kwargs)) - return (self.distribution.prob(x, **distribution_kwargs) * - inverse_det_jacobian) + x, ildj = self.bijector.inverse_and_inverse_log_det_jacobian( + y, **bijector_kwargs) + return math_ops.exp(ildj) * self.distribution.prob(x, **distribution_kwargs) @distribution_util.AppendDocstring( condition_kwargs_dict=_condition_kwargs_dict) def _log_cdf(self, y, bijector_kwargs=None, distribution_kwargs=None): bijector_kwargs = bijector_kwargs or {} distribution_kwargs = distribution_kwargs or {} - x = self._inverse_possibly_from_cache(y, bijector_kwargs) + x = self.bijector.inverse(y, **bijector_kwargs) return self.distribution.log_cdf(x, distribution_kwargs) @distribution_util.AppendDocstring( @@ -251,7 +243,7 @@ class TransformedDistribution(distributions.Distribution): def _cdf(self, y, bijector_kwargs=None, distribution_kwargs=None): bijector_kwargs = bijector_kwargs or {} distribution_kwargs = distribution_kwargs or {} - x = self._inverse_possibly_from_cache(y, bijector_kwargs) + x = self.bijector.inverse(y, **bijector_kwargs) return self.distribution.cdf(x, **distribution_kwargs) @distribution_util.AppendDocstring( @@ -260,7 +252,7 @@ class TransformedDistribution(distributions.Distribution): bijector_kwargs=None, 
distribution_kwargs=None): bijector_kwargs = bijector_kwargs or {} distribution_kwargs = distribution_kwargs or {} - x = self._inverse_possibly_from_cache(y, bijector_kwargs) + x = self.bijector.inverse(y, **bijector_kwargs) return self.distribution.log_survival_function(x, **distribution_kwargs) @distribution_util.AppendDocstring( @@ -269,13 +261,5 @@ class TransformedDistribution(distributions.Distribution): bijector_kwargs=None, distribution_kwargs=None): bijector_kwargs = bijector_kwargs or {} distribution_kwargs = distribution_kwargs or {} - x = self._inverse_possibly_from_cache(y, bijector_kwargs) + x = self.bijector.inverse(y, **bijector_kwargs) return self.distribution.survival_function(x, **distribution_kwargs) - - def _inverse_possibly_from_cache(self, y, bijector_kwargs): - """Return `self._inverse(y)`, possibly using cached value.""" - y = ops.convert_to_tensor(y, name="y") - if y in self._inverse_cache: - return self._inverse_cache[y] - else: - return self.bijector.inverse(y, **bijector_kwargs) diff --git a/tensorflow/contrib/factorization/examples/mnist.py b/tensorflow/contrib/factorization/examples/mnist.py index b238e2e174d..b0451f8fbca 100644 --- a/tensorflow/contrib/factorization/examples/mnist.py +++ b/tensorflow/contrib/factorization/examples/mnist.py @@ -327,6 +327,6 @@ if __name__ == '__main__': default=True, help='Use fake input data.' 
) - FLAGS = parser.parse_args() + FLAGS, unparsed = parser.parse_known_args() tf.test.main() diff --git a/tensorflow/contrib/factorization/python/ops/kmeans.py b/tensorflow/contrib/factorization/python/ops/kmeans.py index 88cf5f084d8..3228c1f3dfe 100644 --- a/tensorflow/contrib/factorization/python/ops/kmeans.py +++ b/tensorflow/contrib/factorization/python/ops/kmeans.py @@ -243,6 +243,7 @@ class KMeansClustering(estimator.Estimator, ).training_graph() incr_step = tf.assign_add(tf.contrib.framework.get_global_step(), 1) self._loss = tf.reduce_sum(losses) + tf.scalar_summary('loss/raw', self._loss) training_op = with_dependencies([training_op, incr_step], self._loss) return training_op, self._loss diff --git a/tensorflow/contrib/layers/python/layers/optimizers.py b/tensorflow/contrib/layers/python/layers/optimizers.py index ca914c79265..a31882fecb4 100644 --- a/tensorflow/contrib/layers/python/layers/optimizers.py +++ b/tensorflow/contrib/layers/python/layers/optimizers.py @@ -24,16 +24,20 @@ from tensorflow.contrib import framework as contrib_framework from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops from tensorflow.python.ops import clip_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import init_ops from tensorflow.python.ops import logging_ops +from tensorflow.python.ops import math_ops from tensorflow.python.ops import random_ops from tensorflow.python.ops import variable_scope as vs from tensorflow.python.ops import variables as vars_ +from tensorflow.python.training import moving_averages from tensorflow.python.training import optimizer as optimizer_ from tensorflow.python.training import training as train + OPTIMIZER_CLS_NAMES = { "Adagrad": train.AdagradOptimizer, "Adam": train.AdamOptimizer, @@ -104,7 +108,11 @@ def optimize_loss(loss, gradient_multipliers: dict of variables or variable 
names to floats. If present, gradients for specified variables will be multiplied by given constant. - clip_gradients: float or `None`, clips gradients by this value. + clip_gradients: float, callable or `None`. If float, is provided, a global + clipping is applied to prevent the norm of the gradient to exceed this + value. Alternatively, a callable can be provided e.g.: adaptive_clipping. + This callable takes a `list` of `(gradients, variables)` `tuple`s and + returns the same thing with the gradients modified. learning_rate_decay_fn: function, takes `learning_rate` and `global_step` `Tensor`s, returns `Tensor`. Can be used to implement any learning rate decay @@ -132,6 +140,7 @@ def optimize_loss(loss, * `global_step` is an invalid type or shape. * `learning_rate` is an invalid type or value. * `optimizer` is wrong type. + * `clip_gradients' is not float or callable. * `learning_rate` and `learning_rate_decay_fn` are supplied, but no `global_step` is available. """ @@ -224,9 +233,18 @@ def optimize_loss(loss, if gradient_multipliers is not None: gradients = _multiply_gradients(gradients, gradient_multipliers) + if "gradient_norm" in summaries: + logging_ops.scalar_summary("global_norm/gradient_norm", + clip_ops.global_norm(zip(*gradients)[0])) + # Optionally clip gradients by global norm. - if clip_gradients is not None: + if isinstance(clip_gradients, float): gradients = _clip_gradients_by_norm(gradients, clip_gradients) + elif callable(clip_gradients): + gradients = clip_gradients(gradients) + elif clip_gradients is not None: + raise ValueError( + "Unknown type %s for clip_gradients" % type(clip_gradients)) # Add scalar summary for loss. 
if "loss" in summaries: @@ -241,11 +259,15 @@ def optimize_loss(loss, if grad_values is not None: if "gradients" in summaries: - logging_ops.histogram_summary(variable.name + "/gradients", + logging_ops.histogram_summary("gradients/" + variable.name, grad_values) if "gradient_norm" in summaries: - logging_ops.histogram_summary(variable.name + "/gradient_norm", - clip_ops.global_norm([grad_values])) + logging_ops.scalar_summary("gradient_norm/" + variable.name, + clip_ops.global_norm([grad_values])) + + if clip_gradients is not None and "gradient_norm" in summaries: + logging_ops.scalar_summary("global_norm/clipped_gradient_norm", + clip_ops.global_norm(zip(*gradients)[0])) # Create gradient updates. grad_updates = opt.apply_gradients(gradients, @@ -266,6 +288,101 @@ def _clip_gradients_by_norm(grads_and_vars, clip_gradients): return list(zip(clipped_gradients, variables)) +def _adaptive_max_norm(norm, std_factor, decay, global_step, epsilon, name): + """Find max_norm given norm and previous average.""" + with vs.variable_scope(name, "AdaptiveMaxNorm", [norm]): + log_norm = math_ops.log(norm + epsilon) + + def moving_average(name, value, decay): + moving_average_variable = vs.get_variable( + name, shape=value.get_shape(), dtype=value.dtype, + initializer=init_ops.zeros_initializer, trainable=False) + return moving_averages.assign_moving_average( + moving_average_variable, value, decay) + + # quicker adaptation at the beginning + if global_step is not None: + n = math_ops.to_float(global_step) + decay = math_ops.minimum(decay, n / (n + 1.)) + + # update averages + mean = moving_average("mean", log_norm, decay) + sq_mean = moving_average("sq_mean", math_ops.square(log_norm), decay) + + variance = sq_mean - math_ops.square(mean) + std = math_ops.sqrt(math_ops.maximum(epsilon, variance)) + max_norms = math_ops.exp(mean + std_factor*std) + return max_norms, mean + + +def adaptive_clipping_fn(std_factor=2., + decay=0.95, + static_max_norm=None, + global_step=None, + 
report_summary=False, + epsilon=1e-8, + name=None): + """Adapt the clipping value using statistics on the norms. + + Implement adaptive gradient as presented in section 3.2.1 of + https://arxiv.org/abs/1412.1602. + + Keeps a moving average of the mean and std of the log(norm) of the gradient. + if the norm exceeds `exp(mean + std_factor*std)`, all gradients are rescaled + such that the global norm becomes `exp(mean)`. + + Args: + std_factor: Python scaler (or tensor). + `max_norm = exp(mean + std_factor*std)` + decay: The smoothing factor of the moving averages. + static_max_norm: If provided, will threshold the norm to this value as an + extra safety. + global_step: Optional global_step. If provided, `decay = decay*n/(n+1)`. + This provides a quicker adaptation of the mean for the first steps. + report_summary: If `True`, will add histogram summaries of the `max_norm`. + epsilon: Small value chosen to avoid zero variance. + name: The name for this operation is used to scope operations and summaries. + + Returns: + A function for applying gradient clipping. + """ + def gradient_clipping(grads_and_vars): + """Internal function for adaptive clipping.""" + grads, variables = zip(*grads_and_vars) + + norm = clip_ops.global_norm(grads) + + max_norm, log_mean = _adaptive_max_norm( + norm, std_factor, decay, global_step, epsilon, name) + + # reports the max gradient norm for debugging + if report_summary: + logging_ops.scalar_summary( + "global_norm/adaptive_max_gradient_norm", max_norm) + + # factor will be 1. 
if norm is smaller than max_norm + factor = math_ops.select(norm < max_norm, + array_ops.ones_like(norm), + math_ops.exp(log_mean) / norm) + + if static_max_norm is not None: + factor = math_ops.minimum(static_max_norm / norm, factor) + + # apply factor + clipped_grads = [] + for grad in grads: + if grad is None: + clipped_grads.append(None) + elif isinstance(grad, ops.IndexedSlices): + clipped_grads.append(ops.IndexedSlices( + grad.values * factor, grad.indices, grad.dense_shape)) + else: + clipped_grads.append(grad * factor) + + return list(zip(clipped_grads, variables)) + return gradient_clipping + + def _add_scaled_noise_to_gradients(grads_and_vars, gradient_noise_scale): """Adds scaled noise from a 0-mean normal distribution to gradients.""" gradients, variables = zip(*grads_and_vars) diff --git a/tensorflow/contrib/layers/python/layers/optimizers_test.py b/tensorflow/contrib/layers/python/layers/optimizers_test.py index fb76fd20b4a..a7de611a664 100644 --- a/tensorflow/contrib/layers/python/layers/optimizers_test.py +++ b/tensorflow/contrib/layers/python/layers/optimizers_test.py @@ -18,6 +18,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import numpy as np import tensorflow as tf @@ -179,6 +180,26 @@ class OptimizersTest(tf.test.TestCase): self.assertAlmostEqual(var_value, 9.98999, 4) self.assertEqual(global_step_value, 1) + def testAdaptiveGradientClip(self): + with self.test_session() as session: + x, var, loss, global_step = _setup_model() + clip_gradients = tf.contrib.layers.adaptive_clipping_fn() + train = tf.contrib.layers.optimize_loss(loss, + global_step, + learning_rate=0.1, + optimizer="SGD", + clip_gradients=clip_gradients) + tf.initialize_all_variables().run() + session.run(train, feed_dict={x: 5}) + var_value, global_step_value = session.run([var, global_step]) + self.assertAlmostEqual(var_value, 9.8916, 4) + self.assertEqual(global_step_value, 1) + var_count = 0 + for var in 
tf.all_variables(): + if var.name.startswith("OptimizeLoss/AdaptiveMaxNorm"): + var_count += 1 + self.assertEqual(2, var_count) + def testGradientMultiply(self): with self.test_session() as session: x, var, loss, global_step = _setup_model() @@ -332,5 +353,70 @@ class OptimizersTest(tf.test.TestCase): self.assertEqual(update_var_value, 20) self.assertEqual(global_step_value, 1) + +class AdaptiveClipping(tf.test.TestCase): + + def testAverages(self): + with self.test_session() as session: + scale = 2. + grad = tf.ones([3, 4]) * scale + log_norm = np.log(np.sqrt(scale**2 * grad.get_shape().num_elements())) + grads_and_vars = [(grad, grad)] + grads_and_vars = tf.contrib.layers.adaptive_clipping_fn( + decay=0.5)(grads_and_vars) + + var_dict = {} + for var in tf.all_variables(): + if var.name.startswith("AdaptiveMaxNorm"): + var_dict[var.name.split(":")[0]] = var + self.assertEqual(2, len(var_dict)) + moving_mean = var_dict["AdaptiveMaxNorm/mean"] + moving_sq_mean = var_dict["AdaptiveMaxNorm/sq_mean"] + tf.initialize_all_variables().run() + mean, sq_mean = session.run([moving_mean, moving_sq_mean]) + self.assertEqual([0], mean) + self.assertEqual([0], sq_mean) + for i in range(20): + mean, sq_mean, _ = session.run( + [moving_mean, moving_sq_mean, grads_and_vars[0][0]]) + if i == 0: + self.assertLess(mean, 0.9 * log_norm) + self.assertLess(sq_mean, 0.9 * log_norm**2) + + self.assertAlmostEqual(float(mean), log_norm, places=4) + self.assertAlmostEqual(float(sq_mean), log_norm**2, places=4) + + def testClip(self): + with self.test_session() as session: + spike = 1000. 
+ multiplier = tf.placeholder(tf.float32, [], "multiplier") + step = tf.placeholder(tf.int32, [], "step") + + grad = tf.ones([3, 4]) * multiplier + grads_and_vars = [(grad, grad)] + grads_and_vars = tf.contrib.layers.adaptive_clipping_fn( + decay=0.9, global_step=step)(grads_and_vars) + + tf.initialize_all_variables().run() + def run(scale, i): + return session.run(grads_and_vars[0][0], + feed_dict={multiplier: scale, step: i}) + + for i in range(20): + scale = [1., -2.][i % 2] + clipped_grad = run(scale, i) + if i > 3: + self.assertAllClose(np.ones(clipped_grad.shape)*scale, clipped_grad) + + # assert that the spike will have low influence. + clipped_grad = run(spike, 20) + self.assertTrue((clipped_grad < 25.).all()) + + # assert that a repeated spike will converge to this new value. + for i in range(10): + clipped_grad = run(spike, i + 21) + + self.assertAllClose(np.ones(clipped_grad.shape)*spike, clipped_grad) + if __name__ == "__main__": tf.test.main() diff --git a/tensorflow/contrib/learn/python/learn/estimators/__init__.py b/tensorflow/contrib/learn/python/learn/estimators/__init__.py index 07dd12ebc38..b5b1dbb6355 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/__init__.py +++ b/tensorflow/contrib/learn/python/learn/estimators/__init__.py @@ -35,6 +35,6 @@ from tensorflow.contrib.learn.python.learn.estimators.linear import LinearClassi from tensorflow.contrib.learn.python.learn.estimators.linear import LinearRegressor from tensorflow.contrib.learn.python.learn.estimators.logistic_regressor import LogisticRegressor from tensorflow.contrib.learn.python.learn.estimators.random_forest import TensorForestEstimator -from tensorflow.contrib.learn.python.learn.estimators.random_forest import TensorForestLossMonitor +from tensorflow.contrib.learn.python.learn.estimators.random_forest import TensorForestLossHook from tensorflow.contrib.learn.python.learn.estimators.run_config import RunConfig from tensorflow.contrib.learn.python.learn.estimators.svm 
import SVM diff --git a/tensorflow/contrib/learn/python/learn/estimators/classifier.py b/tensorflow/contrib/learn/python/learn/estimators/classifier.py index 978ab9339b9..cf9ea7e82ae 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/classifier.py +++ b/tensorflow/contrib/learn/python/learn/estimators/classifier.py @@ -20,6 +20,7 @@ from __future__ import division from __future__ import print_function from tensorflow.contrib import metrics as metrics_lib +from tensorflow.contrib.framework import deprecated from tensorflow.contrib.framework import deprecated_arg_values from tensorflow.contrib.learn.python.learn.estimators import estimator from tensorflow.contrib.session_bundle import exporter @@ -27,6 +28,8 @@ from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn +@deprecated('2016-11-30', 'Please write an appropriate function for use with' + ' your estimator.') def classification_signature_fn(examples, unused_features, predictions): """Creates classification signature from given examples and predictions. @@ -61,6 +64,7 @@ class Classifier(estimator.Estimator): CLASS_OUTPUT = 'classes' PROBABILITY_OUTPUT = 'probabilities' + @deprecated('2016-11-30', 'Please use Estimator directly.') def __init__(self, model_fn, n_classes, model_dir=None, config=None, params=None, feature_engineering_fn=None): """Constructor for Classifier. 
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator.py b/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator.py index 241b2b41e5c..ae4c97eae7c 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator.py +++ b/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator.py @@ -309,7 +309,7 @@ class _DynamicRNNEstimator(estimator.BaseEstimator): inputs=rnn_outputs, num_outputs=self._target_column.num_label_columns, activation_fn=None, - trainable=False) + trainable=True) return activations, final_state @abc.abstractmethod diff --git a/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator_test.py b/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator_test.py index f14e65fff55..d5ca3fbeed5 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator_test.py +++ b/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator_test.py @@ -429,7 +429,7 @@ class SingleValueRNNEstimatorTest(tf.test.TestCase): cell_type = 'basic_rnn' cell_size = 8 optimizer_type = 'Momentum' - learning_rate = 0.5 + learning_rate = 0.1 momentum = 0.9 loss_threshold = 0.1 diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator.py b/tensorflow/contrib/learn/python/learn/estimators/estimator.py index 0ebd8088664..1882e1578d8 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/estimator.py +++ b/tensorflow/contrib/learn/python/learn/estimators/estimator.py @@ -36,6 +36,7 @@ from tensorflow.contrib import layers from tensorflow.contrib import metrics as metrics_lib from tensorflow.contrib.framework import deprecated from tensorflow.contrib.framework import deprecated_arg_values +from tensorflow.contrib.framework import get_graph_from_inputs from tensorflow.contrib.framework import list_variables from tensorflow.contrib.framework import load_variable from tensorflow.contrib.learn.python.learn import evaluable @@ -88,8 
+89,11 @@ class ModelFnOps( collections.namedtuple('ModelFnOps', ['predictions', 'loss', 'training_op', 'default_metrics', 'signature_fn'])): - def __new__(cls, predictions, loss, training_op, default_metrics, - signature_fn, mode): + def __new__(cls, mode, predictions=None, loss=None, training_op=None, + default_metrics=None, signature_fn=None): + # Assert all ops are from the same graph. + get_graph_from_inputs((predictions, loss, training_op)) + # Validate training_op. if training_op is None: if mode == ModeKeys.TRAIN: @@ -1042,13 +1046,16 @@ class Estimator(BaseEstimator): if isinstance(model_fn_results, ModelFnOps): return model_fn_results - else: - # Here model_fn_ops should be a tuple with 3 elements. - if len(model_fn_results) != 3: - raise ValueError('Unrecognized value returned by model_fn, ' - 'please return ModelFnOps.') - return ModelFnOps(model_fn_results[0], model_fn_results[1], - model_fn_results[2], None, None, mode) + + # Here model_fn_ops should be a tuple with 3 elements. + if len(model_fn_results) != 3: + raise ValueError('Unrecognized value returned by model_fn, ' + 'please return ModelFnOps.') + return ModelFnOps( + mode=mode, + predictions=model_fn_results[0], + loss=model_fn_results[1], + training_op=model_fn_results[2]) def _get_train_ops(self, features, targets): """Method that builds model graph and returns trainer ops. 
diff --git a/tensorflow/contrib/learn/python/learn/estimators/head.py b/tensorflow/contrib/learn/python/learn/estimators/head.py index 04d2484e8e0..bdb3fe3589e 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/head.py +++ b/tensorflow/contrib/learn/python/learn/estimators/head.py @@ -229,20 +229,30 @@ class _Head(object): else: train_op = control_flow_ops.group(*additional_train_op) - return estimator.ModelFnOps(None, loss, train_op, - self._default_metric(), - self._create_signature_fn(), mode) + return estimator.ModelFnOps( + mode=estimator.ModeKeys.TRAIN, + loss=loss, + training_op=train_op, + default_metrics=self._default_metric(), + signature_fn=self._create_signature_fn()) + if mode == estimator.ModeKeys.INFER: - predictions = self._infer_op(logits, logits_input) - return estimator.ModelFnOps(predictions, None, None, - self._default_metric(), - self._create_signature_fn(), mode) + return estimator.ModelFnOps( + mode=estimator.ModeKeys.INFER, + predictions=self._infer_op(logits, logits_input), + default_metrics=self._default_metric(), + signature_fn=self._create_signature_fn()) + if mode == estimator.ModeKeys.EVAL: predictions, loss = self._eval_op(features, target, logits, logits_input) - return estimator.ModelFnOps(predictions, loss, None, - self._default_metric(), - self._create_signature_fn(), mode) - raise ValueError("mode=%s unrecognized" % str(mode)) + return estimator.ModelFnOps( + mode=estimator.ModeKeys.EVAL, + predictions=predictions, + loss=loss, + default_metrics=self._default_metric(), + signature_fn=self._create_signature_fn()) + + raise ValueError("mode=%s unrecognized." 
% str(mode)) @abc.abstractmethod def _training_loss(self, features, target, logits=None, logits_input=None, diff --git a/tensorflow/contrib/learn/python/learn/estimators/random_forest.py b/tensorflow/contrib/learn/python/learn/estimators/random_forest.py index 58b4389a000..86f8c5dd028 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/random_forest.py +++ b/tensorflow/contrib/learn/python/learn/estimators/random_forest.py @@ -17,25 +17,28 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import numpy as np -import six - from tensorflow.contrib import framework as contrib_framework from tensorflow.contrib.framework import deprecated_arg_values -from tensorflow.contrib.learn.python.learn import monitors as mon +from tensorflow.contrib.learn.python.learn import evaluable +from tensorflow.contrib.learn.python.learn import trainable from tensorflow.contrib.learn.python.learn.estimators import estimator +from tensorflow.contrib.learn.python.learn.utils import export from tensorflow.contrib.tensor_forest.client import eval_metrics from tensorflow.contrib.tensor_forest.data import data_ops from tensorflow.contrib.tensor_forest.python import tensor_forest from tensorflow.python.framework import dtypes -from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import state_ops from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.training import session_run_hook + + +KEYS_NAME = 'keys' +LOSS_NAME = 'rf_training_loss' def _assert_float32(tensors): @@ -56,58 +59,124 @@ def _assert_float32(tensors): raise TypeError('Expected dtype=float32, %s.' 
% tensor) -class TensorForestLossMonitor(mon.EveryN): - """Terminates training when training loss stops decreasing.""" +class TensorForestLossHook(session_run_hook.SessionRunHook): + """Monitor to request stop when loss stops decreasing.""" - def __init__(self, - early_stopping_rounds, - every_n_steps): - super(TensorForestLossMonitor, self).__init__(every_n_steps=every_n_steps) + def __init__(self, early_stopping_rounds): self.early_stopping_rounds = early_stopping_rounds self.min_loss = None - self.min_loss_step = 0 + self.last_step = -1 + # self.steps records the number of steps for which the loss has been + # non-decreasing + self.steps = 0 - def step_begin(self, step): - super(TensorForestLossMonitor, self).step_begin(step) - return [self._loss_op_name] + def before_run(self, run_context): + return session_run_hook.SessionRunArgs( + {'global_step': contrib_framework.get_global_step(), + 'current_loss': run_context.session.graph.get_operation_by_name( + LOSS_NAME).outputs[0]}) - def set_estimator(self, est): - """This function gets called in the same graph as _get_train_ops.""" - super(TensorForestLossMonitor, self).set_estimator(est) - self._loss_op_name = est.training_loss.name + def after_run(self, run_context, run_values): + current_loss = run_values.results['current_loss'] + current_step = run_values.results['global_step'] + self.steps += 1 + # Gaurd against the global step going backwards, which might happen + # if we recover from something. 
+ if self.last_step == -1 or self.last_step > current_step: + logging.info('TensorForestLossHook resetting last_step.') + self.last_step = current_step + self.steps = 0 + return - def every_n_step_end(self, step, outputs): - super(TensorForestLossMonitor, self).every_n_step_end(step, outputs) - current_loss = outputs[self._loss_op_name] if self.min_loss is None or current_loss < self.min_loss: self.min_loss = current_loss - self.min_loss_step = step - return step - self.min_loss_step >= self.early_stopping_rounds + self.steps = 0 + if self.steps > self.early_stopping_rounds: + logging.info('TensorForestLossHook requesting stop.') + run_context.request_stop() -class TensorForestEstimator(estimator.BaseEstimator): +def get_model_fn(params, graph_builder_class, device_assigner, + weights_name=None, keys_name=None): + """Return a model function given a way to construct a graph builder.""" + def _model_fn(features, targets): + """Function that returns predictions, training loss, and training op.""" + weights = None + keys = None + if weights_name and weights_name in features: + weights = features.pop(weights_name) + if keys_name and keys_name in features: + keys = features.pop(keys_name) + processed_features, spec = data_ops.ParseDataTensorOrDict(features) + _assert_float32(processed_features) + if targets is not None: + targets = data_ops.ParseLabelTensorOrDict(targets) + _assert_float32(targets) + + graph_builder = graph_builder_class(params, device_assigner=device_assigner) + inference = {eval_metrics.INFERENCE_PROB_NAME: + graph_builder.inference_graph(processed_features, + data_spec=spec)} + if not params.regression: + inference[eval_metrics.INFERENCE_PRED_NAME] = math_ops.argmax( + inference[eval_metrics.INFERENCE_PROB_NAME], 1) + if keys: + inference[KEYS_NAME] = keys + + # targets might be None if we're doing prediction (which brings up the + # question of why we force everything to adhere to a single model_fn). 
+ training_loss = None + training_graph = None + if targets is not None: + training_loss = graph_builder.training_loss(processed_features, targets, + data_spec=spec, + name=LOSS_NAME) + training_graph = control_flow_ops.group( + graph_builder.training_graph( + processed_features, targets, data_spec=spec, + input_weights=weights), + state_ops.assign_add(contrib_framework.get_global_step(), 1)) + # Put weights back in + if weights is not None: + features[weights_name] = weights + return (inference, training_loss, training_graph) + return _model_fn + + +class TensorForestEstimator(evaluable.Evaluable, trainable.Trainable): """An estimator that can train and evaluate a random forest.""" def __init__(self, params, device_assigner=None, model_dir=None, graph_builder_class=tensor_forest.RandomForestGraphs, - master='', accuracy_metric=None, - tf_random_seed=None, config=None, - feature_engineering_fn=None): + config=None, weights_name=None, keys_name=None, + feature_engineering_fn=None, early_stopping_rounds=100): self.params = params.fill() - self.accuracy_metric = (accuracy_metric or - ('r2' if self.params.regression else 'accuracy')) - self.data_feeder = None - self.device_assigner = ( - device_assigner or tensor_forest.RandomForestDeviceAssigner()) self.graph_builder_class = graph_builder_class - self.training_args = {} - self.construction_args = {} - self._feature_engineering_fn = ( - feature_engineering_fn or - (lambda features, targets: (features, targets))) + self.early_stopping_rounds = early_stopping_rounds + self._estimator = estimator.Estimator( + model_fn=get_model_fn(params, graph_builder_class, device_assigner, + weights_name=weights_name, keys_name=keys_name), + model_dir=model_dir, + config=config, + feature_engineering_fn=feature_engineering_fn) - super(TensorForestEstimator, self).__init__(model_dir=model_dir, - config=config) + def evaluate( + self, x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, + steps=None, metrics=None, name=None): + 
"""See evaluable.Evaluable.""" + return self._estimator.evaluate( + input_fn=input_fn, x=x, y=y, feed_fn=feed_fn, + batch_size=batch_size, steps=steps, + metrics=metrics, name=name) + + def fit(self, x=None, y=None, input_fn=None, steps=None, batch_size=None, + monitors=None, max_steps=None): + """See trainable.Trainable.""" + if not monitors: + monitors = [TensorForestLossHook(self.early_stopping_rounds)] + self._estimator.fit(input_fn=input_fn, x=x, y=y, + batch_size=batch_size, steps=steps, monitors=monitors, + max_steps=max_steps) @deprecated_arg_values( estimator.AS_ITERABLE_DATE, estimator.AS_ITERABLE_INSTRUCTIONS, @@ -135,13 +204,14 @@ class TensorForestEstimator(estimator.BaseEstimator): Raises: ValueError: If both or neither of x and input_fn were given. """ - results = super(TensorForestEstimator, self).predict( + results = self._estimator.predict( x=x, input_fn=input_fn, batch_size=batch_size, outputs=outputs, as_iterable=as_iterable) + if as_iterable: - return (r['probabilities'] for r in results) + return (x[eval_metrics.INFERENCE_PROB_NAME] for x in results) else: - return results['probabilities'] + return results[eval_metrics.INFERENCE_PROB_NAME] @deprecated_arg_values( estimator.AS_ITERABLE_DATE, estimator.AS_ITERABLE_INSTRUCTIONS, @@ -168,16 +238,16 @@ class TensorForestEstimator(estimator.BaseEstimator): Numpy array of predicted classes or regression values (or an iterable of predictions if as_iterable is True). 
""" - probabilities = self.predict_proba( + results = self._estimator.predict( x=x, input_fn=input_fn, batch_size=batch_size, outputs=outputs, as_iterable=as_iterable) - if self.params.regression: - return probabilities + + predict_name = (eval_metrics.INFERENCE_PROB_NAME if self.params.regression + else eval_metrics.INFERENCE_PRED_NAME) + if as_iterable: + return (x[predict_name] for x in results) else: - if as_iterable: - return (np.argmax(p, axis=0) for p in probabilities) - else: - return np.argmax(probabilities, axis=1) + return results[predict_name] @deprecated_arg_values( estimator.AS_ITERABLE_DATE, estimator.AS_ITERABLE_INSTRUCTIONS, @@ -186,100 +256,40 @@ class TensorForestEstimator(estimator.BaseEstimator): self, x=None, input_fn=None, axis=None, batch_size=None, outputs=None, as_iterable=True): """Same as predict but also returns the example keys.""" - results = super(TensorForestEstimator, self).predict( + results = self._estimator.predict( x=x, input_fn=input_fn, batch_size=batch_size, outputs=outputs, as_iterable=as_iterable) - if self.params.regression: - if as_iterable: - return ((r['probabilities'], r.get('keys', None)) for r in results) - else: - return results['probabilities'], results.get('keys', None) + + predict_name = (eval_metrics.INFERENCE_PROB_NAME if self.params.regression + else eval_metrics.INFERENCE_PRED_NAME) + if as_iterable: + return ((x[predict_name], x.get(KEYS_NAME, None)) for x in results) else: - if as_iterable: - return ((np.argmax(r['probabilities'], axis=0), - r.get('keys', None)) for r in results) - - else: - return np.argmax(results['probabilities'], axis=1), results.get('keys', - None) - - def _get_train_ops(self, features, targets): - """Method that builds model graph and returns trainer ops. - - Args: - features: `Tensor` or `dict` of `Tensor` objects. - targets: `Tensor` or `dict` of `Tensor` objects. - - Returns: - Tuple of train `Operation` and loss `Tensor`. 
- """ - features, _, weights, spec = data_ops.ParseDataTensorOrDict(features) - labels = data_ops.ParseLabelTensorOrDict(targets) - features, labels = self._feature_engineering_fn(features, labels) - _assert_float32(features) - _assert_float32(labels) - - if weights is not None: - if 'input_weights' in self.training_args: - logging.warning('Replacing input_weights in training_args.') - self.training_args['input_weights'] = weights - - graph_builder = self.graph_builder_class( - self.params, device_assigner=self.device_assigner, - **self.construction_args) - - epoch = None - if self.data_feeder: - epoch = self.data_feeder.make_epoch_variable() - - train = control_flow_ops.group( - graph_builder.training_graph( - features, labels, data_spec=spec, epoch=epoch, - **self.training_args), - state_ops.assign_add(contrib_framework.get_global_step(), 1)) - - self.training_loss = graph_builder.training_loss(features, targets) - - return train, self.training_loss - - def _get_predict_ops(self, features): - graph_builder = self.graph_builder_class( - self.params, device_assigner=self.device_assigner, training=False, - **self.construction_args) - features, keys, _, spec = data_ops.ParseDataTensorOrDict(features) - features, _ = self._feature_engineering_fn(features, None) - _assert_float32(features) - output_dict = { - 'probabilities': graph_builder.inference_graph(features, - data_spec=spec)} - if keys is not None: - output_dict['keys'] = keys - return output_dict - - def _get_eval_ops(self, features, targets, metrics): - features, _, _, spec = data_ops.ParseDataTensorOrDict(features) - labels = data_ops.ParseLabelTensorOrDict(targets) - features, labels = self._feature_engineering_fn(features, labels) - _assert_float32(features) - _assert_float32(labels) - - graph_builder = self.graph_builder_class( - self.params, device_assigner=self.device_assigner, training=False, - **self.construction_args) - - probabilities = graph_builder.inference_graph(features, data_spec=spec) - - # 
One-hot the labels. - if not self.params.regression: - labels = math_ops.to_int64(array_ops.one_hot(math_ops.to_int64( - array_ops.squeeze(labels)), self.params.num_classes, 1, 0)) - - if metrics is None: - metrics = {self.accuracy_metric: - eval_metrics.get_metric(self.accuracy_metric)} - - result = {} - for name, metric in six.iteritems(metrics): - result[name] = metric(probabilities, labels) + return results[predict_name], results.get(KEYS_NAME, None) + def export(self, + export_dir, + input_fn, + signature_fn=None, + default_batch_size=1): + """See BaseEstimator.export.""" + # Reset model function with basic device assigner. + # Servo doesn't support distributed inference + # but it will try to respect device assignments if they're there. + # pylint: disable=protected-access + orig_model_fn = self._estimator._model_fn + self._estimator._model_fn = get_model_fn( + self.params, self.graph_builder_class, + tensor_forest.RandomForestDeviceAssigner()) + result = self._estimator.export( + export_dir=export_dir, + use_deprecated_input_fn=True, + signature_fn=(signature_fn or + (export.regression_signature_fn + if self.params.regression else + export.classification_signature_fn_with_prob)), + default_batch_size=default_batch_size, + prediction_key=eval_metrics.INFERENCE_PROB_NAME) + self._estimator._model_fn = orig_model_fn + # pylint: enable=protected-access return result diff --git a/tensorflow/contrib/learn/python/learn/estimators/random_forest_test.py b/tensorflow/contrib/learn/python/learn/estimators/random_forest_test.py index a1216be1fe9..9242aa98969 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/random_forest_test.py +++ b/tensorflow/contrib/learn/python/learn/estimators/random_forest_test.py @@ -28,14 +28,30 @@ class TensorForestTrainerTests(tf.test.TestCase): def testClassification(self): """Tests multi-class classification using matrix data as input.""" hparams = tf.contrib.tensor_forest.python.tensor_forest.ForestHParams( - num_trees=3, 
max_nodes=1000, num_classes=3, num_features=4) - classifier = tf.contrib.learn.TensorForestEstimator(hparams) + num_trees=3, max_nodes=1000, num_classes=3, num_features=4, + split_after_samples=20) + classifier = tf.contrib.learn.TensorForestEstimator(hparams.fill()) iris = tf.contrib.learn.datasets.load_iris() data = iris.data.astype(np.float32) target = iris.target.astype(np.float32) - monitors = [tf.contrib.learn.TensorForestLossMonitor(10, 10)] + classifier.fit(x=data, y=target, steps=100, batch_size=50) + classifier.evaluate(x=data, y=target, steps=10) + + def testClassificationTrainingLoss(self): + """Tests multi-class classification using matrix data as input.""" + hparams = tf.contrib.tensor_forest.python.tensor_forest.ForestHParams( + num_trees=3, max_nodes=1000, num_classes=3, num_features=4) + classifier = tf.contrib.learn.TensorForestEstimator( + hparams, graph_builder_class=( + tf.contrib.tensor_forest.python.tensor_forest.TrainingLossForest)) + + iris = tf.contrib.learn.datasets.load_iris() + data = iris.data.astype(np.float32) + target = iris.target.astype(np.float32) + + monitors = [tf.contrib.learn.TensorForestLossHook(10)] classifier.fit(x=data, y=target, steps=100, monitors=monitors) classifier.evaluate(x=data, y=target, steps=10) @@ -44,16 +60,15 @@ class TensorForestTrainerTests(tf.test.TestCase): hparams = tf.contrib.tensor_forest.python.tensor_forest.ForestHParams( num_trees=3, max_nodes=1000, num_classes=1, num_features=13, - regression=True) + regression=True, split_after_samples=20) - regressor = tf.contrib.learn.TensorForestEstimator(hparams) + regressor = tf.contrib.learn.TensorForestEstimator(hparams.fill()) boston = tf.contrib.learn.datasets.load_boston() data = boston.data.astype(np.float32) target = boston.target.astype(np.float32) - monitors = [tf.contrib.learn.TensorForestLossMonitor(10, 10)] - regressor.fit(x=data, y=target, steps=100, monitors=monitors) + regressor.fit(x=data, y=target, steps=100, batch_size=50) 
regressor.evaluate(x=data, y=target, steps=10) diff --git a/tensorflow/contrib/learn/python/learn/graph_actions.py b/tensorflow/contrib/learn/python/learn/graph_actions.py index c7ce09de28c..0c5152b553f 100644 --- a/tensorflow/contrib/learn/python/learn/graph_actions.py +++ b/tensorflow/contrib/learn/python/learn/graph_actions.py @@ -627,7 +627,7 @@ def _eval_results_to_str(eval_results): def _write_summary_results(output_dir, eval_results, current_global_step): """Writes eval results into summary file in given dir.""" - logging.info('Saving evaluation summary for %d step: %s', current_global_step, + logging.info('Saving evaluation summary for step %d: %s', current_global_step, _eval_results_to_str(eval_results)) summary_writer = get_summary_writer(output_dir) summary = summary_pb2.Summary() diff --git a/tensorflow/contrib/learn/python/learn/learn_io/graph_io.py b/tensorflow/contrib/learn/python/learn/learn_io/graph_io.py index 21ce65b7eb4..933c7456f5d 100644 --- a/tensorflow/contrib/learn/python/learn/learn_io/graph_io.py +++ b/tensorflow/contrib/learn/python/learn/learn_io/graph_io.py @@ -253,6 +253,18 @@ def _get_shared_file_name_queue(file_names, shuffle, num_epochs, name): def _get_file_names(file_pattern, randomize_input): + """Parse list of file names from pattern, optionally shuffled. + + Args: + file_pattern: File glob pattern, or list of strings. + randomize_input: Whether to shuffle the order of file names. + + Returns: + List of file names matching `file_pattern`. + + Raises: + ValueError: If `file_pattern` is empty, or pattern matches no files. + """ if isinstance(file_pattern, list): file_names = file_pattern if not file_names: @@ -304,6 +316,36 @@ def _read_keyed_batch_examples_helper(file_pattern, parse_fn=None, setup_shared_queue=False, name=None): + """Adds operations to read, queue, batch `Example` protos. + + Args: + file_pattern: List of files or pattern of file paths containing + `Example` records. See `tf.gfile.Glob` for pattern rules. 
+ batch_size: An int or scalar `Tensor` specifying the batch size to use. + reader: A function or class that returns an object with + `read` method, (filename tensor) -> (example tensor). + randomize_input: Whether the input should be randomized. + num_epochs: Integer specifying the number of times to read through the + dataset. If `None`, cycles through the dataset forever. + NOTE - If specified, creates a variable that must be initialized, so call + `tf.initialize_all_variables()` as shown in the tests. + queue_capacity: Capacity for input queue. + num_threads: The number of threads enqueuing examples. + read_batch_size: An int or scalar `Tensor` specifying the number of + records to read at once + parse_fn: Parsing function, takes `Example` Tensor returns parsed + representation. If `None`, no parsing is done. + setup_shared_queue: Whether to set up a shared queue for file names. + name: Name of resulting op. + + Returns: + Returns tuple of: + - `Tensor` of string keys. + - String `Tensor` of batched `Example` proto. + + Raises: + ValueError: for invalid inputs. + """ # Retrieve files to read. file_names = _get_file_names(file_pattern, randomize_input) @@ -348,10 +390,10 @@ def _read_keyed_batch_examples_helper(file_pattern, enqueue_many = read_batch_size > 1 - if num_epochs is not None: - allow_smaller_final_batch = True - else: + if num_epochs is None: allow_smaller_final_batch = False + else: + allow_smaller_final_batch = True # Setup batching queue given list of read example tensors. if randomize_input: @@ -505,7 +547,6 @@ def _read_keyed_batch_features_shared_queue(file_pattern, Adding multiple queue runners for the parsed example queue helps maintain a full queue when the subsequent computations overall are cheaper than parsing. - parser_num_threads: (Deprecated) The number of threads to parse examples. parse_fn: Parsing function, takes `Example` Tensor returns parsed representation. If `None`, no parsing is done. name: Name of resulting op. 
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/graph_io_test.py b/tensorflow/contrib/learn/python/learn/learn_io/graph_io_test.py index f9f42bbfad2..8491bb707bf 100644 --- a/tensorflow/contrib/learn/python/learn/learn_io/graph_io_test.py +++ b/tensorflow/contrib/learn/python/learn/learn_io/graph_io_test.py @@ -121,7 +121,8 @@ class GraphIOTest(tf.test.TestCase): batch_size = 17 queue_capacity = 1234 name = "my_batch" - features = {"feature": tf.FixedLenFeature(shape=[0], dtype=tf.float32)} + shape = (0,) + features = {"feature": tf.FixedLenFeature(shape=shape, dtype=tf.float32)} with tf.Graph().as_default() as g, self.test_session(graph=g) as sess: features = tf.contrib.learn.io.read_batch_record_features( @@ -132,8 +133,11 @@ class GraphIOTest(tf.test.TestCase): queue_capacity=queue_capacity, reader_num_threads=2, name=name) - self.assertEqual("%s/fifo_queue_1_Dequeue:0" % name, - features["feature"].name) + self.assertTrue( + "feature" in features, "'feature' missing from %s." 
% features.keys()) + feature = features["feature"] + self.assertEqual("%s/fifo_queue_1_Dequeue:0" % name, feature.name) + self.assertAllEqual((batch_size,) + shape, feature.get_shape().as_list()) file_name_queue_name = "%s/file_name_queue" % name file_names_name = "%s/input" % file_name_queue_name example_queue_name = "%s/fifo_queue" % name @@ -161,6 +165,7 @@ class GraphIOTest(tf.test.TestCase): reader=tf.TFRecordReader, randomize_input=True, num_epochs=1, queue_capacity=queue_capacity, name=name) + self.assertAllEqual((None,), inputs.get_shape().as_list()) self.assertEqual("%s:1" % name, inputs.name) file_name_queue_name = "%s/file_name_queue" % name file_name_queue_limit_name = ( @@ -190,6 +195,7 @@ class GraphIOTest(tf.test.TestCase): _VALID_FILE_PATTERN, batch_size, reader=tf.TFRecordReader, randomize_input=True, queue_capacity=queue_capacity, name=name) + self.assertAllEqual((batch_size,), inputs.get_shape().as_list()) self.assertEqual("%s:1" % name, inputs.name) file_name_queue_name = "%s/file_name_queue" % name file_names_name = "%s/input" % file_name_queue_name @@ -234,6 +240,7 @@ class GraphIOTest(tf.test.TestCase): filename, batch_size, reader=tf.TextLineReader, randomize_input=False, num_epochs=1, queue_capacity=queue_capacity, name=name) + self.assertAllEqual((None,), inputs.get_shape().as_list()) session.run(tf.initialize_local_variables()) coord = tf.train.Coordinator() @@ -280,10 +287,13 @@ class GraphIOTest(tf.test.TestCase): features = {"sequence": tf.FixedLenFeature([], tf.string)} with tf.Graph().as_default() as g, self.test_session(graph=g) as session: - _, result = tf.contrib.learn.read_keyed_batch_features( + keys, result = tf.contrib.learn.read_keyed_batch_features( filename, batch_size, features, tf.TextLineReader, randomize_input=False, num_epochs=1, queue_capacity=queue_capacity, num_enqueue_threads=2, parse_fn=tf.decode_json_example, name=name) + self.assertAllEqual((None,), keys.get_shape().as_list()) + self.assertEqual(1, len(result)) 
+ self.assertAllEqual((None,), result["sequence"].get_shape().as_list()) session.run(tf.initialize_local_variables()) coord = tf.train.Coordinator() threads = tf.train.start_queue_runners(session, coord=coord) @@ -319,6 +329,7 @@ class GraphIOTest(tf.test.TestCase): filenames, batch_size, reader=tf.TextLineReader, randomize_input=False, num_epochs=1, queue_capacity=queue_capacity, name=name) + self.assertAllEqual((None,), inputs.get_shape().as_list()) session.run(tf.initialize_local_variables()) coord = tf.train.Coordinator() @@ -354,7 +365,7 @@ class GraphIOTest(tf.test.TestCase): name = "my_batch" with tf.Graph().as_default() as g, self.test_session(graph=g) as session: - _, inputs = _read_keyed_batch_examples_shared_queue( + keys, inputs = _read_keyed_batch_examples_shared_queue( filenames, batch_size, reader=tf.TextLineReader, @@ -362,6 +373,8 @@ class GraphIOTest(tf.test.TestCase): num_epochs=1, queue_capacity=queue_capacity, name=name) + self.assertAllEqual((None,), keys.get_shape().as_list()) + self.assertAllEqual((None,), inputs.get_shape().as_list()) session.run(tf.initialize_local_variables()) coord = tf.train.Coordinator() @@ -418,7 +431,7 @@ class GraphIOTest(tf.test.TestCase): with tf.Graph().as_default() as g1, tf.Session( server.target, graph=g1) as session: - _, inputs = _read_keyed_batch_examples_shared_queue( + keys, inputs = _read_keyed_batch_examples_shared_queue( filenames, batch_size, reader=tf.TextLineReader, @@ -426,6 +439,8 @@ class GraphIOTest(tf.test.TestCase): num_epochs=1, queue_capacity=queue_capacity, name=name) + self.assertAllEqual((None,), keys.get_shape().as_list()) + self.assertAllEqual((None,), inputs.get_shape().as_list()) session.run(tf.initialize_local_variables()) # Run the three queues once manually. 
@@ -443,7 +458,7 @@ class GraphIOTest(tf.test.TestCase): with tf.Graph().as_default() as g2, tf.Session( server.target, graph=g2) as session: - _, inputs = _read_keyed_batch_examples_shared_queue( + keys, inputs = _read_keyed_batch_examples_shared_queue( filenames, batch_size, reader=tf.TextLineReader, @@ -451,6 +466,8 @@ class GraphIOTest(tf.test.TestCase): num_epochs=1, queue_capacity=queue_capacity, name=name) + self.assertAllEqual((None,), keys.get_shape().as_list()) + self.assertAllEqual((None,), inputs.get_shape().as_list()) # Run the worker and the example queue. self._run_queue(worker_file_name_queue_name, session) @@ -473,6 +490,7 @@ class GraphIOTest(tf.test.TestCase): [filename], batch_size, reader=tf.TextLineReader, randomize_input=False, num_epochs=1, queue_capacity=queue_capacity, read_batch_size=10, name=name) + self.assertAllEqual((None,), inputs.get_shape().as_list()) session.run(tf.initialize_local_variables()) coord = tf.train.Coordinator() @@ -499,6 +517,8 @@ class GraphIOTest(tf.test.TestCase): filename, batch_size, reader=tf.TextLineReader, randomize_input=False, num_epochs=1, queue_capacity=queue_capacity, name=name) + self.assertAllEqual((None,), keys.get_shape().as_list()) + self.assertAllEqual((None,), inputs.get_shape().as_list()) session.run(tf.initialize_local_variables()) coord = tf.train.Coordinator() @@ -537,6 +557,9 @@ class GraphIOTest(tf.test.TestCase): reader=tf.TextLineReader, randomize_input=False, num_epochs=1, queue_capacity=queue_capacity, parse_fn=parse_fn, name=name) + self.assertAllEqual((None,), keys.get_shape().as_list()) + self.assertEqual(1, len(inputs)) + self.assertAllEqual((None, 1), inputs["age"].get_shape().as_list()) session.run(tf.initialize_local_variables()) coord = tf.train.Coordinator() diff --git a/tensorflow/contrib/learn/python/learn/utils/export.py b/tensorflow/contrib/learn/python/learn/utils/export.py index 5313dd3a4ea..4dbd23b5f6a 100644 --- a/tensorflow/contrib/learn/python/learn/utils/export.py +++ 
b/tensorflow/contrib/learn/python/learn/utils/export.py @@ -24,6 +24,7 @@ from tensorflow.contrib.framework import deprecated_arg_values from tensorflow.contrib.framework.python.ops import variables as contrib_variables from tensorflow.contrib.session_bundle import exporter from tensorflow.contrib.session_bundle import gc +from tensorflow.core.protobuf import saver_pb2 from tensorflow.python.client import session as tf_session from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops @@ -53,7 +54,7 @@ def _get_saver(): else: saver = None if saver is None and variables.all_variables(): - saver = tf_saver.Saver() + saver = tf_saver.Saver(write_version=saver_pb2.SaverDef.V1) ops.add_to_collection(ops.GraphKeys.SAVERS, saver) return saver diff --git a/tensorflow/contrib/makefile/tf_op_files.txt b/tensorflow/contrib/makefile/tf_op_files.txt index d7a4d8873c9..ed5d6539b3b 100644 --- a/tensorflow/contrib/makefile/tf_op_files.txt +++ b/tensorflow/contrib/makefile/tf_op_files.txt @@ -21,6 +21,7 @@ tensorflow/core/kernels/strided_slice_op_inst_4.cc tensorflow/core/kernels/strided_slice_op_inst_3.cc tensorflow/core/kernels/strided_slice_op_inst_2.cc tensorflow/core/kernels/strided_slice_op_inst_1.cc +tensorflow/core/kernels/strided_slice_op_inst_0.cc tensorflow/core/kernels/strided_slice_op.cc tensorflow/core/kernels/stack_ops.cc tensorflow/core/kernels/split_op.cc @@ -142,6 +143,7 @@ tensorflow/core/kernels/avgpooling_op.cc tensorflow/core/kernels/argmax_op.cc tensorflow/core/kernels/aggregate_ops.cc tensorflow/core/kernels/dequantize_op.cc +tensorflow/core/kernels/meta_support.cc tensorflow/core/kernels/quantization_utils.cc tensorflow/core/kernels/quantize_down_and_shrink_range.cc tensorflow/core/kernels/quantize_op.cc @@ -153,6 +155,7 @@ tensorflow/core/kernels/quantized_conv_ops.cc tensorflow/core/kernels/quantized_matmul_op.cc tensorflow/core/kernels/quantized_pooling_ops.cc tensorflow/core/kernels/quantized_reshape_op.cc 
+tensorflow/core/kernels/requantization_range_op.cc tensorflow/core/kernels/requantize.cc tensorflow/core/ops/training_ops.cc tensorflow/core/ops/string_ops.cc diff --git a/tensorflow/contrib/metrics/__init__.py b/tensorflow/contrib/metrics/__init__.py index a0b7b1ccfff..fc98a8d3df4 100644 --- a/tensorflow/contrib/metrics/__init__.py +++ b/tensorflow/contrib/metrics/__init__.py @@ -95,11 +95,6 @@ Certain metrics, such as streaming_mean or streaming_accuracy, can be weighted via a `weights` argument. The `weights` tensor must be the same size as the labels and predictions tensors and results in a weighted average of the metric. -Other metrics, such as streaming_recall, streaming_precision, and streaming_auc, -are not well defined with regard to weighted samples. However, a binary -`ignore_mask` argument can be used to ignore certain values at graph executation -time. - ## Metric `Ops` @@streaming_accuracy diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py index a15783149f4..c7d20613713 100644 --- a/tensorflow/contrib/metrics/python/ops/metric_ops.py +++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py @@ -23,7 +23,6 @@ from __future__ import division from __future__ import print_function from tensorflow.contrib.framework import deprecated -from tensorflow.contrib.framework import deprecated_args from tensorflow.contrib.framework import tensor_util from tensorflow.contrib.framework.python.ops import variables as contrib_variables from tensorflow.contrib.metrics.python.ops import confusion_matrix_ops @@ -41,40 +40,6 @@ from tensorflow.python.ops import variable_scope from tensorflow.python.ops import variables -IGNORE_MASK_DATE = '2016-10-19' -IGNORE_MASK_INSTRUCTIONS = ( - '`ignore_mask` is being deprecated. Instead use `weights` with values 0.0 ' - 'and 1.0 to mask values. 
For example, `weights=tf.logical_not(mask)`.') - - -def _mask_weights(mask=None, weights=None): - """Mask a given set of weights. - - Elements are included when the corresponding `mask` element is `False`, and - excluded otherwise. - - Args: - mask: An optional, `bool` `Tensor`. - weights: An optional `Tensor` whose shape matches `mask` if `mask` is not - `None`. - - Returns: - Masked weights if `mask` and `weights` are not `None`, weights equivalent to - `mask` if `weights` is `None`, and otherwise `weights`. - - Raises: - ValueError: If `weights` and `mask` are not `None` and have mismatched - shapes. - """ - if mask is not None: - check_ops.assert_type(mask, dtypes.bool) - if weights is None: - weights = array_ops.ones_like(mask, dtype=dtypes.float32) - weights = math_ops.cast(math_ops.logical_not(mask), weights.dtype) * weights - - return weights - - def _safe_div(numerator, denominator, name): """Divides two values, returning 0 if the denominator is <= 0. @@ -516,8 +481,7 @@ def streaming_accuracy(predictions, labels, weights=None, updates_collections, name or 'accuracy') -@deprecated_args(IGNORE_MASK_DATE, IGNORE_MASK_INSTRUCTIONS, 'ignore_mask') -def streaming_precision(predictions, labels, ignore_mask=None, weights=None, +def streaming_precision(predictions, labels, weights=None, metrics_collections=None, updates_collections=None, name=None): """Computes the precision of the predictions with respect to the labels. @@ -534,14 +498,11 @@ def streaming_precision(predictions, labels, ignore_mask=None, weights=None, `weights`. If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. - Alternatively, if `ignore_mask` is not `None`, then mask values where - `ignore_mask` is `True`. Args: predictions: The predicted values, a `bool` `Tensor` of arbitrary shape. labels: The ground truth values, a `bool` `Tensor` whose dimensions must match `predictions`. - ignore_mask: An optional, `bool` `Tensor` whose shape matches `predictions`. 
weights: An optional `Tensor` whose shape is broadcastable to `predictions`. metrics_collections: An optional list of collections that `precision` should be added to. @@ -558,9 +519,8 @@ def streaming_precision(predictions, labels, ignore_mask=None, weights=None, Raises: ValueError: If `predictions` and `labels` have mismatched shapes, or if - `ignore_mask` is not `None` and its shape doesn't match `predictions`, or - if `weights` is not `None` and its shape doesn't match `predictions`, or - if either `metrics_collections` or `updates_collections` are not a list or + `weights` is not `None` and its shape doesn't match `predictions`, or if + either `metrics_collections` or `updates_collections` are not a list or tuple. """ with variable_scope.variable_scope( @@ -570,7 +530,6 @@ def streaming_precision(predictions, labels, ignore_mask=None, weights=None, predictions, labels, weights) predictions.get_shape().assert_is_compatible_with(labels.get_shape()) - weights = _mask_weights(ignore_mask, weights) true_positives, true_positives_update_op = _streaming_true_positives( predictions, labels, weights, metrics_collections=None, updates_collections=None, name=None) @@ -599,8 +558,7 @@ def streaming_precision(predictions, labels, ignore_mask=None, weights=None, return precision, update_op -@deprecated_args(IGNORE_MASK_DATE, IGNORE_MASK_INSTRUCTIONS, 'ignore_mask') -def streaming_recall(predictions, labels, ignore_mask=None, weights=None, +def streaming_recall(predictions, labels, weights=None, metrics_collections=None, updates_collections=None, name=None): """Computes the recall of the predictions with respect to the labels. @@ -615,14 +573,11 @@ def streaming_recall(predictions, labels, ignore_mask=None, weights=None, weights each prediction by the corresponding value in `weights`. If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. - Alternatively, if `ignore_mask` is not `None`, then mask values where - `ignore_mask` is `True`. 
Args: predictions: The predicted values, a `bool` `Tensor` of arbitrary shape. labels: The ground truth values, a `bool` `Tensor` whose dimensions must match `predictions`. - ignore_mask: An optional, `bool` `Tensor` whose shape matches `predictions`. weights: An optional `Tensor` whose shape is broadcastable to `predictions`. metrics_collections: An optional list of collections that `recall` should be added to. @@ -639,9 +594,8 @@ def streaming_recall(predictions, labels, ignore_mask=None, weights=None, Raises: ValueError: If `predictions` and `labels` have mismatched shapes, or if - `ignore_mask` is not `None` and its shape doesn't match `predictions`, or - if `weights` is not `None` and its shape doesn't match `predictions`, or - if either `metrics_collections` or `updates_collections` are not a list or + `weights` is not `None` and its shape doesn't match `predictions`, or if + either `metrics_collections` or `updates_collections` are not a list or tuple. """ with variable_scope.variable_scope(name, 'recall', [predictions, labels]): @@ -649,7 +603,6 @@ def streaming_recall(predictions, labels, ignore_mask=None, weights=None, predictions, labels, weights) predictions.get_shape().assert_is_compatible_with(labels.get_shape()) - weights = _mask_weights(ignore_mask, weights) true_positives, true_positives_update_op = _streaming_true_positives( predictions, labels, weights, metrics_collections=None, updates_collections=None, name=None) @@ -1235,10 +1188,9 @@ def _at_k_name(name, k=None, class_id=None): @deprecated('2016-11-08', 'Please use `streaming_sparse_recall_at_k`, ' 'and reshape labels from [batch_size] to [batch_size, 1].') -@deprecated_args(IGNORE_MASK_DATE, IGNORE_MASK_INSTRUCTIONS, 'ignore_mask') -def streaming_recall_at_k(predictions, labels, k, ignore_mask=None, - weights=None, metrics_collections=None, - updates_collections=None, name=None): +def streaming_recall_at_k(predictions, labels, k, weights=None, + metrics_collections=None, 
updates_collections=None, + name=None): """Computes the recall@k of the predictions with respect to dense labels. The `streaming_recall_at_k` function creates two local variables, `total` and @@ -1255,15 +1207,12 @@ def streaming_recall_at_k(predictions, labels, k, ignore_mask=None, increments `count` with the reduced sum of `weights`. If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. - Alternatively, if `ignore_mask` is not `None`, then mask values where - `ignore_mask` is `True`. Args: predictions: A floating point tensor of dimension [batch_size, num_classes] labels: A tensor of dimension [batch_size] whose type is in `int32`, `int64`. k: The number of top elements to look at for computing recall. - ignore_mask: An optional, `bool` `Tensor` whose shape matches `predictions`. weights: An optional `Tensor` whose shape is broadcastable to `predictions`. metrics_collections: An optional list of collections that `recall_at_k` should be added to. @@ -1279,26 +1228,23 @@ def streaming_recall_at_k(predictions, labels, k, ignore_mask=None, Raises: ValueError: If `predictions` and `labels` have mismatched shapes, or if - `ignore_mask` is not `None` and its shape doesn't match `predictions`, or - if `weights` is not `None` and its shape doesn't match `predictions`, or - if either `metrics_collections` or `updates_collections` are not a list or + `weights` is not `None` and its shape doesn't match `predictions`, or if + either `metrics_collections` or `updates_collections` are not a list or tuple. """ in_top_k = math_ops.to_float(nn.in_top_k(predictions, labels, k)) return streaming_mean(in_top_k, - _mask_weights(ignore_mask, weights), + weights, metrics_collections, updates_collections, name or _at_k_name('recall', k)) # TODO(ptucker): Validate range of values in labels? 
-@deprecated_args(IGNORE_MASK_DATE, IGNORE_MASK_INSTRUCTIONS, 'ignore_mask') def streaming_sparse_recall_at_k(predictions, labels, k, class_id=None, - ignore_mask=None, weights=None, metrics_collections=None, updates_collections=None, @@ -1328,8 +1274,6 @@ def streaming_sparse_recall_at_k(predictions, `false_negative_at_` using these values. If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. - Alternatively, if `ignore_mask` is not `None`, then mask values where - `ignore_mask` is `True`. Args: predictions: Float `Tensor` with shape [D1, ... DN, num_classes] where @@ -1347,8 +1291,6 @@ def streaming_sparse_recall_at_k(predictions, class_id: Integer class ID for which we want binary metrics. This should be in range [0, num_classes), where num_classes is the last dimension of `predictions`. If class_id is outside this range, the method returns NAN. - ignore_mask: An optional, `bool` `Tensor` whose shape is broadcastable to - the the first [D1, ... DN] dimensions of `predictions` and `labels`. weights: An optional `Tensor` whose shape is broadcastable to the the first [D1, ... DN] dimensions of `predictions` and `labels`. metrics_collections: An optional list of collections that values should @@ -1365,16 +1307,14 @@ def streaming_sparse_recall_at_k(predictions, `recall`. Raises: - ValueError: If `ignore_mask` is not `None` and its shape doesn't match - `predictions`, or if `weights` is not `None` and its shape doesn't match - `predictions`, or if either `metrics_collections` or `updates_collections` - are not a list or tuple. + ValueError: If `weights` is not `None` and its shape doesn't match + `predictions`, or if either `metrics_collections` or `updates_collections` + are not a list or tuple. 
""" default_name = _at_k_name('recall', k, class_id=class_id) with ops.name_scope(name, default_name, (predictions, labels)) as scope: _, top_k_idx = nn.top_k(predictions, k) top_k_idx = math_ops.to_int64(top_k_idx) - weights = _mask_weights(ignore_mask, weights) tp, tp_update = _streaming_sparse_true_positive_at_k( predictions_idx=top_k_idx, labels=labels, k=k, class_id=class_id, weights=weights) @@ -1396,7 +1336,6 @@ def _streaming_sparse_precision_at_k(top_k_idx, labels, k=None, class_id=None, - ignore_mask=None, weights=None, metrics_collections=None, updates_collections=None, @@ -1423,8 +1362,6 @@ def _streaming_sparse_precision_at_k(top_k_idx, in range [0, num_classes), where num_classes is the last dimension of `predictions`. If `class_id` is outside this range, the method returns NAN. - ignore_mask: An optional, `bool` `Tensor` whose shape is broadcastable to - the the first [D1, ... DN] dimensions of `predictions` and `labels`. weights: An optional `Tensor` whose shape is broadcastable to the the first [D1, ... DN] dimensions of `predictions` and `labels`. metrics_collections: An optional list of collections that values should @@ -1441,13 +1378,11 @@ def _streaming_sparse_precision_at_k(top_k_idx, `precision`. Raises: - ValueError: If `ignore_mask` is not `None` and its shape doesn't match - `predictions`, or if `weights` is not `None` and its shape doesn't match + ValueError: If `weights` is not `None` and its shape doesn't match `predictions`, or if either `metrics_collections` or `updates_collections` are not a list or tuple. """ top_k_idx = math_ops.to_int64(top_k_idx) - weights = _mask_weights(ignore_mask, weights) tp, tp_update = _streaming_sparse_true_positive_at_k( predictions_idx=top_k_idx, labels=labels, k=k, class_id=class_id, weights=weights) @@ -1466,12 +1401,10 @@ def _streaming_sparse_precision_at_k(top_k_idx, # TODO(ptucker): Validate range of values in labels? 
-@deprecated_args(IGNORE_MASK_DATE, IGNORE_MASK_INSTRUCTIONS, 'ignore_mask') def streaming_sparse_precision_at_k(predictions, labels, k, class_id=None, - ignore_mask=None, weights=None, metrics_collections=None, updates_collections=None, @@ -1502,8 +1435,6 @@ def streaming_sparse_precision_at_k(predictions, `false_positive_at_` using these values. If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. - Alternatively, if `ignore_mask` is not `None`, then mask values where - `ignore_mask` is `True`. Args: predictions: Float `Tensor` with shape [D1, ... DN, num_classes] where @@ -1522,8 +1453,6 @@ def streaming_sparse_precision_at_k(predictions, in range [0, num_classes], where num_classes is the last dimension of `predictions`. If `class_id` is outside this range, the method returns NAN. - ignore_mask: An optional, `bool` `Tensor` whose shape is broadcastable to - the the first [D1, ... DN] dimensions of `predictions` and `labels`. weights: An optional `Tensor` whose shape is broadcastable to the the first [D1, ... DN] dimensions of `predictions` and `labels`. metrics_collections: An optional list of collections that values should @@ -1540,21 +1469,19 @@ def streaming_sparse_precision_at_k(predictions, `precision`. Raises: - ValueError: If `ignore_mask` is not `None` and its shape doesn't match - `predictions`, or if `weights` is not `None` and its shape doesn't match + ValueError: If `weights` is not `None` and its shape doesn't match `predictions`, or if either `metrics_collections` or `updates_collections` are not a list or tuple. 
""" default_name = _at_k_name('precision', k, class_id=class_id) with ops.name_scope(name, default_name, - (predictions, labels, ignore_mask, weights)) as scope: + (predictions, labels, weights)) as scope: _, top_k_idx = nn.top_k(predictions, k) return _streaming_sparse_precision_at_k( top_k_idx=top_k_idx, labels=labels, k=k, class_id=class_id, - ignore_mask=ignore_mask, weights=weights, metrics_collections=metrics_collections, updates_collections=updates_collections, @@ -1562,11 +1489,9 @@ def streaming_sparse_precision_at_k(predictions, # TODO(ptucker): Validate range of values in labels? -@deprecated_args(IGNORE_MASK_DATE, IGNORE_MASK_INSTRUCTIONS, 'ignore_mask') def streaming_sparse_precision_at_top_k(top_k_predictions, labels, class_id=None, - ignore_mask=None, weights=None, metrics_collections=None, updates_collections=None, @@ -1595,8 +1520,6 @@ def streaming_sparse_precision_at_top_k(top_k_predictions, `false_positive_at_k` using these values. If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. - Alternatively, if `ignore_mask` is not `None`, then mask values where - `ignore_mask` is `True`. Args: top_k_predictions: Integer `Tensor` with shape [D1, ... DN, k] where @@ -1614,8 +1537,6 @@ def streaming_sparse_precision_at_top_k(top_k_predictions, in range [0, num_classes), where num_classes is the last dimension of `predictions`. If `class_id` is outside this range, the method returns NAN. - ignore_mask: An optional, `bool` `Tensor` whose shape is broadcastable to - the the first [D1, ... DN] dimensions of `predictions` and `labels`. weights: An optional `Tensor` whose shape is broadcastable to the the first [D1, ... DN] dimensions of `predictions` and `labels`. metrics_collections: An optional list of collections that values should @@ -1632,8 +1553,7 @@ def streaming_sparse_precision_at_top_k(top_k_predictions, `precision`. 
Raises: - ValueError: If `ignore_mask` is not `None` and its shape doesn't match - `predictions`, or if `weights` is not `None` and its shape doesn't match + ValueError: If `weights` is not `None` and its shape doesn't match `predictions`, or if either `metrics_collections` or `updates_collections` are not a list or tuple. ValueError: If `top_k_predictions` has rank < 2. @@ -1641,7 +1561,7 @@ def streaming_sparse_precision_at_top_k(top_k_predictions, default_name = _at_k_name('precision', class_id=class_id) with ops.name_scope( name, default_name, - (top_k_predictions, labels, ignore_mask, weights)) as scope: + (top_k_predictions, labels, weights)) as scope: rank = array_ops.rank(top_k_predictions) check_rank_op = control_flow_ops.Assert( math_ops.greater_equal(rank, 2), @@ -1651,7 +1571,6 @@ def streaming_sparse_precision_at_top_k(top_k_predictions, top_k_idx=top_k_predictions, labels=labels, class_id=class_id, - ignore_mask=ignore_mask, weights=weights, metrics_collections=metrics_collections, updates_collections=updates_collections, @@ -2760,8 +2679,7 @@ def streaming_mean_cosine_distance(predictions, labels, dim, weights=None, return mean_distance, update_op -@deprecated_args(IGNORE_MASK_DATE, IGNORE_MASK_INSTRUCTIONS, 'ignore_mask') -def streaming_percentage_less(values, threshold, ignore_mask=None, weights=None, +def streaming_percentage_less(values, threshold, weights=None, metrics_collections=None, updates_collections=None, name=None): @@ -2778,13 +2696,10 @@ def streaming_percentage_less(values, threshold, ignore_mask=None, weights=None, `percentage`. If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. - Alternatively, if `ignore_mask` is not `None`, then mask values where - `ignore_mask` is `True`. Args: values: A numeric `Tensor` of arbitrary size. threshold: A scalar threshold. - ignore_mask: An optional, `bool` `Tensor` whose shape matches `values`. weights: An optional `Tensor` whose shape is broadcastable to `values`. 
metrics_collections: An optional list of collections that the metric value variable should be added to. @@ -2799,23 +2714,21 @@ def streaming_percentage_less(values, threshold, ignore_mask=None, weights=None, appropriately. Raises: - ValueError: If `ignore_mask` is not `None` and its shape doesn't match - `values`, or if `weights` is not `None` and its shape doesn't match - `values`, or if either `metrics_collections` or `updates_collections` are - not a list or tuple. + ValueError: If `weights` is not `None` and its shape doesn't match `values`, + or if either `metrics_collections` or `updates_collections` are not a list + or tuple. """ is_below_threshold = math_ops.to_float(math_ops.less(values, threshold)) - return streaming_mean(is_below_threshold, _mask_weights(ignore_mask, weights), + return streaming_mean(is_below_threshold, + weights, metrics_collections, updates_collections, name or 'percentage_below_threshold') -@deprecated_args(IGNORE_MASK_DATE, IGNORE_MASK_INSTRUCTIONS, 'ignore_mask') def streaming_mean_iou(predictions, labels, num_classes, - ignore_mask=None, weights=None, metrics_collections=None, updates_collections=None, @@ -2834,8 +2747,6 @@ def streaming_mean_iou(predictions, `update_op` operation that updates these variables and returns the `mean_iou`. If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. - Alternatively, if `ignore_mask` is not `None`, then mask values where - `ignore_mask` is `True`. Args: predictions: A tensor of prediction results for semantic labels, whose @@ -2846,7 +2757,6 @@ def streaming_mean_iou(predictions, num_classes: The possible number of labels the prediction task can have. This value must be provided, since a confusion matrix of dimension = [num_classes, num_classes] will be allocated. - ignore_mask: An optional, `bool` `Tensor` whose shape matches `predictions`. weights: An optional `Tensor` whose shape is broadcastable to `predictions`. 
metrics_collections: An optional list of collections that `mean_iou` should be added to. @@ -2860,9 +2770,8 @@ def streaming_mean_iou(predictions, Raises: ValueError: If `predictions` and `labels` have mismatched shapes, or if - `ignore_mask` is not `None` and its shape doesn't match `predictions`, or - if `weights` is not `None` and its shape doesn't match `predictions`, or - if either `metrics_collections` or `updates_collections` are not a list or + `weights` is not `None` and its shape doesn't match `predictions`, or if + either `metrics_collections` or `updates_collections` are not a list or tuple. """ with variable_scope.variable_scope(name, 'mean_iou', [predictions, labels]): @@ -2888,7 +2797,6 @@ def streaming_mean_iou(predictions, if labels_rank > 1: labels = array_ops.reshape(labels, [-1]) - weights = _mask_weights(ignore_mask, weights) if weights is not None: weights_rank = weights.get_shape().ndims if weights_rank > 1: diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py index c64ce86f2fe..9e56453d227 100644 --- a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py +++ b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py @@ -671,18 +671,6 @@ class StreamingPrecisionTest(tf.test.TestCase): self.assertAlmostEqual(0.5, update_op.eval()) self.assertAlmostEqual(0.5, precision.eval()) - def testMasked(self): - predictions = tf.constant([1, 0, 1, 0, 1], shape=(1, 5)) - labels = tf.constant([0, 1, 1, 0, 1], shape=(1, 5)) - mask = tf.constant([False, False, False, False, True], shape=(1, 5)) - precision, update_op = metrics.streaming_precision( - predictions, labels, ignore_mask=mask) - - with self.test_session() as sess: - sess.run(tf.initialize_local_variables()) - self.assertAlmostEqual(0.5, update_op.eval()) - self.assertAlmostEqual(0.5, precision.eval()) - def testWeighted1d(self): predictions = tf.constant([[1, 0, 1, 0], [1, 0, 1, 0]]) labels = tf.constant([[0, 1, 1, 0], 
[1, 0, 0, 1]]) @@ -838,18 +826,6 @@ class StreamingRecallTest(tf.test.TestCase): self.assertAlmostEqual(0.5, update_op.eval()) self.assertAlmostEqual(0.5, recall.eval()) - def testMasked(self): - predictions = tf.constant([1, 0, 1, 0, 1], shape=(1, 5)) - labels = tf.constant([0, 1, 1, 0, 1], shape=(1, 5)) - mask = tf.constant([False, False, False, False, True], shape=(1, 5)) - recall, update_op = metrics.streaming_recall( - predictions, labels, ignore_mask=mask) - - with self.test_session() as sess: - sess.run(tf.initialize_local_variables()) - self.assertAlmostEqual(0.5, update_op.eval()) - self.assertAlmostEqual(0.5, recall.eval()) - def testWeighted1d(self): predictions = tf.constant([[1, 0, 1, 0], [0, 1, 0, 1]]) labels = tf.constant([[0, 1, 1, 0], [1, 0, 0, 1]]) @@ -1737,15 +1713,13 @@ class StreamingRecallAtKTest(tf.test.TestCase): dtype=tf.float32) labels = tf.constant( self._np_labels, shape=(self._batch_size,), dtype=tf.int64) - weights = tf.constant([0, 1, 1, 1], shape=(self._batch_size,), + weights = tf.constant([0, 1, 0, 1], shape=(self._batch_size,), dtype=tf.float32) - mask = tf.constant([False, False, True, False], shape=(self._batch_size,), - dtype=tf.bool) recall, update_op = metrics.streaming_recall_at_k( - predictions, labels, k=2, ignore_mask=mask, weights=weights) + predictions, labels, k=2, weights=weights) sp_recall, sp_update_op = metrics.streaming_sparse_recall_at_k( predictions, tf.reshape(labels, (self._batch_size, 1)), k=2, - ignore_mask=mask, weights=weights) + weights=weights) with self.test_session() as sess: sess.run(tf.initialize_local_variables()) @@ -1763,16 +1737,13 @@ class StreamingSparsePrecisionTest(tf.test.TestCase): k, expected, class_id=None, - ignore_mask=None, weights=None): with tf.Graph().as_default() as g, self.test_session(g): - if ignore_mask is not None: - ignore_mask = tf.constant(ignore_mask, tf.bool) if weights is not None: weights = tf.constant(weights, tf.float32) metric, update = 
metrics.streaming_sparse_precision_at_k( predictions=tf.constant(predictions, tf.float32), labels=labels, - k=k, class_id=class_id, ignore_mask=ignore_mask, weights=weights) + k=k, class_id=class_id, weights=weights) # Fails without initialized vars. self.assertRaises(tf.OpError, metric.eval) @@ -1792,17 +1763,13 @@ class StreamingSparsePrecisionTest(tf.test.TestCase): labels, expected, class_id=None, - ignore_mask=None, weights=None): with tf.Graph().as_default() as g, self.test_session(g): - if ignore_mask is not None: - ignore_mask = tf.constant(ignore_mask, tf.bool) if weights is not None: weights = tf.constant(weights, tf.float32) metric, update = metrics.streaming_sparse_precision_at_top_k( top_k_predictions=tf.constant(top_k_predictions, tf.int32), - labels=labels, class_id=class_id, ignore_mask=ignore_mask, - weights=weights) + labels=labels, class_id=class_id, weights=weights) # Fails without initialized vars. self.assertRaises(tf.OpError, metric.eval) @@ -1821,11 +1788,8 @@ class StreamingSparsePrecisionTest(tf.test.TestCase): predictions, labels, k, - expected, - ignore_mask=None): + expected): with tf.Graph().as_default() as g, self.test_session(g): - if ignore_mask is not None: - ignore_mask = tf.constant(ignore_mask, tf.bool) predictions = tf.constant(predictions, tf.float32) metric = metric_ops.sparse_average_precision_at_k( predictions, labels, k) @@ -2305,11 +2269,9 @@ class StreamingSparsePrecisionTest(tf.test.TestCase): top_k_predictions, labels, expected=NAN, class_id=class_id, weights=[[0, 0], [0, 0]]) self._test_streaming_sparse_precision_at_k( - predictions, labels, k=5, expected=NAN, ignore_mask=[[False], [True]], - weights=[[0], [1]]) + predictions, labels, k=5, expected=NAN, weights=[[0], [0]]) self._test_streaming_sparse_precision_at_top_k( - top_k_predictions, labels, expected=NAN, - ignore_mask=[[False], [True]], weights=[[0], [1]]) + top_k_predictions, labels, expected=NAN, weights=[[0], [0]]) 
self._test_streaming_sparse_precision_at_k( predictions, labels, k=5, expected=NAN, weights=[[0, 0], [0, 0]]) self._test_streaming_sparse_precision_at_top_k( @@ -2342,34 +2304,34 @@ class StreamingSparsePrecisionTest(tf.test.TestCase): # Class 2: 2 predictions, both correct. self._test_streaming_sparse_precision_at_k( predictions, labels, k=5, expected=2.0 / 2.0, class_id=2, - ignore_mask=[[False], [False]], weights=[[1], [0]]) + weights=[[1], [0]]) self._test_streaming_sparse_precision_at_top_k( top_k_predictions, labels, expected=2.0 / 2.0, class_id=2, - ignore_mask=[[False], [False]], weights=[[1], [0]]) + weights=[[1], [0]]) # Class 2: 2 predictions, both correct. self._test_streaming_sparse_precision_at_k( predictions, labels, k=5, expected=2.0 / 2.0, class_id=2, - ignore_mask=[[False], [False]], weights=[[0], [1]]) + weights=[[0], [1]]) self._test_streaming_sparse_precision_at_top_k( top_k_predictions, labels, expected=2.0 / 2.0, class_id=2, - ignore_mask=[[False], [False]], weights=[[0], [1]]) + weights=[[0], [1]]) # Class 7: 1 incorrect prediction. self._test_streaming_sparse_precision_at_k( predictions, labels, k=5, expected=0.0 / 1.0, class_id=7, - ignore_mask=[[False], [True]], weights=[[1], [1]]) + weights=[[1], [0]]) self._test_streaming_sparse_precision_at_top_k( top_k_predictions, labels, expected=0.0 / 1.0, class_id=7, - ignore_mask=[[False], [True]], weights=[[1], [1]]) + weights=[[1], [0]]) # Class 7: 1 correct prediction. self._test_streaming_sparse_precision_at_k( predictions, labels, k=5, expected=1.0 / 1.0, class_id=7, - ignore_mask=[[True], [False]], weights=[[1], [1]]) + weights=[[0], [1]]) self._test_streaming_sparse_precision_at_top_k( top_k_predictions, labels, expected=1.0 / 1.0, class_id=7, - ignore_mask=[[True], [False]], weights=[[1], [1]]) + weights=[[0], [1]]) # Class 7: no predictions. 
self._test_streaming_sparse_precision_at_k( @@ -2409,17 +2371,13 @@ class StreamingSparseRecallTest(tf.test.TestCase): k, expected, class_id=None, - ignore_mask=None, weights=None): with tf.Graph().as_default() as g, self.test_session(g): - if ignore_mask is not None: - ignore_mask = tf.constant(ignore_mask, tf.bool) if weights is not None: weights = tf.constant(weights, tf.float32) metric, update = metrics.streaming_sparse_recall_at_k( predictions=tf.constant(predictions, tf.float32), - labels=labels, k=k, class_id=class_id, ignore_mask=ignore_mask, - weights=weights) + labels=labels, k=k, class_id=class_id, weights=weights) # Fails without initialized vars. self.assertRaises(tf.OpError, metric.eval) @@ -2740,8 +2698,7 @@ class StreamingSparseRecallTest(tf.test.TestCase): predictions, labels, k=5, expected=NAN, class_id=class_id, weights=[[0, 0], [0, 0]]) self._test_streaming_sparse_recall_at_k( - predictions, labels, k=5, expected=NAN, ignore_mask=[[False], [True]], - weights=[[0], [1]]) + predictions, labels, k=5, expected=NAN, weights=[[0], [0]]) self._test_streaming_sparse_recall_at_k( predictions, labels, k=5, expected=NAN, weights=[[0, 0], [0, 0]]) @@ -2764,22 +2721,22 @@ class StreamingSparseRecallTest(tf.test.TestCase): # Class 2: 2 labels, both correct. self._test_streaming_sparse_recall_at_k( predictions, labels, k=5, expected=2.0 / 2.0, class_id=2, - ignore_mask=[[False], [False]], weights=[[1], [0]]) + weights=[[1], [0]]) # Class 2: 2 labels, both correct. self._test_streaming_sparse_recall_at_k( predictions, labels, k=5, expected=2.0 / 2.0, class_id=2, - ignore_mask=[[False], [False]], weights=[[0], [1]]) + weights=[[0], [1]]) # Class 7: 1 label, correct. self._test_streaming_sparse_recall_at_k( predictions, labels, k=5, expected=1.0 / 1.0, class_id=7, - ignore_mask=[[True], [False]], weights=[[1], [1]]) + weights=[[0], [1]]) # Class 7: 1 label, incorrect. 
self._test_streaming_sparse_recall_at_k( predictions, labels, k=5, expected=0.0 / 1.0, class_id=7, - ignore_mask=[[False], [True]], weights=[[1], [1]]) + weights=[[1], [0]]) # Class 7: 2 labels, 1 correct. self._test_streaming_sparse_recall_at_k( @@ -3660,16 +3617,14 @@ class PcntBelowThreshTest(tf.test.TestCase): def testSomePresentOneUpdate(self): with self.test_session() as sess: values = tf.constant([2, 4, 6, 8], shape=(1, 4), dtype=tf.float32) - mask = tf.constant([False, True, False, False], shape=(1, 4), - dtype=tf.bool) - weights = tf.constant([1, 1, 0, 1], shape=(1, 4), dtype=tf.float32) + weights = tf.constant([1, 0, 0, 1], shape=(1, 4), dtype=tf.float32) pcnt0, update_op0 = metrics.streaming_percentage_less( - values, 100, ignore_mask=mask, weights=weights, name='high') + values, 100, weights=weights, name='high') pcnt1, update_op1 = metrics.streaming_percentage_less( - values, 7, ignore_mask=mask, weights=weights, name='medium') + values, 7, weights=weights, name='medium') pcnt2, update_op2 = metrics.streaming_percentage_less( - values, 1, ignore_mask=mask, weights=weights, name='low') + values, 1, weights=weights, name='low') sess.run(tf.initialize_local_variables()) self.assertListEqual([1.0, 0.5, 0.0], @@ -3712,22 +3667,6 @@ class StreamingMeanIOUTest(tf.test.TestCase): metrics.streaming_mean_iou( predictions, labels, num_classes=2) - def testLabelsAndIgnoreMaskOfDifferentSizeRaisesValueError(self): - predictions = tf.ones([10]) - labels = tf.ones([10]) - ignore_mask = tf.cast(tf.ones([9]), tf.bool) - with self.assertRaises(ValueError): - metrics.streaming_mean_iou( - predictions, labels, num_classes=2, ignore_mask=ignore_mask) - - def testIgnoreMaskIsNotBooleanRaisesTypeError(self): - predictions = tf.ones([10]) - labels = tf.ones([10]) - ignore_mask = tf.ones([10]) - with self.assertRaises(TypeError): - metrics.streaming_mean_iou( - predictions, labels, num_classes=2, ignore_mask=ignore_mask) - def 
testLabelsAndWeightsOfDifferentSizeRaisesValueError(self): predictions = tf.ones([10]) labels = tf.ones([10]) @@ -3810,29 +3749,18 @@ class StreamingMeanIOUTest(tf.test.TestCase): _enqueue_vector(sess, labels_queue, [1]) labels = labels_queue.dequeue() - # Create the queue that populates the ignore_masks. - ignore_masks_queue = tf.FIFOQueue(6, dtypes=tf.bool, shapes=(1, 1)) - _enqueue_vector(sess, ignore_masks_queue, [False]) - _enqueue_vector(sess, ignore_masks_queue, [False]) - _enqueue_vector(sess, ignore_masks_queue, [False]) - _enqueue_vector(sess, ignore_masks_queue, [True]) - _enqueue_vector(sess, ignore_masks_queue, [False]) - _enqueue_vector(sess, ignore_masks_queue, [False]) - ignore_mask = ignore_masks_queue.dequeue() - # Create the queue that populates the weights. weights_queue = tf.FIFOQueue(6, dtypes=tf.float32, shapes=(1, 1)) _enqueue_vector(sess, weights_queue, [1.0]) _enqueue_vector(sess, weights_queue, [1.0]) _enqueue_vector(sess, weights_queue, [1.0]) - _enqueue_vector(sess, weights_queue, [1.0]) + _enqueue_vector(sess, weights_queue, [0.0]) _enqueue_vector(sess, weights_queue, [1.0]) _enqueue_vector(sess, weights_queue, [0.0]) weights = weights_queue.dequeue() miou, update_op = metrics.streaming_mean_iou( - predictions, labels, num_classes, ignore_mask=ignore_mask, - weights=weights) + predictions, labels, num_classes, weights=weights) sess.run(tf.initialize_local_variables()) for _ in range(6): @@ -3920,13 +3848,12 @@ class StreamingMeanIOUTest(tf.test.TestCase): labels = tf.concat(0, [tf.constant(0, shape=[3]), tf.constant(1, shape=[7])]) num_classes = 2 - mask = tf.concat(0, [tf.constant(False, shape=[9]), - tf.constant(True, shape=[1])]) weights = tf.concat(0, [tf.constant(0, shape=[1]), - tf.constant(1, shape=[9])]) + tf.constant(1, shape=[8]), + tf.constant(0, shape=[1])]) with self.test_session() as sess: miou, update_op = metrics.streaming_mean_iou( - predictions, labels, num_classes, ignore_mask=mask, weights=weights) + predictions, 
labels, num_classes, weights=weights) sess.run(tf.initialize_local_variables()) self.assertAllEqual([[2, 2], [0, 4]], update_op.eval()) desired_miou = np.mean([2./4., 4./6.]) diff --git a/tensorflow/contrib/opt/python/training/external_optimizer.py b/tensorflow/contrib/opt/python/training/external_optimizer.py index 7629662b079..de539a46e26 100644 --- a/tensorflow/contrib/opt/python/training/external_optimizer.py +++ b/tensorflow/contrib/opt/python/training/external_optimizer.py @@ -100,7 +100,7 @@ class ExternalOptimizerInterface(object): accumulated_dims[1:])] def minimize(self, session=None, feed_dict=None, fetches=None, - step_callback=None, loss_callback=None, grad_callback=None): + step_callback=None, loss_callback=None): """Minimize a scalar `Tensor`. Variables subject to optimization are updated in-place at the end of @@ -113,14 +113,13 @@ class ExternalOptimizerInterface(object): Args: session: A `Session` instance. feed_dict: A feed dict to be passed to calls to `session.run`. - fetches: A list of `Tensor`s to fetch and supply to `loss_callback` and - `grad_callback` as positional arguments. + fetches: A list of `Tensor`s to fetch and supply to `loss_callback` + as positional arguments. step_callback: A function to be called at each optimization step; arguments are the current values of all optimization variables flattened into a single vector. loss_callback: A function to be called every time the loss and gradients are computed, with evaluated fetches supplied as positional arguments. - grad_callback: Deprecated. """ session = session or ops.get_default_session() feed_dict = feed_dict or {} @@ -128,9 +127,6 @@ class ExternalOptimizerInterface(object): loss_callback = loss_callback or (lambda *fetches: None) step_callback = step_callback or (lambda xk: None) - # TODO(chapelle): Remove grad_callback (b/30590858) - if grad_callback: - logging.warn('grad_callback is deprecated. Please use loss_callback.') # Construct loss function and associated gradient. 
loss_grad_func = self._make_eval_func( diff --git a/tensorflow/contrib/opt/python/training/moving_average_optimizer.py b/tensorflow/contrib/opt/python/training/moving_average_optimizer.py index 86a828394ea..d6df49d8525 100644 --- a/tensorflow/contrib/opt/python/training/moving_average_optimizer.py +++ b/tensorflow/contrib/opt/python/training/moving_average_optimizer.py @@ -62,7 +62,8 @@ from tensorflow.python.training import saver class MovingAverageOptimizer(optimizer.Optimizer): """Optimizer wrapper that maintains a moving average of parameters.""" - def __init__(self, opt, average_decay=0.9999, sequential_update=True): + def __init__(self, opt, average_decay=0.9999, num_updates=None, + sequential_update=True): """Construct a new MovingAverageOptimizer. Args: @@ -70,6 +71,8 @@ class MovingAverageOptimizer(optimizer.Optimizer): average_decay: Float. Decay to use to maintain the moving averages of trained variables. See tf.train.ExponentialMovingAverage for details. + num_updates: Optional count of number of updates applied to variables. + See tf.train.ExponentialMovingAverage for details. sequential_update: Bool. If False, will compute the moving average at the same time as the model is updated, potentially doing benign data races. @@ -77,7 +80,8 @@ class MovingAverageOptimizer(optimizer.Optimizer): updates. 
""" self._optimizer = opt - self._ema = moving_averages.ExponentialMovingAverage(average_decay) + self._ema = moving_averages.ExponentialMovingAverage( + average_decay, num_updates=num_updates) self._variable_map = None self._sequential_update = sequential_update diff --git a/tensorflow/contrib/rnn/BUILD b/tensorflow/contrib/rnn/BUILD index 00123379f6f..fdac3e9e497 100644 --- a/tensorflow/contrib/rnn/BUILD +++ b/tensorflow/contrib/rnn/BUILD @@ -181,6 +181,24 @@ tf_gen_op_libs( op_lib_names = ["lstm_ops"], ) +tf_kernel_library( + name = "gru_ops_kernels", + srcs = [ + "kernels/blas_gemm.cc", + "kernels/blas_gemm.h", + ], + gpu_srcs = [ + "kernels/blas_gemm.h", + ], + prefix = "kernels/gru_ops", + deps = [ + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core/kernels:eigen_helpers", + "//third_party/eigen3", + ], +) + tf_kernel_library( name = "lstm_ops_kernels", srcs = [ diff --git a/tensorflow/contrib/rnn/kernels/blas_gemm.cc b/tensorflow/contrib/rnn/kernels/blas_gemm.cc index 637b872dadc..e62501e9b10 100644 --- a/tensorflow/contrib/rnn/kernels/blas_gemm.cc +++ b/tensorflow/contrib/rnn/kernels/blas_gemm.cc @@ -37,7 +37,6 @@ perftools::gputools::DeviceMemory AsDeviceMemory(const T* cuda_memory) { namespace functor { template void TensorCuBlasGemm::operator()(OpKernelContext* ctx, - perftools::gputools::Stream* stream, bool transa, bool transb, uint64 m, uint64 n, uint64 k, T alpha, const T* a, int lda, const T* b, int ldb, T beta, T* c, @@ -52,7 +51,8 @@ void TensorCuBlasGemm::operator()(OpKernelContext* ctx, auto c_ptr = AsDeviceMemory(c); bool blas_launch_status = - stream + ctx->op_device_context() + ->stream() ->ThenBlasGemm(trans[transa], trans[transb], m, n, k, alpha, a_ptr, lda, b_ptr, ldb, beta, &c_ptr, ldc) .ok(); diff --git a/tensorflow/contrib/rnn/kernels/blas_gemm.h b/tensorflow/contrib/rnn/kernels/blas_gemm.h index 9c34b8ae715..e33eceadff1 100644 --- a/tensorflow/contrib/rnn/kernels/blas_gemm.h +++ 
b/tensorflow/contrib/rnn/kernels/blas_gemm.h @@ -21,22 +21,15 @@ limitations under the License. #include "tensorflow/core/kernels/eigen_activations.h" #include "tensorflow/core/platform/types.h" -namespace perftools { -namespace gputools { -class Stream; -} // end namespace gputools -} // end namespace perftools - namespace tensorflow { class OpKernelContext; namespace functor { template struct TensorCuBlasGemm { - void operator()(OpKernelContext* ctx, perftools::gputools::Stream* stream, - bool transa, bool transb, uint64 m, uint64 n, uint64 k, - T alpha, const T* a, int lda, const T* b, int ldb, T beta, - T* c, int ldc); + void operator()(OpKernelContext* ctx, bool transa, bool transb, uint64 m, + uint64 n, uint64 k, T alpha, const T* a, int lda, const T* b, + int ldb, T beta, T* c, int ldc); }; template @@ -44,16 +37,15 @@ struct TensorBlasGemm; template struct TensorBlasGemm { - static void compute(OpKernelContext* ctx, perftools::gputools::Stream* stream, - const Device& d, bool transa, bool transb, T alpha, - typename TTypes::ConstMatrix a, + static void compute(OpKernelContext* ctx, const Device& d, bool transa, + bool transb, T alpha, typename TTypes::ConstMatrix a, typename TTypes::ConstMatrix b, T beta, typename TTypes::Matrix c) { int64 m = c.dimensions()[0]; int64 n = c.dimensions()[1]; int64 k = transa ? a.dimensions()[0] : a.dimensions()[1]; - TensorCuBlasGemm()(ctx, stream, transb, transa, n, m, k, alpha, b.data(), + TensorCuBlasGemm()(ctx, transb, transa, n, m, k, alpha, b.data(), transb ? k : n, a.data(), transa ? 
m : k, beta, c.data(), n); } @@ -61,9 +53,8 @@ struct TensorBlasGemm { template struct TensorBlasGemm { - static void compute(OpKernelContext* ctx, perftools::gputools::Stream* stream, - const Device& d, bool transa, bool transb, T alpha, - typename TTypes::ConstMatrix a, + static void compute(OpKernelContext* ctx, const Device& d, bool transa, + bool transb, T alpha, typename TTypes::ConstMatrix a, typename TTypes::ConstMatrix b, T beta, typename TTypes::Matrix c) { Eigen::array, 1> contract_pairs; diff --git a/tensorflow/contrib/rnn/kernels/gru_ops.cc b/tensorflow/contrib/rnn/kernels/gru_ops.cc index ae25322a40c..6173591d3db 100644 --- a/tensorflow/contrib/rnn/kernels/gru_ops.cc +++ b/tensorflow/contrib/rnn/kernels/gru_ops.cc @@ -15,10 +15,6 @@ limitations under the License. #define EIGEN_USE_THREADS -#if GOOGLE_CUDA -#include "tensorflow/core/platform/stream_executor.h" -#endif // GOOGLE_CUDA - #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/contrib/rnn/kernels/gru_ops.h" #include "tensorflow/core/framework/op_kernel.h" @@ -151,14 +147,9 @@ class GRUCellBlockOp : public OpKernel { const Device& device = ctx->eigen_device(); - perftools::gputools::Stream* stream = - std::is_same::value - ? ctx->op_device_context()->stream() - : nullptr; - functor::GRUBlockCellFprop(batch_size, input_size, cell_size)( - ctx, stream, device, x_tensor->matrix(), h_prev_tensor->matrix(), + ctx, device, x_tensor->matrix(), h_prev_tensor->matrix(), w_ru_tensor->matrix(), w_c_tensor->matrix(), b_ru_tensor->vec(), b_c_tensor->vec(), r_u_bar_tensor.matrix(), r_tensor->matrix(), u_tensor->matrix(), c_tensor->matrix(), @@ -362,14 +353,10 @@ class GRUBlockCellGradOp : public OpKernel { &d_x_component_2_h_prevr)); const Device& device = ctx->eigen_device(); - perftools::gputools::Stream* stream = - std::is_same::value - ? 
ctx->op_device_context()->stream() - : nullptr; functor::GRUBlockCellBprop(batch_size, input_size, cell_size)( - ctx, stream, device, x_tensor->matrix(), h_prev_tensor->matrix(), + ctx, device, x_tensor->matrix(), h_prev_tensor->matrix(), w_ru_tensor->matrix(), w_c_tensor->matrix(), b_ru_tensor->vec(), b_c_tensor->vec(), r_tensor->matrix(), u_tensor->matrix(), c_tensor->matrix(), d_h_tensor->matrix(), @@ -400,8 +387,8 @@ namespace functor { #define DECLARE_GPU_SPEC(T) \ template <> \ void GRUBlockCellFprop::operator()( \ - OpKernelContext* ctx, perftools::gputools::Stream* stream, \ - const GPUDevice& d, typename TTypes::ConstMatrix x, \ + OpKernelContext* ctx, const GPUDevice& d, \ + typename TTypes::ConstMatrix x, \ typename TTypes::ConstMatrix h_prev, \ typename TTypes::ConstMatrix w_ru, \ typename TTypes::ConstMatrix w_c, typename TTypes::ConstVec b_ru, \ @@ -430,9 +417,9 @@ namespace functor { #define DECLARE_GPU_SPEC(T) \ template <> \ void GRUBlockCellBprop::operator()( \ - OpKernelContext* ctx, perftools::gputools::Stream* stream, \ - const GPUDevice& d, typename TTypes::ConstMatrix x, \ - typename TTypes::ConstMatrix h, typename TTypes::ConstMatrix w_ru, \ + OpKernelContext* ctx, const GPUDevice& d, \ + typename TTypes::ConstMatrix x, typename TTypes::ConstMatrix h, \ + typename TTypes::ConstMatrix w_ru, \ typename TTypes::ConstMatrix w_c, typename TTypes::ConstVec b_ru, \ typename TTypes::ConstVec b_c, typename TTypes::ConstMatrix r, \ typename TTypes::ConstMatrix u, typename TTypes::ConstMatrix c, \ diff --git a/tensorflow/contrib/rnn/kernels/gru_ops.h b/tensorflow/contrib/rnn/kernels/gru_ops.h index e6c4ad9a032..06a56650629 100644 --- a/tensorflow/contrib/rnn/kernels/gru_ops.h +++ b/tensorflow/contrib/rnn/kernels/gru_ops.h @@ -21,12 +21,6 @@ limitations under the License. 
#include "tensorflow/core/framework/tensor_types.h" #include "tensorflow/core/platform/types.h" -namespace perftools { -namespace gputools { -class Stream; -} // end namespace gputools -} // end namespace perftools - namespace tensorflow { class OpKernelContext; @@ -77,18 +71,15 @@ struct GRUBlockCellFprop : public GRUCell { const int cell_size) : GRUCell(batch_size, input_size, cell_size) {} - void operator()(OpKernelContext* ctx, perftools::gputools::Stream* stream, - const Device& d, typename TTypes::ConstMatrix x, - typename TTypes::ConstMatrix h_prev, - typename TTypes::ConstMatrix w_ru, - typename TTypes::ConstMatrix w_c, - typename TTypes::ConstVec b_ru, - typename TTypes::ConstVec b_c, - typename TTypes::Matrix r_u_bar, - typename TTypes::Matrix r, typename TTypes::Matrix u, - typename TTypes::Matrix c, typename TTypes::Matrix h, - typename TTypes::Matrix x_h_prev, - typename TTypes::Matrix x_h_prevr) { + void operator()( + OpKernelContext* ctx, const Device& d, typename TTypes::ConstMatrix x, + typename TTypes::ConstMatrix h_prev, + typename TTypes::ConstMatrix w_ru, typename TTypes::ConstMatrix w_c, + typename TTypes::ConstVec b_ru, typename TTypes::ConstVec b_c, + typename TTypes::Matrix r_u_bar, typename TTypes::Matrix r, + typename TTypes::Matrix u, typename TTypes::Matrix c, + typename TTypes::Matrix h, typename TTypes::Matrix x_h_prev, + typename TTypes::Matrix x_h_prevr) { // Concat x_h_prev = [x, h_prev]. 
x_h_prev.slice(x_offsets(), x_extends()).device(d) = x; x_h_prev.slice(h_offsets(), h_extends()).device(d) = h_prev; @@ -96,9 +87,8 @@ struct GRUBlockCellFprop : public GRUCell { // r_u_bar = x_h_prev * w_ru + b_ru typename TTypes::ConstMatrix const_x_h_prev(x_h_prev.data(), x_h_prev.dimensions()); - TensorBlasGemm::compute(ctx, stream, d, false, false, - T(1), const_x_h_prev, w_ru, - T(0), r_u_bar); + TensorBlasGemm::compute( + ctx, d, false, false, T(1), const_x_h_prev, w_ru, T(0), r_u_bar); // Creating a bias matrix for adding by broadcasting 'b_ru' Eigen::array broadcast_shape({batch_size_, 1}); @@ -117,7 +107,7 @@ struct GRUBlockCellFprop : public GRUCell { typename TTypes::ConstMatrix const_x_h_prevr(x_h_prevr.data(), x_h_prevr.dimensions()); TensorBlasGemm::compute( - ctx, stream, d, false, false, T(1), const_x_h_prevr, w_c, T(0), c); + ctx, d, false, false, T(1), const_x_h_prevr, w_c, T(0), c); Eigen::array b_c_shape({1, b_c.dimensions()[0]}); c.device(d) += (b_c.reshape(b_c_shape).broadcast(broadcast_shape)); @@ -135,8 +125,7 @@ struct GRUBlockCellBprop : public GRUCell { : GRUCell(batch_size, input_size, cell_size) {} void operator()( - OpKernelContext* ctx, perftools::gputools::Stream* stream, - const Device& d, typename TTypes::ConstMatrix x, + OpKernelContext* ctx, const Device& d, typename TTypes::ConstMatrix x, typename TTypes::ConstMatrix h_prev, typename TTypes::ConstMatrix w_ru, typename TTypes::ConstMatrix w_c, typename TTypes::ConstVec b_ru, typename TTypes::ConstVec b_c, @@ -159,9 +148,9 @@ struct GRUBlockCellBprop : public GRUCell { // [2nd_component_of_d_x d_h_prevr] = d_c_bar X w_c^T typename TTypes::ConstMatrix const_d_c_bar(d_c_bar.data(), d_c_bar.dimensions()); - TensorBlasGemm::compute(ctx, stream, d, false, true, - T(1), const_d_c_bar, w_c, - T(0), d_x_comp2_and_h_prevr); + TensorBlasGemm::compute(ctx, d, false, true, T(1), + const_d_c_bar, w_c, T(0), + d_x_comp2_and_h_prevr); d_hr.device(d) = d_x_comp2_and_h_prevr.slice(h_offsets(), 
h_extends()); d_r_bar.device(d) = (d_hr * h_prev * r) * (r.constant(T(1)) - r); @@ -175,7 +164,7 @@ struct GRUBlockCellBprop : public GRUCell { typename TTypes::ConstMatrix const_d_r_bar_u_bar( d_r_bar_u_bar.data(), d_r_bar_u_bar.dimensions()); TensorBlasGemm::compute( - ctx, stream, d, false, true, T(1), const_d_r_bar_u_bar, w_ru, T(0), + ctx, d, false, true, T(1), const_d_r_bar_u_bar, w_ru, T(0), d_x_comp1_and_h_prev_comp1); // d_x = d_x_comp1 + d_x_comp2 diff --git a/tensorflow/contrib/rnn/kernels/lstm_ops.cc b/tensorflow/contrib/rnn/kernels/lstm_ops.cc index 7fec457a4ac..2cebcd8fb31 100644 --- a/tensorflow/contrib/rnn/kernels/lstm_ops.cc +++ b/tensorflow/contrib/rnn/kernels/lstm_ops.cc @@ -34,10 +34,6 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" -#if GOOGLE_CUDA -#include "tensorflow/core/platform/stream_executor.h" -#endif // GOOGLE_CUDA - namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; @@ -164,14 +160,10 @@ class LSTMBlockCellOp : public OpKernel { &icfo_tensor)); const Device& device = ctx->eigen_device(); - perftools::gputools::Stream* stream = - std::is_same::value - ? 
ctx->op_device_context()->stream() - : nullptr; functor::LSTMBlockCellFprop(batch_size, input_size, cell_size)( - ctx, stream, device, forget_bias_, cell_clip_, use_peephole_, + ctx, device, forget_bias_, cell_clip_, use_peephole_, x_tensor->matrix(), cs_prev_tensor->matrix(), h_prev_tensor->matrix(), w_tensor->matrix(), wci_tensor->vec(), wcf_tensor->vec(), wco_tensor->vec(), b_tensor->vec(), @@ -196,22 +188,21 @@ REGISTER_KERNEL(float); #if GOOGLE_CUDA namespace functor { -#define DECLARE_GPU_SPEC(T) \ - template <> \ - void LSTMBlockCellFprop::operator()( \ - OpKernelContext* ctx, perftools::gputools::Stream* stream, \ - const GPUDevice& d, const T forget_bias, const T cell_clip, \ - bool use_peephole, typename TTypes::ConstMatrix x, \ - typename TTypes::ConstMatrix cs_prev, \ - typename TTypes::ConstMatrix h_prev, \ - typename TTypes::ConstMatrix w, typename TTypes::ConstVec wci, \ - typename TTypes::ConstVec wcf, typename TTypes::ConstVec wco, \ - typename TTypes::ConstVec b, typename TTypes::Matrix xh, \ - typename TTypes::Matrix i, typename TTypes::Matrix cs, \ - typename TTypes::Matrix f, typename TTypes::Matrix o, \ - typename TTypes::Matrix ci, typename TTypes::Matrix co, \ - typename TTypes::Matrix icfo, typename TTypes::Matrix h); \ - \ +#define DECLARE_GPU_SPEC(T) \ + template <> \ + void LSTMBlockCellFprop::operator()( \ + OpKernelContext* ctx, const GPUDevice& d, const T forget_bias, \ + const T cell_clip, bool use_peephole, typename TTypes::ConstMatrix x, \ + typename TTypes::ConstMatrix cs_prev, \ + typename TTypes::ConstMatrix h_prev, \ + typename TTypes::ConstMatrix w, typename TTypes::ConstVec wci, \ + typename TTypes::ConstVec wcf, typename TTypes::ConstVec wco, \ + typename TTypes::ConstVec b, typename TTypes::Matrix xh, \ + typename TTypes::Matrix i, typename TTypes::Matrix cs, \ + typename TTypes::Matrix f, typename TTypes::Matrix o, \ + typename TTypes::Matrix ci, typename TTypes::Matrix co, \ + typename TTypes::Matrix icfo, typename 
TTypes::Matrix h); \ + \ extern template struct LSTMBlockCellFprop; DECLARE_GPU_SPEC(float); @@ -445,10 +436,6 @@ class LSTMBlockCellGradOp : public OpKernel { &di_tensor)); const Device& device = ctx->eigen_device(); - perftools::gputools::Stream* stream = - std::is_same::value - ? ctx->op_device_context()->stream() - : nullptr; functor::TensorZero()(device, wci_grad_tensor->flat()); functor::TensorZero()(device, wcf_grad_tensor->flat()); @@ -456,7 +443,7 @@ class LSTMBlockCellGradOp : public OpKernel { functor::LSTMBlockCellBprop(batch_size, input_size, cell_size)( - ctx, stream, device, use_peephole_, x_tensor->matrix(), + ctx, device, use_peephole_, x_tensor->matrix(), cs_prev_tensor->matrix(), h_prev_tensor->matrix(), w_tensor->matrix(), wci_tensor->vec(), wcf_tensor->vec(), wco_tensor->vec(), b_tensor->vec(), i_tensor->matrix(), @@ -486,8 +473,7 @@ namespace functor { #define DECLARE_GPU_SPEC(T) \ template <> \ void LSTMBlockCellBprop::operator()( \ - OpKernelContext* ctx, perftools::gputools::Stream* stream, \ - const GPUDevice& d, bool use_peephole, \ + OpKernelContext* ctx, const GPUDevice& d, bool use_peephole, \ typename TTypes::ConstMatrix x, \ typename TTypes::ConstMatrix cs_prev, \ typename TTypes::ConstMatrix h_prev, \ @@ -769,10 +755,6 @@ class BlockLSTMOp : public OpKernel { &icfo_tensor)); const Device& device = ctx->eigen_device(); - perftools::gputools::Stream* stream = - std::is_same::value - ? 
ctx->op_device_context()->stream() - : nullptr; const int64 seq_len_max = seq_len_max_tensor->scalar()(); SliceHelper slicer(ctx); @@ -794,7 +776,7 @@ class BlockLSTMOp : public OpKernel { functor::LSTMBlockCellFprop(batch_size, input_size, cell_size)( - ctx, stream, device, forget_bias_, cell_clip_, use_peephole_, + ctx, device, forget_bias_, cell_clip_, use_peephole_, x_tensor.matrix(), cs_prev_tensor2.matrix(), h_prev_tensor2.matrix(), w_tensor->matrix(), wci_tensor->vec(), wcf_tensor->vec(), wco_tensor->vec(), @@ -1020,10 +1002,6 @@ class BlockLSTMGradOp : public OpKernel { const Device& device = ctx->eigen_device(); - perftools::gputools::Stream* stream = - std::is_same::value - ? ctx->op_device_context()->stream() - : nullptr; functor::TensorZero()(device, cs_grad_tensor.flat()); functor::TensorZero()(device, @@ -1073,7 +1051,7 @@ class BlockLSTMGradOp : public OpKernel { Tensor x_grad_tensor = slicer.OutputSlice(x_grad, t, "x_grad"); functor::BlockLSTMBprop(batch_size, input_size, cell_size)( - ctx, stream, device, use_peephole_, x_tensor.matrix(), + ctx, device, use_peephole_, x_tensor.matrix(), cs_prev_tensor2.matrix(), h_prev_tensor2.matrix(), w_tensor->matrix(), wci_tensor->vec(), wcf_tensor->vec(), wco_tensor->vec(), b_tensor->vec(), xh_tensor.matrix(), @@ -1134,8 +1112,7 @@ namespace functor { \ template <> \ void BlockLSTMBprop::operator()( \ - OpKernelContext* ctx, perftools::gputools::Stream* stream, \ - const GPUDevice& d, bool use_peephole, \ + OpKernelContext* ctx, const GPUDevice& d, bool use_peephole, \ typename TTypes::ConstMatrix x, \ typename TTypes::ConstMatrix cs_prev, \ typename TTypes::ConstMatrix h_prev, \ diff --git a/tensorflow/contrib/rnn/kernels/lstm_ops.h b/tensorflow/contrib/rnn/kernels/lstm_ops.h index 1332b880026..d9ed9e3ab71 100644 --- a/tensorflow/contrib/rnn/kernels/lstm_ops.h +++ b/tensorflow/contrib/rnn/kernels/lstm_ops.h @@ -22,12 +22,6 @@ limitations under the License. 
#include "tensorflow/core/kernels/eigen_activations.h" #include "tensorflow/core/platform/types.h" -namespace perftools { -namespace gputools { -class Stream; -} // end namespace gputools -} // end namespace perftools - namespace tensorflow { class OpKernelContext; @@ -153,29 +147,26 @@ struct LSTMBlockCellFprop : public LSTMBlockCell { const int cell_size) : LSTMBlockCell(batch_size, input_size, cell_size) {} - void operator()(OpKernelContext* ctx, perftools::gputools::Stream* stream, - const Device& d, const T forget_bias, const T cell_clip, - bool use_peephole, typename TTypes::ConstMatrix x, - typename TTypes::ConstMatrix cs_prev, - typename TTypes::ConstMatrix h_prev, - typename TTypes::ConstMatrix w, - typename TTypes::ConstVec wci, - typename TTypes::ConstVec wcf, - typename TTypes::ConstVec wco, - typename TTypes::ConstVec b, typename TTypes::Matrix xh, - typename TTypes::Matrix i, typename TTypes::Matrix cs, - typename TTypes::Matrix f, typename TTypes::Matrix o, - typename TTypes::Matrix ci, typename TTypes::Matrix co, - typename TTypes::Matrix icfo, - typename TTypes::Matrix h) { + void operator()( + OpKernelContext* ctx, const Device& d, const T forget_bias, + const T cell_clip, bool use_peephole, typename TTypes::ConstMatrix x, + typename TTypes::ConstMatrix cs_prev, + typename TTypes::ConstMatrix h_prev, typename TTypes::ConstMatrix w, + typename TTypes::ConstVec wci, typename TTypes::ConstVec wcf, + typename TTypes::ConstVec wco, typename TTypes::ConstVec b, + typename TTypes::Matrix xh, typename TTypes::Matrix i, + typename TTypes::Matrix cs, typename TTypes::Matrix f, + typename TTypes::Matrix o, typename TTypes::Matrix ci, + typename TTypes::Matrix co, typename TTypes::Matrix icfo, + typename TTypes::Matrix h) { // Concat xh = [x, h]. 
xh.slice(xh_x_offsets(), xh_x_extents()).device(d) = x; xh.slice(xh_h_offsets(), xh_h_extents()).device(d) = h_prev; // states1 = xh * w + b typename TTypes::ConstMatrix const_xh(xh.data(), xh.dimensions()); - TensorBlasGemm::compute( - ctx, stream, d, false, false, T(1), const_xh, w, T(0), icfo); + TensorBlasGemm::compute(ctx, d, false, false, T(1), + const_xh, w, T(0), icfo); Eigen::array b_shape({1, b.dimensions()[0]}); Eigen::array broadcast_shape({batch_size_, 1}); icfo.device(d) += b.reshape(b_shape).broadcast(broadcast_shape); @@ -239,8 +230,8 @@ struct LSTMBlockCellBprop : public LSTMBlockCell { : LSTMBlockCell(batch_size, input_size, cell_size) {} void operator()( - OpKernelContext* ctx, perftools::gputools::Stream* stream, - const Device& d, bool use_peephole, typename TTypes::ConstMatrix x, + OpKernelContext* ctx, const Device& d, bool use_peephole, + typename TTypes::ConstMatrix x, typename TTypes::ConstMatrix cs_prev, typename TTypes::ConstMatrix h_prev, typename TTypes::ConstMatrix w, typename TTypes::ConstVec wci, typename TTypes::ConstVec wcf, @@ -305,8 +296,8 @@ struct BlockLSTMBprop : public LSTMBlockCell { : LSTMBlockCell(batch_size, input_size, cell_size) {} void operator()( - OpKernelContext* ctx, perftools::gputools::Stream* stream, - const Device& d, bool use_peephole, typename TTypes::ConstMatrix x, + OpKernelContext* ctx, const Device& d, bool use_peephole, + typename TTypes::ConstMatrix x, typename TTypes::ConstMatrix cs_prev, typename TTypes::ConstMatrix h_prev, typename TTypes::ConstMatrix w, typename TTypes::ConstVec wci, typename TTypes::ConstVec wcf, @@ -364,7 +355,7 @@ struct BlockLSTMBprop : public LSTMBlockCell { typename TTypes::ConstMatrix const_dicfo(dicfo.data(), dicfo.dimensions()); TensorBlasGemm::compute( - ctx, stream, d, false, true, T(1), const_dicfo, w, T(0), xh_grad); + ctx, d, false, true, T(1), const_dicfo, w, T(0), xh_grad); // xh. 
xh.slice(xh_x_offsets(), xh_x_extents()).device(d) = x; @@ -377,7 +368,7 @@ struct BlockLSTMBprop : public LSTMBlockCell { // w_grad. TensorBlasGemm::compute( - ctx, stream, d, true, false, T(1), const_xh, const_dicfo, T(1), w_grad); + ctx, d, true, false, T(1), const_xh, const_dicfo, T(1), w_grad); // b_grad. b_grad.device(d) += dicfo.sum(Eigen::array({0})); diff --git a/tensorflow/contrib/rnn/python/ops/rnn_cell.py b/tensorflow/contrib/rnn/python/ops/rnn_cell.py index 850b9547168..c1c25ba0942 100644 --- a/tensorflow/contrib/rnn/python/ops/rnn_cell.py +++ b/tensorflow/contrib/rnn/python/ops/rnn_cell.py @@ -1005,7 +1005,7 @@ _linear = rnn_cell._linear class AttentionCellWrapper(rnn_cell.RNNCell): """Basic attention cell wrapper. - Implementation based on https://arxiv.org/pdf/1601.06733.pdf. + Implementation based on https://arxiv.org/abs/1409.0473. """ def __init__(self, cell, attn_length, attn_size=None, attn_vec_size=None, diff --git a/tensorflow/contrib/slim/python/slim/data/dataset_data_provider.py b/tensorflow/contrib/slim/python/slim/data/dataset_data_provider.py index ada4e0611ec..7f1b53ae356 100644 --- a/tensorflow/contrib/slim/python/slim/data/dataset_data_provider.py +++ b/tensorflow/contrib/slim/python/slim/data/dataset_data_provider.py @@ -51,7 +51,7 @@ from tensorflow.contrib.slim.python.slim.data import parallel_reader class DatasetDataProvider(data_provider.DataProvider): def __init__(self, dataset, num_readers=1, shuffle=True, num_epochs=None, - common_queue_capacity=256, common_queue_min=128): + common_queue_capacity=256, common_queue_min=128, seed=None): """Creates a DatasetDataProvider. Args: @@ -64,6 +64,7 @@ class DatasetDataProvider(data_provider.DataProvider): common_queue_capacity: The capacity of the common queue. common_queue_min: The minimum number of elements in the common queue after a dequeue. + seed: The seed to use if shuffling. 
""" _, data = parallel_reader.parallel_read( dataset.data_sources, @@ -72,7 +73,8 @@ class DatasetDataProvider(data_provider.DataProvider): num_readers=num_readers, shuffle=shuffle, capacity=common_queue_capacity, - min_after_dequeue=common_queue_min) + min_after_dequeue=common_queue_min, + seed=seed) items = dataset.decoder.list_items() tensors = dataset.decoder.decode(data, items) diff --git a/tensorflow/contrib/slim/python/slim/data/parallel_reader.py b/tensorflow/contrib/slim/python/slim/data/parallel_reader.py index e8f6de31496..f1cbf563e3b 100644 --- a/tensorflow/contrib/slim/python/slim/data/parallel_reader.py +++ b/tensorflow/contrib/slim/python/slim/data/parallel_reader.py @@ -170,7 +170,8 @@ def parallel_read(data_sources, shuffle=True, dtypes=None, capacity=256, - min_after_dequeue=128): + min_after_dequeue=128, + seed=None): """Reads multiple records in parallel from data_sources using n readers. It uses a ParallelReader to read from multiple files in parallel using @@ -199,6 +200,7 @@ def parallel_read(data_sources, capacity: integer, capacity of the common_queue. min_after_dequeue: integer, minimum number of records in the common_queue after dequeue. Needed for a good shuffle. + seed: A seed for RandomShuffleQueue. Returns: key, value: a tuple of keys and values from the data_source. @@ -212,7 +214,8 @@ def parallel_read(data_sources, common_queue = data_flow_ops.RandomShuffleQueue( capacity=capacity, min_after_dequeue=min_after_dequeue, - dtypes=dtypes) + dtypes=dtypes, + seed=seed) else: common_queue = data_flow_ops.FIFOQueue(capacity=capacity, dtypes=dtypes) diff --git a/tensorflow/contrib/slim/python/slim/learning.py b/tensorflow/contrib/slim/python/slim/learning.py index ed3e927560e..5595e53da1f 100644 --- a/tensorflow/contrib/slim/python/slim/learning.py +++ b/tensorflow/contrib/slim/python/slim/learning.py @@ -471,7 +471,14 @@ def create_train_op( 'LossTensor is inf or nan') # Ensure the train_tensor computes grad_updates. 
- return control_flow_ops.with_dependencies([grad_updates], total_loss) + train_op = control_flow_ops.with_dependencies([grad_updates], total_loss) + + # Add the operation used for training to the 'train_op' collection + train_ops = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP) + if train_op not in train_ops: + train_ops.append(train_op) + + return train_op def _wait_for_step(sess, global_step, step): diff --git a/tensorflow/contrib/slim/python/slim/learning_test.py b/tensorflow/contrib/slim/python/slim/learning_test.py index 69cd4a9583b..8a9f5f825c7 100644 --- a/tensorflow/contrib/slim/python/slim/learning_test.py +++ b/tensorflow/contrib/slim/python/slim/learning_test.py @@ -301,6 +301,22 @@ class CreateTrainOpTest(tf.test.TestCase): self.assertAllClose(mean, [0] * 4) self.assertAllClose(variance, [1] * 4) + def testRecordTrainOpInCollection(self): + with tf.Graph().as_default(): + tf.set_random_seed(0) + tf_inputs = tf.constant(self._inputs, dtype=tf.float32) + tf_labels = tf.constant(self._labels, dtype=tf.float32) + + tf_predictions = LogisticClassifier(tf_inputs) + slim.losses.log_loss(tf_predictions, tf_labels) + total_loss = slim.losses.get_total_loss() + + optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0) + train_op = slim.learning.create_train_op(total_loss, optimizer) + + # Make sure the training op was recorded in the proper collection + self.assertTrue(train_op in tf.get_collection(tf.GraphKeys.TRAIN_OP)) + class TrainTest(tf.test.TestCase): diff --git a/tensorflow/contrib/tensor_forest/client/eval_metrics.py b/tensorflow/contrib/tensor_forest/client/eval_metrics.py index 6971e1861d1..be89b6f9593 100644 --- a/tensorflow/contrib/tensor_forest/client/eval_metrics.py +++ b/tensorflow/contrib/tensor_forest/client/eval_metrics.py @@ -23,43 +23,54 @@ from tensorflow.contrib.metrics.python.ops import metric_ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops - -def _accuracy(probabilities, targets): - 
predictions = math_ops.argmax(probabilities, 1) - # undo one-hot - labels = math_ops.argmax(targets, 1) - return metric_ops.streaming_accuracy(predictions, labels) +INFERENCE_PROB_NAME = 'inference' +INFERENCE_PRED_NAME = 'predictions' -def _r2(probabilities, targets): +def _accuracy(predictions, targets, weights=None): + return metric_ops.streaming_accuracy(predictions, targets, weights=weights) + + +def _r2(probabilities, targets, weights=None): if targets.get_shape().ndims == 1: targets = array_ops.expand_dims(targets, -1) + targets = math_ops.to_float(targets) y_mean = math_ops.reduce_mean(targets, 0) squares_total = math_ops.reduce_sum(math_ops.square(targets - y_mean), 0) squares_residuals = math_ops.reduce_sum(math_ops.square( targets - probabilities), 0) score = 1 - math_ops.reduce_sum(squares_residuals / squares_total) - return metric_ops.streaming_mean(score) + return metric_ops.streaming_mean(score, weights=weights) -def _sigmoid_entropy(probabilities, targets): +def _squeeze_and_onehot(targets, depth): + targets = array_ops.squeeze(targets, squeeze_dims=[1]) + return array_ops.one_hot(math_ops.to_int32(targets), depth) + + +def _sigmoid_entropy(probabilities, targets, weights=None): return metric_ops.streaming_mean(losses.sigmoid_cross_entropy( - probabilities, targets)) + probabilities, _squeeze_and_onehot(targets, + array_ops.shape(probabilities)[1])), + weights=weights) -def _softmax_entropy(probabilities, targets): - return metric_ops.streaming_mean(losses.softmax_cross_entropy( - probabilities, targets)) +def _softmax_entropy(probabilities, targets, weights=None): + return metric_ops.streaming_mean(losses.sparse_softmax_cross_entropy( + probabilities, math_ops.to_int32(targets)), + weights=weights) -def _predictions(probabilities, unused_targets): - return math_ops.argmax(probabilities, 1) +def _predictions(predictions, unused_targets, **unused_kwargs): + return predictions -def _log_loss(probabilities, targets): - # targets doesn't have a shape 
coming in, log_loss isn't too happy about it. - targets = array_ops.reshape(targets, array_ops.shape(probabilities)) - return metric_ops.streaming_mean(losses.log_loss(probabilities, targets)) +def _class_log_loss(probabilities, targets, weights=None): + return metric_ops.streaming_mean( + losses.log_loss(probabilities, + _squeeze_and_onehot(targets, + array_ops.shape(probabilities)[1])), + weights=weights) _EVAL_METRICS = {'sigmoid_entropy': _sigmoid_entropy, @@ -67,9 +78,21 @@ _EVAL_METRICS = {'sigmoid_entropy': _sigmoid_entropy, 'accuracy': _accuracy, 'r2': _r2, 'predictions': _predictions, - 'log_loss': _log_loss} + 'classification_log_loss': _class_log_loss} + + +_PREDICTION_KEYS = {'sigmoid_entropy': INFERENCE_PROB_NAME, + 'softmax_entropy': INFERENCE_PROB_NAME, + 'accuracy': INFERENCE_PRED_NAME, + 'r2': INFERENCE_PROB_NAME, + 'predictions': INFERENCE_PRED_NAME, + 'classification_log_loss': INFERENCE_PROB_NAME} def get_metric(metric_name): """Given a metric name, return the corresponding metric function.""" return _EVAL_METRICS[metric_name] + + +def get_prediction_key(metric_name): + return _PREDICTION_KEYS[metric_name] diff --git a/tensorflow/contrib/tensor_forest/data/data_ops.py b/tensorflow/contrib/tensor_forest/data/data_ops.py index c408b93d710..1dfcaf5c7a4 100644 --- a/tensorflow/contrib/tensor_forest/data/data_ops.py +++ b/tensorflow/contrib/tensor_forest/data/data_ops.py @@ -17,10 +17,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import math import threading -from tensorflow.contrib.learn.python.learn.learn_io import graph_io from tensorflow.contrib.tensor_forest.python import constants from tensorflow.python.framework import common_shapes @@ -35,8 +33,6 @@ from tensorflow.python.platform import tf_logging as logging DATA_OPS_FILE = '_data_ops.so' -EXAMPLE_WEIGHT_NAME = '__weight__' - _data_ops = None _ops_lock = threading.Lock() @@ -69,68 +65,28 @@ def Load(): def 
_ParseSparse(data): """Concat sparse tensors together. - A common use of sparse tensors is to treat strings as a sparse bit vector - with a large number of features representing the presence of all possible - values. Here we convert these strings to integer indices in a sparse bit - tensor. In order to pack each incoming feature into a single sparse tensor, - we add an offset to the converted indices to indicate that they came from - different features in the source data. - Args: data: A dict of name -> Tensor. Returns: - A single sparse tensor with float values and a 1-D input spec Tensor. + A single sparse tensor and a 1-D input spec Tensor. Raises: - NotImplementedError: Combining dense and sparse tensors is not yet + NotImplementedError: Combining dense and sparse tensors is not supported. ValueError: If data contains non-string Tensors. """ - convert_ops = Load() - - # Sparse tensor indices have 63 bits to use for information. We use the - # minimum number of these (MSBs) for the offset, and pack the rest with the - # actual data. - num_features = len(data) - offset_bits = int(math.ceil(math.log(num_features, 2))) - - # We condense data to 26 bits, see sparse_values_to_indices.cc - offset_increment = int(math.pow(2, 26 - offset_bits)) - offset = 0 - - sparse_tensors = [] - keys = None - weights = None for k in sorted(data.keys()): - if k == graph_io.KEY_FEATURE_NAME: - keys = data[k] - elif k == EXAMPLE_WEIGHT_NAME: - weights = data[k] - elif isinstance(data[k], ops.SparseTensor): - # TODO(gilberth): Support mixed string/float sparse tensors. - # We currently only support string (categorical) data if we're using - # sparse tensors. 
- if data[k].dtype != dtypes.string: - raise ValueError('Only sparse tensors of type string are supported.') - sparse_indices = data[k].indices - sparse_values = data[k].values - new_shape = array_ops.concat( - 0, [array_ops.slice(data[k].shape, [0], [1]), [offset_increment]]) + if not isinstance(data[k], ops.SparseTensor): + raise NotImplementedError( + 'Features should be either all sparse or all dense. Use a ' + 'feature engineering function to convert some of them.') - new_indices, new_values = convert_ops.sparse_values_to_indices( - sparse_indices, - sparse_values, - offset, offset_bits=offset_bits) - sparse_tensors.append(ops.SparseTensor(indices=new_indices, - values=new_values, - shape=new_shape)) - else: - # Convert dense to sparse. - raise NotImplementedError('Dense to sparse conversion not implemented.') - - return (sparse_ops.sparse_concat(1, sparse_tensors), keys, weights, - [constants.DATA_CATEGORICAL]) + data_spec = [ + constants.DATA_CATEGORICAL if data[data.keys()[0]].dtype == dtypes.string + else constants.DATA_FLOAT + ] + return sparse_ops.sparse_concat(1, data.values()), data_spec def _ParseDense(data): @@ -143,22 +99,20 @@ def _ParseDense(data): A tuple of (single dense float Tensor, keys tensor (if exists), data spec). 
""" convert_ops = Load() - data_spec = [constants.DATA_CATEGORICAL if data[k].dtype == dtypes.string else - constants.DATA_FLOAT for k in sorted(data.keys())] + data_spec = [constants.DATA_CATEGORICAL if (data[k].dtype == dtypes.string or + data[k].dtype == dtypes.int32 or + data[k].dtype == dtypes.int64) + else constants.DATA_FLOAT for k in sorted(data.keys())] data_spec = [constants.DATA_FLOAT] + data_spec - keys = None - weights = None features = [] for k in sorted(data.keys()): - if k == graph_io.KEY_FEATURE_NAME: - keys = data[k] - elif k == EXAMPLE_WEIGHT_NAME: - weights = data[k] + if data[k].dtype == dtypes.string: + features.append(convert_ops.string_to_float(data[k])) + elif data[k].dtype == dtypes.int64 or data[k].dtype == dtypes.int32: + features.append(math_ops.to_float(data[k])) else: - features.append( - convert_ops.string_to_float(data[k]) if data[k].dtype == dtypes.string - else data[k]) - return array_ops.concat(1, features), keys, weights, data_spec + features.append(data[k]) + return array_ops.concat(1, features), data_spec def ParseDataTensorOrDict(data): @@ -187,8 +141,7 @@ def ParseDataTensorOrDict(data): else: return _ParseDense(data) else: - return (data, None, None, - [constants.DATA_FLOAT] * data.get_shape().as_list()[1]) + return (data, [constants.DATA_FLOAT] * data.get_shape().as_list()[1]) def ParseLabelTensorOrDict(labels): diff --git a/tensorflow/contrib/tensor_forest/python/tensor_forest.py b/tensorflow/contrib/tensor_forest/python/tensor_forest.py index ee31c0eba41..17d469739f9 100644 --- a/tensorflow/contrib/tensor_forest/python/tensor_forest.py +++ b/tensorflow/contrib/tensor_forest/python/tensor_forest.py @@ -19,7 +19,9 @@ from __future__ import print_function import math import random +import sys +from tensorflow.contrib.losses.python.losses import loss_ops from tensorflow.contrib.tensor_forest.python import constants from tensorflow.contrib.tensor_forest.python.ops import inference_ops from 
tensorflow.contrib.tensor_forest.python.ops import training_ops @@ -429,8 +431,9 @@ class RandomForestGraphs(object): return math_ops.reduce_mean(math_ops.to_float(array_ops.pack(sizes))) # pylint: disable=unused-argument - def training_loss(self, features, labels): - return math_ops.neg(self.average_size()) + def training_loss(self, features, labels, data_spec=None, + name='training_loss'): + return math_ops.neg(self.average_size(), name=name) # pylint: disable=unused-argument def validation_loss(self, features, labels): @@ -456,6 +459,63 @@ class RandomForestGraphs(object): return ForestStats(tree_stats, self.params) +def one_hot_wrapper(num_classes, loss_fn): + """Some loss functions take one-hot labels.""" + def _loss(probs, targets): + one_hot_labels = array_ops.one_hot( + math_ops.to_int32(targets), num_classes, + on_value=1., off_value=0., dtype=dtypes.float32) + return loss_fn(probs, one_hot_labels) + return _loss + + +class TrainingLossForest(RandomForestGraphs): + """Random Forest that uses training loss as the termination criteria.""" + + def __init__(self, params, loss_fn=None, **kwargs): + """Initialize. + + Args: + params: Like RandomForestGraphs, a ForestHParams object. + loss_fn: A function that takes probabilities and targets and returns + a loss for each example. + **kwargs: Keyword args to pass to superclass (RandomForestGraphs). 
+ """ + self.loss_fn = loss_fn or one_hot_wrapper(params.num_classes, + loss_ops.log_loss) + self._loss = None + super(TrainingLossForest, self).__init__(params, **kwargs) + + def _get_loss(self, features, labels, data_spec=None): + """Constructs, caches, and returns the inference-based loss.""" + if self._loss is not None: + return self._loss + + def _average_loss(): + probs = self.inference_graph(features, data_spec=data_spec) + return math_ops.reduce_sum(self.loss_fn( + probs, labels)) / math_ops.to_float( + array_ops.shape(features)[0]) + + self._loss = control_flow_ops.cond( + self.average_size() > 0, _average_loss, + lambda: constant_op.constant(sys.maxsize, dtype=dtypes.float32)) + + return self._loss + + def training_graph(self, input_data, input_labels, data_spec=None, + **kwargs): + loss = self._get_loss(input_data, input_labels, data_spec=data_spec) + with ops.control_dependencies([loss.op]): + return super(TrainingLossForest, self).training_graph( + input_data, input_labels, **kwargs) + + def training_loss(self, features, labels, data_spec=None, + name='training_loss'): + return array_ops.identity( + self._get_loss(features, labels, data_spec=data_spec), name=name) + + class RandomTreeGraphs(object): """Builds TF graphs for random tree training and inference.""" diff --git a/tensorflow/contrib/tfprof/BUILD b/tensorflow/contrib/tfprof/BUILD index d55bda1bd05..e817cb86dfd 100644 --- a/tensorflow/contrib/tfprof/BUILD +++ b/tensorflow/contrib/tfprof/BUILD @@ -12,6 +12,7 @@ py_library( srcs_version = "PY2AND3", visibility = ["//tensorflow:__subpackages__"], deps = [ + "//tensorflow/contrib/tfprof/python/tools/tfprof:model_analyzer", "//tensorflow/contrib/tfprof/python/tools/tfprof:tfprof_logger", ], ) diff --git a/tensorflow/contrib/tfprof/README.md b/tensorflow/contrib/tfprof/README.md index 013be486767..e103cb21216 100644 --- a/tensorflow/contrib/tfprof/README.md +++ b/tensorflow/contrib/tfprof/README.md @@ -20,434 +20,9 @@ and measures system performance. 
4. Explore model based on name scope or graph structure. 5. Selectively grouping/filtering/accounting/ordering ops. -### Interfaces +tfprof can be used as CommandLine Interface (CLI) and Python API. +CLI locates in tensorflow/tools/tfprof. +Python API locates in tensorflow/contrib/tfprof. +Tutorial locates in tensorflow/tools/tfprof/README.md -[CLI Tutorials](#cli-tutorials): -It supports interactive mode for exploration and single-shot mode for -scripts. Outputs can be dumped to files or printed in terminal. - -Python API Tutorials: Python API is not released yet. - -## CLI Tutorials - -Tutorials are based on a 32 layers ResNet. -TODO(xpan): Provide graph.pbtxt, model.ckpt, tfprof_log and run_meta download. - -### Examples - -1) Start `tfprof` command line tool - -```shell -# Build the tool. -bazel build -c opt tensorflow/contrib/tfprof/... - -# Help information, including detail 'option' instructions. -bazel-bin/tensorflow/contrib/tfprof/tools/tfprof/tfprof help -# -# The following commands will start tfprof interactive mode. -# -# Profile model shapes and parameters only. -bazel-bin/tensorflow/contrib/tfprof/tools/tfprof/tfprof \ - --graph_path=/graph.pbtxt -# -# Additionally profile checkpoint statistics and values. -# Use '-account_type_regexes _checkpoint_variables' to select -# checkpoint tensors. -bazel-bin/tensorflow/contrib/tfprof/tools/tfprof/tfprof \ - --graph_path=graph.pbtxt \ - --checkpoint_path=model.ckpt -# -# Additionally profile ops requested memory and timing. -# See CLI Input Files section on generating run_meta file. -bazel-bin/tensorflow/contrib/tfprof/tools/tfprof/tfprof \ - --graph_path=graph.pbtxt \ - --run_meta_path=run_meta \ - --checkpoint_path=model.ckpt -# -# tfprof_log is used to define customized op types and float ops. -# Use tfprof_logger.write_op_log() to create tfprof_log. -# See 11) in Examples section on generating tfprof_log file. 
-bazel-bin/tensorflow/contrib/tfprof/tools/tfprof/tfprof \ - --graph_path=graph.pbtxt \ - --run_meta_path=run_meta \ - --op_log_path=tfprof_log \ - --checkpoint_path=model.ckpt -``` -Note that `graph.pbtxt` is an ASCII text format. - -2) Press enter to show the default options - -```shell -tfprof> -tfprof> --max_depth 4 --min_bytes 0 --min_micros 0 --min_params 0 --min_float_ops 0 --device_regexes .* --order_by name --account_type_regexes Variable --start_name_regexes .* --trim_name_regexes --show_name_regexes .* --hide_name_regexes IsVariableInitialized_[0-9]+,save\/.*,^zeros[0-9_]* --account_displayed_op_only false -# supported select fileds. Availability depends on --[run_meta|checkpoint|op_log]_path. -# [bytes|micros|params|float_ops|num_hidden_ops|tensor_value|device|op_types] --select params --viz false --dump_to_file -``` - -3) I want to see the `BatchNorm`'s gamma value in checkpoint. - -```shell -# Requires --graph_path, --checkpoint_path. -tfprof> scope -show_name_regexes unit_1_0.*gamma -select tensor_value -max_depth 5 -_TFProfRoot () - unit_1_0/shared_activation/init_bn/gamma () -[1.80 2.10 2.06 1.91 2.26 1.86 1.81 1.37 1.78 1.85 1.96 1.54 2.04 2.34 2.22 1.99 ], - unit_1_0/sub2/bn2/gamma () -[1.57 1.83 1.30 1.25 1.59 1.14 1.26 0.82 1.19 1.10 1.48 1.01 0.82 1.23 1.21 1.14 ], -``` - -4) I want to see my checkpoint tensors shape and number of parameters. - -```shell -# Requires --graph_path, --checkpoint_path. -# Increase -max_depth to see all tensors. 
-tfprof> scope -account_type_regexes _checkpoint_variables -select params -max_depth 4 -_TFProfRoot (--/930.58k params) - global_step (0/0 params) - init/init_conv/DW (3x3x3x16, 432/864 params) - pool_logit/DW (64x10, 640/1.28k params) - pool_logit/DW/Momentum (64x10, 640/640 params) - pool_logit/biases (10, 10/20 params) - pool_logit/biases/Momentum (10, 10/10 params) - unit_last/final_bn/beta (64, 64/128 params) - unit_last/final_bn/gamma (64, 64/128 params) - unit_last/final_bn/moving_mean (64, 64/64 params) - unit_last/final_bn/moving_variance (64, 64/64 params) -``` - -5) I defined an op named ‘cost’ to calculate the loss. I want to know what ops -it depends on take a long time to run. Hint: Use the ‘graph’ command to explore -graph dependencies. - -```shell -# Requires --graph_path, --run_meta_path. -tfprof> graph -start_name_regexes cost.* -max_depth 100 -min_micros 10000 -select micros -account_type_regexes .* -_TFProfRoot (0us/3.61sec) - init/init_conv/Conv2D (11.75ms/3.10sec) - random_shuffle_queue_DequeueMany (3.09sec/3.09sec) - unit_1_0/sub2/conv2/Conv2D (74.14ms/3.19sec) - unit_1_3/sub2/conv2/Conv2D (60.75ms/3.34sec) - unit_2_4/sub2/conv2/Conv2D (73.58ms/3.54sec) - unit_3_3/sub2/conv2/Conv2D (10.26ms/3.60sec) -``` - -6) I want to know the expensive operations during the back propagation. -Hint: tensorflow prepend ‘gradient’ to your defined name scopes. Use the ‘scope’ -command to explore based on name scope hierarchies. - -```shell -# Requires --graph_path, --run_meta_path. 
-tfprof> scope -start_name_regexes gradient.* -max_depth 100 -min_micros 20000 -select micros -account_type_regexes .* -_TFProfRoot (0us/2.29sec) - gradients/unit_1_0/sub1/conv1/Conv2D_grad/Conv2DBackpropFilter (54.96ms/54.96ms) - gradients/unit_1_0/sub2/conv2/Conv2D_grad/Conv2DBackpropFilter (83.63ms/83.63ms) - gradients/unit_1_1/sub1/conv1/Conv2D_grad/Conv2DBackpropFilter (99.25ms/99.25ms) - gradients/unit_1_2/sub1/conv1/Conv2D_grad/Conv2DBackpropFilter (95.40ms/95.40ms) - gradients/unit_1_2/sub2/conv2/Conv2D_grad/Conv2DBackpropFilter (99.83ms/99.83ms) - gradients/unit_1_3/sub1/conv1/Conv2D_grad/Conv2DBackpropFilter (95.39ms/95.39ms) - ... -``` - -7) Show the number of float operations in the model. -Note: float operations calculation depends on -1) op.RegisterStatistics. If an op doesn’t -have RegisterStatistics defined, its float operations cannot be counted. -2) fully defined shape is also necessary in order to calculate flops. -float operations number is provided by tensorflow::tfprof::OpLog logged from -Python API. - -```shell -# Requires --graph_path, --op_log_path. -tfprof> scope -min_float_ops 1 -max_depth 10 -select float_ops -account_type_regexes .* -_TFProfRoot (0/17.63b flops) - gradients/pool_logit/xw_plus_b/MatMul_grad/MatMul (163.84k/163.84k flops) - gradients/pool_logit/xw_plus_b/MatMul_grad/MatMul_1 (163.84k/163.84k flops) - init/init_conv/Conv2D (113.25m/113.25m flops) - pool_logit/xw_plus_b (1.28k/165.12k flops) - pool_logit/xw_plus_b/MatMul (163.84k/163.84k flops) - unit_1_0/sub1/conv1/Conv2D (603.98m/603.98m flops) - unit_1_0/sub2/conv2/Conv2D (603.98m/603.98m flops) - unit_1_1/sub1/conv1/Conv2D (603.98m/603.98m flops) - unit_1_1/sub2/conv2/Conv2D (603.98m/603.98m flops) - ... -``` - -8) Show the number of parameters of all `tf.trainable_variables()` in the model. - -```shell -# Requires --graph_path --op_log_path. -# store option for future commands. 
-tfprof> set -account_type_regexes _trainable_variables -tfprof> scope -max_depth 4 -select params -_TFProfRoot (--/464.15k params) - init/init_conv/DW (3x3x3x16, 432/432 params) - pool_logit/DW (64x10, 640/640 params) - pool_logit/biases (10, 10/10 params) - unit_last/final_bn/beta (64, 64/64 params) - unit_last/final_bn/gamma (64, 64/64 params) -``` - -Where does “_trainable_variables” come from? It is from the OpLog file -generated by write_op_log() Python API. write_op_log() help users create some -common op types implicitly. Users can define their own op types and log it -through the write_op_log() API. - -9) What if I’m lazy and don’t want to define op type? I have given my ops -well-defined names in my model’s code. And want to use names to select a group -of ops. Let’s try it! - -```shell -tfprof> set -account_type_regexes .* -tfprof> scope -show_name_regexes unit_2_1.*DW -max_depth 100 -account_displayed_op_only -_TFProfRoot (0/18.43k params) - unit_2_1/sub1/conv1/DW (3x3x32x32, 9.22k/9.22k params) - unit_2_1/sub2/conv2/DW (3x3x32x32, 9.22k/9.22k params) -``` - -The above command allows you to filter ops that match specific names. -`-account_displayed_op_only` asks tfprof to only account ops displayed -in terminal. Otherwise, tfprof accounts all ops matched by -`-account_type_regexes` recursively even if they are hidden due to some -options such as -max_depth. - -10) TensorFlow has built-in op types. For example, built-in op type `Variable` -seems to include `Variable's` created by your model. However, be careful when -depending on it because TensorFlow creates extra `Variable` ops implicitly and -the implicitly created ops can have the same prefix as the `Variable's` you -defined. - -In the following example, extra `Variables` are created and “/Momentum” is -appended to their names. This might cause you “model capacity” calculation -to get wrong. 
- -```shell -tfprof> scope -account_type_regexes Variable -max_depth 4 -select params -_TFProfRoot (--/930.58k params) - global_step (1/1 params) - init/init_conv/DW (3x3x3x16, 432/864 params) - pool_logit/DW (64x10, 640/1.28k params) - pool_logit/DW/Momentum (64x10, 640/640 params) - pool_logit/biases (10, 10/20 params) - pool_logit/biases/Momentum (10, 10/10 params) - unit_last/final_bn/beta (64, 64/128 params) - unit_last/final_bn/gamma (64, 64/128 params) - unit_last/final_bn/moving_mean (64, 64/64 params) - unit_last/final_bn/moving_variance (64, 64/64 params) -``` - - -11) A example of defining extra op type for ops using `OpLog` - -First, in Python code, create an `OpLog` proto and add op type -information to it: - -```python - -op_log = tfprof_log_pb2.OpLog() -entry = op_log.log_entries.add() -entry.name = 'pool_logit/DW' -entry.types.append('pool_logit') -entry = op_log.log_entries.add() -entry.name = 'pool_logit/biases' -# Alternatively: -# var = tf.get_variable(xxx) -# entry.name = var.op.name -entry.types.append('pool_logit') -``` - -Second, call write_op_log to write the OpLog proto. - -```python -tf.tfprof.tfprof_logger.write_op_log(sess.graph, /tmp/my_op_log_dir, op_log) -``` - -Third, when starting the tfprof tool, specify -"--op_log_path /tmp/my_op_log_dir/op_log" - -```shell -tfprof> scope -account_type_regexes pool_logit -max_depth 4 -select params -_TFProfRoot (--/650 params) - pool_logit/DW (64x10, 640/640 params) - pool_logit/biases (10, 10/10 params) -``` - -Note that when you call -`tf.tfprof.tfprof_logger.write_op_log(...)`, the tool adds all `Variables` -inside `tf.trainable_variables()` to `_trainable_variables`. - -12) Run tfprof in one-shot mode and dump result to file. - -```shell -# Printed to stdout if --dump_to_file is not set. -tfprof scope --graph_path /cns/ij-d/home/xpan/tfprof/graph.pbtxt \ - --max_depth 3 \ - --dump_to_file "/tmp/dump" -Reading Files... -Parsing GraphDef... -Preparing Views... 
- -cat /tmp/dump -_TFProfRoot (--/930.58k params) - global_step (0/0 params) - pool_logit/DW (64x10, 640/1.28k params) - pool_logit/biases (10, 10/20 params) -``` - -13) Analyze how balanced Variable are on parameter servers. - -In this tutorial, I'm going to use a seq2seq model, which are split -on several gpus at workers and several parameter servers. - -In tfprof, 'device' is an op_type. For example, if op1 and op2 are placed on -gpu0. They share an op_type called 'gpu0'. - -```shell -bazel-bin/tensorflow/contrib/tfprof/tools/tfprof/tfprof \ - --graph_path ~/tfprof/textsum/graph.pbtxt \ - --run_meta_path ~/tfprof/textsum/run_meta - -# Looks like ps task 1 is holding twice more parameters than task 0. -tfprof> scope -select device,params -account_type_regexes .*ps.*task:0.* -max_depth 1 -_TFProfRoot (--/25.81m params) -tfprof> scope -select device,params -account_type_regexes .*ps.*task:1.* -max_depth 1 -_TFProfRoot (--/58.84m params) -``` - -### CLI Input Files - -tfprof command line inference (CLI) loads dumped files from a tensorflow model. -Convert them into in-memory data structures. To use it, users need to specify -the locations of the dumped files. The following are the dumped files loaded -by tfprof: - ---graph_path: GraphDef text file (required). Used to build in-memory -representation of the model. For example, graph.pbtxt written by tf.Supervisor -is a candidate. If you are not using tf.Supervisor, you can easily get GraphDef -using tf.Graph.as_graph_def() or other API. - ---run_meta_path: tensorflow::RunMetadata. -Used to get the memory and time consumption of -each op of the model. Users need to enable it. For example, the following code -snippet writes a RunMetadata file: - -```python -run_options = config_pb2.RunOptions(trace_level=config_pb2.RunOptions.FULL_TRACE) -run_metadata = config_pb2.RunMetadata() -# Once a while, call it the get the RunMeta. 
-_ = self._sess.run(..., options=run_options, run_metadata=run_metadata) -with gfile.Open(os.path.join(output_dir, "run_meta"), "w") as f: - f.write(run_metadata.SerializeToString()) -``` - ---op_log_path: -tensorflow::tfprof::OpLog. A proto used to provide extra op information -for ops. By giving a group of ops a type name, users can easily aggregate the -statistics for those ops without accidently missing or including extra ops. -tfprof exposes the following Python API to add op information and logging. - -```python -tf.contrib.tfprof.tfprof_logger.write_op_log(graph, log_dir, op_log=None) -``` - ---checkpoint_path: -TensorFlow checkpoint. It defines _checkpoint_variable op type. It also -provides checkpointed tensors' values. - - -## Design - - -### In-memory representation - -Scope: This representation organizes ops based on name scope hierarchy, -similar to filesystem hierarchy. Hence, it is essentially a tree data structure. -For example op1 with name “name1/name2” is a child of op2 with name “name1”. - -Graph: The representation organizes ops based on op inputs. Hence it is -a graph structure. The graph is a “directed acyclic graph” (hopefully), with -direction from “output to input”. The direction is design this way so that users -can trace from “result” to its “sources”. - -### Command line options - -tfprof’s major goals are to measure system performance and quicly analyze -model architectures. Hence, its commands and options should allow users to achieve -these 2 goals easily. - -graph: It is expected that users will mostly use graph representation to -debug system performance. Hence, tfprof supports graph command, which pulls the -graph in-memory representation described above. - -scope: It is expected that some users might want to explore their model -statistics using the name scope information they defined in the Python codes. -Hence, tfprof supports “scope” command, which pulls the tree in-memory -representation. 
- -set: It is used to store the options so that user doesn’t need to -re-type the same option again and again in the follow up command line. Note that -tfprof has traditional terminal’s history and auto-complete support. - -help: print help information. - -Options: Run “tfprof help” to get detailed explanations. - -```python -"-max_depth", -"-min_bytes", -"-min_micros", -"-min_params", -"-min_float_ops", -"-order_by", -"-account_type_regexes", -"-start_name_regexes", -"-trim_name_regexes", -"-show_name_regexes", -"-hide_name_regexes", -"-account_displayed_op_only", -"-select", -"-viz", # Only supported for graph command. -"-dump_to_file", -``` - -A key design is that stats are aggregated from descendants up to ancestors. -`-account_type_regexes` is used to decide which ops stat is accounted. It makes -decision based on op type. Usually set it to `.*` if no extra type information -is added to the ops using OpLog. Intuitively, only accounted ops are displayed. -`-min/max` and `-show/hide/trim/start` options are only used the optionally -displayed or hide ops based on ops’ name and stats. However, they don’t prevent -tfprof from accounting stats of hidden ops. Hence, the stat of a op can be -aggregated by its parent even if it is hidden. `-account_displayed_op_only` is -an option to break this rule. When it is set, only displayed ops are accounted. - -Regexes are all comma-separated, for example `-show_name_regexes` -`regex1.*,regex2.*`. It is designed this way because it is convenient and comma -is not expected to show up in op names. - -`-order_by` is used to order displayed ops. Displayed ops at the same hierarchy -(notice the indent printed) are sorted according to order_by. - -## Future Work - -* Load SummaryWriter event logs so that it can show the latest summary value. - -* Better sorting and aggregation of outputs. Easier comprehension. - -* Currently, shape information is based on `graph.pbtxt`. When the shape -information is incomplete, tfprof ignores it. 
See if it can use `RunMetadata` -and `Checkpoint` to complete shape information. +Enjoy! \ No newline at end of file diff --git a/tensorflow/contrib/tfprof/__init__.py b/tensorflow/contrib/tfprof/__init__.py index ce777979b96..129dad2726c 100644 --- a/tensorflow/contrib/tfprof/__init__.py +++ b/tensorflow/contrib/tfprof/__init__.py @@ -17,5 +17,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from tensorflow.contrib.tfprof.python.tools.tfprof import model_analyzer from tensorflow.contrib.tfprof.python.tools.tfprof import tfprof_logger from tensorflow.python.util.all_util import make_all diff --git a/tensorflow/contrib/tfprof/python/tools/tfprof/BUILD b/tensorflow/contrib/tfprof/python/tools/tfprof/BUILD index 87a8311486f..07677c6ed73 100644 --- a/tensorflow/contrib/tfprof/python/tools/tfprof/BUILD +++ b/tensorflow/contrib/tfprof/python/tools/tfprof/BUILD @@ -3,14 +3,36 @@ licenses(["notice"]) # Apache 2.0 package(default_visibility = ["//visibility:public"]) load("//tensorflow:tensorflow.bzl", "tf_py_test") +load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc") + +py_library( + name = "model_analyzer", + srcs = ["model_analyzer.py"], + srcs_version = "PY2AND3", + deps = [ + "//tensorflow/contrib/tfprof/python/tools/tfprof:pywrap_tensorflow_print_model_analysis_lib", + "//tensorflow/contrib/tfprof/python/tools/tfprof:tfprof_logger", + "//tensorflow/tools/tfprof:protos_all_py", + ], +) + +py_test( + name = "model_analyzer_test", + srcs = ["model_analyzer_test.py"], + srcs_version = "PY2AND3", + deps = [ + ":model_analyzer", + "//tensorflow:tensorflow_py", + ], +) py_library( name = "tfprof_logger", srcs = ["tfprof_logger.py"], srcs_version = "PY2AND3", deps = [ - "//tensorflow/contrib/tfprof/tools/tfprof:protos_all_py", "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/tools/tfprof:protos_all_py", ], ) @@ -20,7 +42,34 @@ tf_py_test( additional_deps = [ ":tfprof_logger", 
"//tensorflow:tensorflow_py", - "//tensorflow/contrib/tfprof/tools/tfprof:protos_all_py", + "//tensorflow/tools/tfprof:protos_all_py", + ], +) + +tf_py_wrap_cc( + name = "pywrap_tensorflow_print_model_analysis_lib", + srcs = ["pywrap_tensorflow_print_model_analysis.i"], + swig_includes = [ + "//tensorflow/python:lib/core/strings.i", + "//tensorflow/python:platform/base.i", + ], + deps = [ + "//tensorflow/core:framework_headers_lib", + "//tensorflow/tools/tfprof/internal:print_model_analysis_hdr", + "//util/python:python_headers", + ], +) + +py_test( + name = "print_model_analysis_test", + srcs = ["print_model_analysis_test.py"], + srcs_version = "PY2AND3", + deps = [ + ":pywrap_tensorflow_print_model_analysis_lib", + "//tensorflow:tensorflow_py", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:platform_test", + "//tensorflow/tools/tfprof:protos_all_py", ], ) diff --git a/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer.py b/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer.py new file mode 100644 index 00000000000..cc94fd65b53 --- /dev/null +++ b/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer.py @@ -0,0 +1,188 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Model Analyzer. + +Analyze model, including shape, params, time, memory, structure, etc. 
+""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib.tfprof.python.tools.tfprof import pywrap_tensorflow_print_model_analysis_lib as print_mdl +from tensorflow.contrib.tfprof.python.tools.tfprof import tfprof_logger +from tensorflow.tools.tfprof import tfprof_options_pb2 +from tensorflow.tools.tfprof import tfprof_output_pb2 + +# pylint: disable=bad-whitespace +# pylint: disable=bad-continuation +# 2 example tfprof_options for print_model_analysis API. +# +# Show the parameter statistics of trainable variables. +TRAINABLE_VARS_PARAMS_STAT_OPTIONS = { + 'max_depth': 10000, + 'min_bytes': 0, + 'min_micros': 0, + 'min_params': 0, + 'min_float_ops': 0, + 'device_regexes': ['.*'], + 'order_by': 'name', + 'account_type_regexes': [tfprof_logger.TRAINABLE_VARIABLES], + 'start_name_regexes': ['.*'], + 'trim_name_regexes': [], + 'show_name_regexes': ['.*'], + 'hide_name_regexes': [], + 'account_displayed_op_only': True, + 'select': ['params'], + 'viz': False, + 'dump_to_file': '' +} + +# Show the number float operations. +FLOAT_OPS_OPTIONS = { + 'max_depth': 10000, + 'min_bytes': 0, + 'min_micros': 0, + 'min_params': 0, + 'min_float_ops': 1, + 'device_regexes': ['.*'], + 'order_by': 'float_ops', + 'account_type_regexes': ['.*'], + 'start_name_regexes': ['.*'], + 'trim_name_regexes': [], + 'show_name_regexes': ['.*'], + 'hide_name_regexes': [], + 'account_displayed_op_only': True, + 'select': ['float_ops'], + 'viz': False, + 'dump_to_file': '' +} + +# Show number of parameters on parameter server 0. +# It is recommended to provide`run_meta` argument +# to have complete device placement info. 
+PRINT_PARAMS_ON_DEVICE = { + 'max_depth': 1, + 'min_bytes': 0, + 'min_micros': 0, + 'min_params': 0, + 'min_float_ops': 0, + 'device_regexes': ['.*'], + 'order_by': 'name', + 'account_type_regexes': ['.*ps.*task:0.*'], + 'start_name_regexes': ['.*'], + 'trim_name_regexes': [], + 'show_name_regexes': ['.*'], + 'hide_name_regexes': [], + 'account_displayed_op_only': False, + 'select': ['device', 'params'], + 'viz': False, + 'dump_to_file': '' +} + +# Show the timing stats and memory demands. +PRINT_ALL_TIMING_MEMORY = { + 'max_depth': 10000, + 'min_bytes': 1, # Only >=1 + 'min_micros': 1, # Only >=1 + 'min_params': 0, + 'min_float_ops': 0, + 'device_regexes': ['.*'], + 'order_by': 'name', + 'account_type_regexes': ['.*'], + 'start_name_regexes': ['.*'], + 'trim_name_regexes': [], + 'show_name_regexes': ['.*'], + 'hide_name_regexes': [], + 'account_displayed_op_only': True, + 'select': ['micros', 'bytes'], + 'viz': False, + 'dump_to_file': '' +} + +# pylint: enable=bad-whitespace +# pylint: enable=bad-continuation + + +def print_model_analysis(graph, + run_meta=None, + op_log=None, + tfprof_cmd='scope', + tfprof_options=TRAINABLE_VARS_PARAMS_STAT_OPTIONS): + """Print model statistics. + + Prints the model statistics to stdout. Also returns the results + in a TFProfNode proto. See go/tfprof or run tfprof tool: + 'bazel run third_party/tensorflow/tools/tfprof help' + + Examples: + Show the parameter/shape statistics of tf.trainable_variables(). + print_model_analysis(sess.graph). + + Show number of float ops. Only ops with RegisterStatistics defined + are counted. + show_float_op_opts = model_analyzer.FLOAT_OPS_OPTIONS + print_model_analysis(sess.graph, tfprof_options=show_float_op_opts) + + Args: + graph: tf.Graph. + run_meta: tensorflow::RunMetadata proto. When provided, also shows valid + timing and memory information when 'select' option contains + 'micros' and 'bytes'. + op_log: tensorflow::tfprof::OpLog proto. 
users can use this proto to + group together ops and use a op_type to select the group. + tfprof_cmd: string. Either 'scope' or 'graph'. 'scope' view organize + ops using their name scopes. 'graph' view organize ops using + their graph inputs. + tfprof_options: See 'tfprof help' for details. + Returns: + TFProfNode proto. Side effect: a formatted output to stdout. + """ + # pylint: disable=protected-access + op_log = tfprof_logger._merge_default_with_oplog(graph, op_log, run_meta) + # pylint: enable=protected-access + opts = tfprof_options_pb2.OptionsProto() + opts.max_depth = tfprof_options['max_depth'] + opts.min_bytes = tfprof_options['min_bytes'] + opts.min_micros = tfprof_options['min_micros'] + opts.min_params = tfprof_options['min_params'] + opts.min_float_ops = tfprof_options['min_float_ops'] + for p in tfprof_options['device_regexes']: + opts.device_regexes.append(p) + opts.order_by = tfprof_options['order_by'] + for p in tfprof_options['account_type_regexes']: + opts.account_type_regexes.append(p) + for p in tfprof_options['start_name_regexes']: + opts.start_name_regexes.append(p) + for p in tfprof_options['trim_name_regexes']: + opts.trim_name_regexes.append(p) + for p in tfprof_options['show_name_regexes']: + opts.show_name_regexes.append(p) + for p in tfprof_options['hide_name_regexes']: + opts.hide_name_regexes.append(p) + opts.account_displayed_op_only = tfprof_options['account_displayed_op_only'] + for p in tfprof_options['select']: + opts.select.append(p) + opts.viz = tfprof_options['viz'] + opts.dump_to_file = tfprof_options['dump_to_file'] + + run_meta_str = run_meta.SerializeToString() if run_meta else b'' + op_log_str = op_log.SerializeToString() if op_log else b'' + + tfprof_node = tfprof_output_pb2.TFProfNode() + tfprof_node.ParseFromString( + print_mdl.PrintModelAnalysis( + graph.as_graph_def().SerializeToString(), run_meta_str, op_log_str, + tfprof_cmd.encode('utf-8'), opts.SerializeToString())) + return tfprof_node diff --git 
a/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer_test.py b/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer_test.py new file mode 100644 index 00000000000..9988392acd9 --- /dev/null +++ b/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer_test.py @@ -0,0 +1,84 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +import tensorflow as tf + + +class PrintModelAnalysisTest(tf.test.TestCase): + + def _BuildSmallModel(self): + image = tf.zeros([2, 6, 6, 3]) + kernel = tf.get_variable( + 'DW', [3, 3, 3, 6], + tf.float32, + initializer=tf.random_normal_initializer(stddev=0.001)) + x = tf.nn.conv2d(image, kernel, [1, 2, 2, 1], padding='SAME') + kernel = tf.get_variable( + 'DW2', [2, 2, 6, 12], + tf.float32, + initializer=tf.random_normal_initializer(stddev=0.001)) + x = tf.nn.conv2d(x, kernel, [1, 2, 2, 1], padding='SAME') + return x + + def testDumpToFile(self): + opts = tf.contrib.tfprof.model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS + opts['dump_to_file'] = os.path.join(tf.test.get_temp_dir(), 'dump') + + with tf.Session() as sess, tf.device('/cpu:0'): + _ = self._BuildSmallModel() + tf.contrib.tfprof.model_analyzer.print_model_analysis( + sess.graph, 
tfprof_options=opts) + + with tf.gfile.Open(opts['dump_to_file'], 'r') as f: + self.assertEqual(u'_TFProfRoot (--/450 params)\n' + ' DW (3x3x3x6, 162/162 params)\n' + ' DW2 (2x2x6x12, 288/288 params)\n', + f.read().decode('utf-8')) + + def testSelectEverything(self): + opts = tf.contrib.tfprof.model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS + opts['dump_to_file'] = os.path.join(tf.test.get_temp_dir(), 'dump') + opts['account_type_regexes'] = ['.*'] + opts['select'] = [ + 'bytes', 'params', 'float_ops', 'num_hidden_ops', 'device', 'op_types' + ] + + with tf.Session() as sess, tf.device('/cpu:0'): + x = self._BuildSmallModel() + + sess.run(tf.initialize_all_variables()) + run_meta = tf.RunMetadata() + _ = sess.run(x, + options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE), + run_metadata=run_meta) + + tf.contrib.tfprof.model_analyzer.print_model_analysis( + sess.graph, run_meta, tfprof_options=opts) + + with tf.gfile.Open(opts['dump_to_file'], 'r') as f: + # pylint: disable=line-too-long + self.assertEqual( + '_TFProfRoot (0/450 params, 0/10.44k flops, 0B/5.28KB, _kTFScopeParent)\n Conv2D (0/0 params, 5.83k/5.83k flops, 432B/432B, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|Conv2D)\n Conv2D_1 (0/0 params, 4.61k/4.61k flops, 384B/384B, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|Conv2D)\n DW (3x3x3x6, 162/162 params, 0/0 flops, 648B/1.30KB, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|Variable|_trainable_variables)\n DW/Assign (0/0 params, 0/0 flops, 0B/0B, /device:CPU:0, /device:CPU:0|Assign)\n DW/Initializer (0/0 params, 0/0 flops, 0B/0B, _kTFScopeParent)\n DW/Initializer/random_normal (0/0 params, 0/0 flops, 0B/0B, /device:CPU:0, /device:CPU:0|Add)\n DW/Initializer/random_normal/RandomStandardNormal (0/0 params, 0/0 flops, 0B/0B, /device:CPU:0, /device:CPU:0|RandomStandardNormal)\n DW/Initializer/random_normal/mean (0/0 params, 0/0 flops, 0B/0B, /device:CPU:0, 
/device:CPU:0|Const)\n DW/Initializer/random_normal/mul (0/0 params, 0/0 flops, 0B/0B, /device:CPU:0, /device:CPU:0|Mul)\n DW/Initializer/random_normal/shape (0/0 params, 0/0 flops, 0B/0B, /device:CPU:0, /device:CPU:0|Const)\n DW/Initializer/random_normal/stddev (0/0 params, 0/0 flops, 0B/0B, /device:CPU:0, /device:CPU:0|Const)\n DW/read (0/0 params, 0/0 flops, 648B/648B, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|Identity)\n DW2 (2x2x6x12, 288/288 params, 0/0 flops, 1.15KB/2.30KB, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|Variable|_trainable_variables)\n DW2/Assign (0/0 params, 0/0 flops, 0B/0B, /device:CPU:0, /device:CPU:0|Assign)\n DW2/Initializer (0/0 params, 0/0 flops, 0B/0B, _kTFScopeParent)\n DW2/Initializer/random_normal (0/0 params, 0/0 flops, 0B/0B, /device:CPU:0, /device:CPU:0|Add)\n DW2/Initializer/random_normal/RandomStandardNormal (0/0 params, 0/0 flops, 0B/0B, /device:CPU:0, /device:CPU:0|RandomStandardNormal)\n DW2/Initializer/random_normal/mean (0/0 params, 0/0 flops, 0B/0B, /device:CPU:0, /device:CPU:0|Const)\n DW2/Initializer/random_normal/mul (0/0 params, 0/0 flops, 0B/0B, /device:CPU:0, /device:CPU:0|Mul)\n DW2/Initializer/random_normal/shape (0/0 params, 0/0 flops, 0B/0B, /device:CPU:0, /device:CPU:0|Const)\n DW2/Initializer/random_normal/stddev (0/0 params, 0/0 flops, 0B/0B, /device:CPU:0, /device:CPU:0|Const)\n DW2/read (0/0 params, 0/0 flops, 1.15KB/1.15KB, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|Identity)\n init (0/0 params, 0/0 flops, 0B/0B, /device:CPU:0, /device:CPU:0|NoOp)\n zeros (0/0 params, 0/0 flops, 864B/864B, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|Const)\n', + f.read().decode('utf-8')) + # pylint: enable=line-too-long + + +if __name__ == '__main__': + tf.test.main() diff --git a/tensorflow/contrib/tfprof/python/tools/tfprof/print_model_analysis_test.py 
b/tensorflow/contrib/tfprof/python/tools/tfprof/print_model_analysis_test.py new file mode 100644 index 00000000000..0354d0f631d --- /dev/null +++ b/tensorflow/contrib/tfprof/python/tools/tfprof/print_model_analysis_test.py @@ -0,0 +1,238 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""print_model_analysis test.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf +from google.protobuf import text_format +from tensorflow.contrib.tfprof.python.tools.tfprof import pywrap_tensorflow_print_model_analysis_lib as print_mdl +from tensorflow.tools.tfprof import tfprof_options_pb2 +from tensorflow.tools.tfprof import tfprof_output_pb2 + +# pylint: disable=bad-whitespace +# pylint: disable=bad-continuation +TEST_OPTIONS = { + 'max_depth': 10000, + 'min_bytes': 0, + 'min_micros': 0, + 'min_params': 0, + 'min_float_ops': 0, + 'device_regexes': ['.*'], + 'order_by': 'name', + 'account_type_regexes': ['.*'], + 'start_name_regexes': ['.*'], + 'trim_name_regexes': [], + 'show_name_regexes': ['.*'], + 'hide_name_regexes': [], + 'account_displayed_op_only': True, + 'select': ['params'], + 'viz': False +} + +# pylint: enable=bad-whitespace +# pylint: enable=bad-continuation + + +class PrintModelAnalysisTest(tf.test.TestCase): + + def 
_BuildSmallModel(self): + image = tf.zeros([2, 6, 6, 3]) + kernel = tf.get_variable( + 'DW', [6, 6, 3, 6], + tf.float32, + initializer=tf.random_normal_initializer(stddev=0.001)) + x = tf.nn.conv2d(image, kernel, [1, 2, 2, 1], padding='SAME') + return x + + def testPrintModelAnalysis(self): + opts = tfprof_options_pb2.OptionsProto() + opts.max_depth = TEST_OPTIONS['max_depth'] + opts.min_bytes = TEST_OPTIONS['min_bytes'] + opts.min_micros = TEST_OPTIONS['min_micros'] + opts.min_params = TEST_OPTIONS['min_params'] + opts.min_float_ops = TEST_OPTIONS['min_float_ops'] + for p in TEST_OPTIONS['device_regexes']: + opts.device_regexes.append(p) + opts.order_by = TEST_OPTIONS['order_by'] + for p in TEST_OPTIONS['account_type_regexes']: + opts.account_type_regexes.append(p) + for p in TEST_OPTIONS['start_name_regexes']: + opts.start_name_regexes.append(p) + for p in TEST_OPTIONS['trim_name_regexes']: + opts.trim_name_regexes.append(p) + for p in TEST_OPTIONS['show_name_regexes']: + opts.show_name_regexes.append(p) + for p in TEST_OPTIONS['hide_name_regexes']: + opts.hide_name_regexes.append(p) + opts.account_displayed_op_only = TEST_OPTIONS['account_displayed_op_only'] + for p in TEST_OPTIONS['select']: + opts.select.append(p) + opts.viz = TEST_OPTIONS['viz'] + + with tf.Session() as sess, tf.device('/cpu:0'): + _ = self._BuildSmallModel() + tfprof_pb = tfprof_output_pb2.TFProfNode() + tfprof_pb.ParseFromString( + print_mdl.PrintModelAnalysis(sess.graph.as_graph_def( + ).SerializeToString(), b'', b'', b'scope', opts.SerializeToString())) + + expected_pb = tfprof_output_pb2.TFProfNode() + text_format.Merge(r"""name: "_TFProfRoot" + exec_micros: 0 + requested_bytes: 0 + total_exec_micros: 0 + total_requested_bytes: 0 + total_parameters: 648 + children { + name: "Conv2D" + exec_micros: 0 + requested_bytes: 0 + total_exec_micros: 0 + total_requested_bytes: 0 + total_parameters: 0 + device: "/device:CPU:0" + float_ops: 0 + total_float_ops: 0 + } + children { + name: "DW" + 
exec_micros: 0 + requested_bytes: 0 + parameters: 648 + total_exec_micros: 0 + total_requested_bytes: 0 + total_parameters: 648 + device: "/device:CPU:0" + children { + name: "DW/Assign" + exec_micros: 0 + requested_bytes: 0 + total_exec_micros: 0 + total_requested_bytes: 0 + total_parameters: 0 + device: "/device:CPU:0" + float_ops: 0 + total_float_ops: 0 + } + children { + name: "DW/Initializer" + exec_micros: 0 + requested_bytes: 0 + total_exec_micros: 0 + total_requested_bytes: 0 + total_parameters: 0 + children { + name: "DW/Initializer/random_normal" + exec_micros: 0 + requested_bytes: 0 + total_exec_micros: 0 + total_requested_bytes: 0 + total_parameters: 0 + device: "/device:CPU:0" + children { + name: "DW/Initializer/random_normal/RandomStandardNormal" + exec_micros: 0 + requested_bytes: 0 + total_exec_micros: 0 + total_requested_bytes: 0 + total_parameters: 0 + device: "/device:CPU:0" + float_ops: 0 + total_float_ops: 0 + } + children { + name: "DW/Initializer/random_normal/mean" + exec_micros: 0 + requested_bytes: 0 + total_exec_micros: 0 + total_requested_bytes: 0 + total_parameters: 0 + device: "/device:CPU:0" + float_ops: 0 + total_float_ops: 0 + } + children { + name: "DW/Initializer/random_normal/mul" + exec_micros: 0 + requested_bytes: 0 + total_exec_micros: 0 + total_requested_bytes: 0 + total_parameters: 0 + device: "/device:CPU:0" + float_ops: 0 + total_float_ops: 0 + } + children { + name: "DW/Initializer/random_normal/shape" + exec_micros: 0 + requested_bytes: 0 + total_exec_micros: 0 + total_requested_bytes: 0 + total_parameters: 0 + device: "/device:CPU:0" + float_ops: 0 + total_float_ops: 0 + } + children { + name: "DW/Initializer/random_normal/stddev" + exec_micros: 0 + requested_bytes: 0 + total_exec_micros: 0 + total_requested_bytes: 0 + total_parameters: 0 + device: "/device:CPU:0" + float_ops: 0 + total_float_ops: 0 + } + float_ops: 0 + total_float_ops: 0 + } + float_ops: 0 + total_float_ops: 0 + } + children { + name: "DW/read" + 
exec_micros: 0 + requested_bytes: 0 + total_exec_micros: 0 + total_requested_bytes: 0 + total_parameters: 0 + device: "/device:CPU:0" + float_ops: 0 + total_float_ops: 0 + } + float_ops: 0 + total_float_ops: 0 + } + children { + name: "zeros" + exec_micros: 0 + requested_bytes: 0 + total_exec_micros: 0 + total_requested_bytes: 0 + total_parameters: 0 + device: "/device:CPU:0" + float_ops: 0 + total_float_ops: 0 + } + float_ops: 0 + total_float_ops: 0""", expected_pb) + self.assertEqual(expected_pb, tfprof_pb) + + +if __name__ == '__main__': + tf.test.main() diff --git a/tensorflow/contrib/tfprof/python/tools/tfprof/pywrap_tensorflow_print_model_analysis.i b/tensorflow/contrib/tfprof/python/tools/tfprof/pywrap_tensorflow_print_model_analysis.i new file mode 100644 index 00000000000..05b734a699f --- /dev/null +++ b/tensorflow/contrib/tfprof/python/tools/tfprof/pywrap_tensorflow_print_model_analysis.i @@ -0,0 +1,43 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +%include "tensorflow/python/lib/core/strings.i" +%include "tensorflow/python/platform/base.i" + +%{ +#include "tensorflow/tools/tfprof/internal/print_model_analysis.h" +#include "tensorflow/core/framework/types.h" +%} + +%typemap(typecheck) const string & = char *; +%typemap(in) const string& (string temp) { + if (!_PyObjAs($input, &temp)) return NULL; + $1 = &temp; +} +%typemap(out) const string& { + $result = PyString_FromStringAndSize($1->data(), $1->size()); +} +%apply const string & {string &}; +%apply const string & {string *}; + +%ignoreall + +%unignore tensorflow; +%unignore tensorflow::tfprof; +%unignore tensorflow::tfprof::PrintModelAnalysis; + +%include "tensorflow/tools/tfprof/internal/print_model_analysis.h" + +%unignoreall \ No newline at end of file diff --git a/tensorflow/contrib/tfprof/python/tools/tfprof/tfprof_logger.py b/tensorflow/contrib/tfprof/python/tools/tfprof/tfprof_logger.py index 53dd2632b69..1f710bc970c 100644 --- a/tensorflow/contrib/tfprof/python/tools/tfprof/tfprof_logger.py +++ b/tensorflow/contrib/tfprof/python/tools/tfprof/tfprof_logger.py @@ -24,8 +24,8 @@ import os import sys import tensorflow as tf -from tensorflow.contrib.tfprof.tools.tfprof import tfprof_log_pb2 from tensorflow.python.framework import ops +from tensorflow.tools.tfprof import tfprof_log_pb2 TRAINABLE_VARIABLES = '_trainable_variables' REGISTERED_FLOP_STATS = 'flops' @@ -85,7 +85,7 @@ def _get_logged_ops(graph, run_meta=None): if node.name not in logged_ops: entry = tfprof_log_pb2.OpLogEntry() entry.name = node.name - entry.float_ops = stats.value + entry.float_ops = int(stats.value) logged_ops[entry.name] = entry for v in graph.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES): diff --git a/tensorflow/contrib/training/__init__.py b/tensorflow/contrib/training/__init__.py index c9564fc316c..d2a6368d785 100644 --- a/tensorflow/contrib/training/__init__.py +++ 
b/tensorflow/contrib/training/__init__.py @@ -32,8 +32,9 @@ like to store state in the forward direction across segments of an example. To resample data with replacement on a per-example basis, use ['rejection_sample'](#rejection_sample) or ['resample_at_rate'](#resample_at_rate). For `rejection_sample`, provide -a boolean Tensor describing whether to accept or reject. For `resample_at_rate`, -providing the desired rate for each example. If you wish to specify relative +a boolean Tensor describing whether to accept or reject. Resulting batch sizes +are always the same. For `resample_at_rate`, provide the desired rate for each +example. Resulting batch sizes may vary. If you wish to specify relative rates, rather than absolute ones, use ['weighted_resample'](#weighted_resample) (which also returns the actual resampling rate used for each output example). diff --git a/tensorflow/contrib/util/convert_graphdef_memmapped_format_lib.cc b/tensorflow/contrib/util/convert_graphdef_memmapped_format_lib.cc index 68cb20d0b57..1f079027efb 100644 --- a/tensorflow/contrib/util/convert_graphdef_memmapped_format_lib.cc +++ b/tensorflow/contrib/util/convert_graphdef_memmapped_format_lib.cc @@ -16,8 +16,10 @@ limitations under the License. #include #include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor.pb.h" +#include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/kernels/immutable_constant_op.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/logging.h" @@ -45,13 +47,27 @@ class NodeConverter { const DataType tensor_data_type = tensor_proto.dtype(); const TensorShapeProto tensor_shape = tensor_proto.tensor_shape(); + // Check that the tensor type is POD, only these types are supported for + // memmapping. 
+ // DataType enum is explicitly converted to int to avoid errors with passing + // enum type are a parameter type to std::unordered_set. + static std::unordered_set supported_types{ +#define TYPE_FOR_SET(type) static_cast(DataTypeToEnum::value), + TF_CALL_POD_TYPES(TYPE_FOR_SET) +#undef ADD_TYPE + }; + + if (supported_types.count(static_cast(tensor_data_type)) == 0) { + return Status::OK(); + } + // Create Tensor from value and write it in memmapped format. Tensor parsed(tensor_proto.dtype()); if (!parsed.FromProto(cpu_allocator(), tensor_proto)) { return errors::InvalidArgument("Cannot parse tensor from proto: ", tensor_proto.DebugString()); } - if (parsed.TotalBytes() < min_conversion_size_bytes) { + if (parsed.TotalBytes() < static_cast(min_conversion_size_bytes)) { return Status::OK(); } diff --git a/tensorflow/contrib/util/convert_graphdef_memmapped_format_test.cc b/tensorflow/contrib/util/convert_graphdef_memmapped_format_test.cc index d64dca7b634..cb1e7577cf2 100644 --- a/tensorflow/contrib/util/convert_graphdef_memmapped_format_test.cc +++ b/tensorflow/contrib/util/convert_graphdef_memmapped_format_test.cc @@ -26,6 +26,15 @@ limitations under the License. 
namespace tensorflow { namespace { +bool GraphHasImmutableConstNodes(const GraphDef& graph_def) { + for (const auto& node : graph_def.node()) { + if (node.op() == "ImmutableConst") { + return true; + } + } + return false; +} + TEST(ConvertGraphdefMemmappedFormatTest, ConvertModel) { const string dir = testing::TmpDir(); const string filename_pb = io::JoinPath(dir, "graphdef.pb"); @@ -69,6 +78,7 @@ TEST(ConvertGraphdefMemmappedFormatTest, ConvertModel) { TF_ASSERT_OK(ReadBinaryProto( &memmapped_env, MemmappedFileSystem::kMemmappedPackageDefaultGraphDef, &loaded_graph_def)); + ASSERT_TRUE(GraphHasImmutableConstNodes(loaded_graph_def)); TF_ASSERT_OK(session->Create(loaded_graph_def)) << "Can't create test graph"; std::vector outputs; @@ -79,5 +89,48 @@ TEST(ConvertGraphdefMemmappedFormatTest, ConvertModel) { EXPECT_EQ(outputs.front().flat()(2), 2.0f * 3.0f * kTensorHeight); } +TEST(ConvertGraphdefMemmappedFormatTest, NotSupportedTypesConvert) { + // Create a graph with strings. + const string dir = testing::TmpDir(); + const string filename_pb = io::JoinPath(dir, "string_graphdef.pb"); + + constexpr int kTensorWidth = 4000; + constexpr int kTensorHeight = 100; + const TensorShape kTestTensorShape({kTensorWidth, kTensorHeight}); + Tensor test_tensor1(DT_STRING, kTestTensorShape); + test::FillFn(&test_tensor1, [](int) -> string { return "ABC"; }); + + Tensor test_tensor2(DT_STRING, kTestTensorShape); + test::FillFn(&test_tensor2, [](int) -> string { return "XYZ"; }); + auto root = Scope::NewRootScope().ExitOnError(); + ops::Output m = ops::Add(root, test_tensor1, test_tensor2); + const string result_name = m.node()->name(); + + GraphDef graph_def; + TF_ASSERT_OK(root.ToGraphDef(&graph_def)); + string graph_def_serialized; + graph_def.SerializeToString(&graph_def_serialized); + TF_ASSERT_OK( + WriteStringToFile(Env::Default(), filename_pb, graph_def_serialized)); + + const string filename_mmap = io::JoinPath(dir, "string_graphdef.mmap"); + 
TF_ASSERT_OK(ConvertConstantsToImmutable(filename_pb, filename_mmap, 1000)); + + // Create and initialize MemmappedEnv from the converted file. + MemmappedEnv memmapped_env(Env::Default()); + TF_ASSERT_OK(memmapped_env.InitializeFromFile(filename_mmap)); + + // Load the graph and run calculations. + SessionOptions session_options; + session_options.env = &memmapped_env; + std::unique_ptr session(NewSession(session_options)); + ASSERT_TRUE(session != nullptr) << "Failed to create session"; + GraphDef loaded_graph_def; + TF_ASSERT_OK(ReadBinaryProto( + &memmapped_env, MemmappedFileSystem::kMemmappedPackageDefaultGraphDef, + &loaded_graph_def)); + ASSERT_FALSE(GraphHasImmutableConstNodes(loaded_graph_def)); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index a2a998cf4dc..1c37921afc3 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -164,6 +164,8 @@ cc_library( "lib/core/threadpool.h", "lib/gtl/array_slice.h", "lib/gtl/cleanup.h", + "lib/gtl/flatmap.h", + "lib/gtl/flatset.h", "lib/gtl/inlined_vector.h", "lib/gtl/priority_queue_util.h", "lib/hash/crc32c.h", @@ -178,7 +180,6 @@ cc_library( "lib/io/table.h", "lib/io/table_builder.h", "lib/io/table_options.h", - "lib/jpeg/jpeg_mem.h", "lib/math/math_util.h", "lib/monitoring/collected_metrics.h", "lib/monitoring/collection_registry.h", @@ -220,6 +221,13 @@ cc_library( ], ) +cc_library( + name = "jpeg", + hdrs = ["lib/jpeg/jpeg_mem.h"], + visibility = ["//visibility:public"], + deps = [":jpeg_internal"], +) + # Test support library needed for all tests # This is currently public, but may be made internal in the # future. Try to avoid depending on it. 
@@ -521,6 +529,7 @@ cc_library( "//tensorflow/core/kernels:control_flow_ops", "//tensorflow/core/kernels:ctc_ops", "//tensorflow/core/kernels:data_flow", + "//tensorflow/core/kernels:fake_quant_ops", "//tensorflow/core/kernels:function_ops", "//tensorflow/core/kernels:image", "//tensorflow/core/kernels:io", @@ -970,6 +979,7 @@ cc_library( ], exclude = [ "**/*test*", + "lib/jpeg/**/*", "platform/**/cuda.h", "platform/**/stream_executor.h", "platform/load_library.cc", @@ -986,6 +996,7 @@ cc_library( ], exclude = [ "**/*test*", + "lib/jpeg/**/*", "platform/**/cuda.h", "platform/**/stream_executor.h", ], @@ -1019,7 +1030,6 @@ cc_library( "lib/io/zlib_compression_options.h", "lib/io/zlib_inputstream.h", "lib/io/zlib_outputbuffer.h", - "lib/jpeg/jpeg_handle.h", "lib/png/png_io.h", "lib/random/random.h", "lib/random/random_distributions.h", @@ -1048,6 +1058,26 @@ cc_library( ], ) +cc_library( + name = "jpeg_internal", + srcs = glob( + [ + "lib/jpeg/*h", + "lib/jpeg/*.cc", + ], + exclude = [ + "**/*test*", + ], + ), + hdrs = ["lib/jpeg/jpeg_handle.h"], + copts = tf_copts(), + linkopts = ["-ldl"], + deps = [ + ":lib", + "//tensorflow/core/platform/default/build_config:jpeg", + ], +) + proto_text_hdrs_and_srcs = tf_generate_proto_text_sources( name = "proto_text_srcs_all", srcs = tf_proto_text_protos_relative(), @@ -1149,83 +1179,6 @@ cc_header_only_library( ], ) -filegroup( - name = "framework_headers", - srcs = [ - "framework/allocator.h", - "framework/attr_value_util.h", - "framework/bfloat16.h", - "framework/cancellation.h", - "framework/control_flow.h", - "framework/device_base.h", - "framework/function.h", - "framework/kernel_def_builder.h", - "framework/node_def_util.h", - "framework/numeric_types.h", - "framework/op.h", - "framework/op_def_builder.h", - "framework/op_def_util.h", - "framework/op_kernel.h", - "framework/partial_tensor_shape.h", - "framework/register_types.h", - "framework/rendezvous.h", - "framework/selective_registration.h", - 
"framework/session_state.h", - "framework/shape_inference.h", - "framework/tensor.h", - "framework/tensor_reference.h", - "framework/tensor_shape.h", - "framework/tensor_types.h", - "framework/tracking_allocator.h", - "framework/type_traits.h", - "framework/types.h", - "framework/unique_tensor_references.h", - "lib/core/errors.h", - "lib/core/notification.h", - "lib/core/refcount.h", - "lib/core/status.h", - "lib/core/stringpiece.h", - "lib/core/threadpool.h", - "lib/gtl/array_slice.h", - "lib/gtl/array_slice_internal.h", - "lib/gtl/inlined_vector.h", - "lib/gtl/manual_constructor.h", - "lib/hash/hash.h", - "lib/strings/numbers.h", - "lib/strings/str_util.h", - "lib/strings/strcat.h", - "platform/cpu_info.h", - "platform/default/dynamic_annotations.h", - "platform/default/integral_types.h", - "platform/default/logging.h", - "platform/default/mutex.h", - "platform/default/notification.h", - "platform/default/protobuf.h", - "platform/default/thread_annotations.h", - "platform/dynamic_annotations.h", - "platform/env.h", - "platform/file_statistics.h", - "platform/file_system.h", - "platform/fingerprint.h", - "platform/logging.h", - "platform/macros.h", - "platform/mem.h", - "platform/mutex.h", - "platform/net.h", - "platform/notification.h", - "platform/platform.h", - "platform/prefetch.h", - "platform/protobuf.h", - "platform/strong_hash.h", - "platform/thread_annotations.h", - "platform/types.h", - "public/session.h", - "public/session_options.h", - "public/version.h", - "util/device_name_utils.h", - ], -) - tf_cuda_library( name = "stream_executor", srcs = tf_additional_stream_executor_srcs(), @@ -1316,7 +1269,7 @@ cc_library( "platform/regexp.h", ], visibility = [ - "//tensorflow/contrib/tfprof:__subpackages__", + "//tensorflow/tools/tfprof:__subpackages__", ], deps = [":lib_internal"], ) @@ -1326,11 +1279,13 @@ tf_cuda_library( srcs = ["common_runtime/direct_session.cc"], hdrs = ["common_runtime/direct_session.h"], copts = tf_copts(), + cuda_deps = [ + 
":gpu_tracer", + ], linkstatic = 1, deps = [ ":core_cpu_internal", ":framework", - ":gpu_tracer", ":lib", ":lib_internal", ":proto_text", @@ -1496,6 +1451,8 @@ tf_cc_tests( "lib/gtl/array_slice_test.cc", "lib/gtl/cleanup_test.cc", "lib/gtl/edit_distance_test.cc", + "lib/gtl/flatmap_test.cc", + "lib/gtl/flatset_test.cc", "lib/gtl/inlined_vector_test.cc", "lib/gtl/int_type_test.cc", "lib/gtl/iterator_range_test.cc", @@ -1582,6 +1539,8 @@ cc_test( srcs = ["lib/jpeg/jpeg_mem_unittest.cc"], data = glob(["lib/jpeg/testdata/*.jpg"]), deps = [ + ":jpeg", + ":jpeg_internal", ":lib", ":lib_internal", ":test", diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc index 59fa09bd8db..35332dfc8cf 100644 --- a/tensorflow/core/common_runtime/direct_session.cc +++ b/tensorflow/core/common_runtime/direct_session.cc @@ -23,7 +23,6 @@ limitations under the License. #include "tensorflow/core/common_runtime/device_factory.h" #include "tensorflow/core/common_runtime/executor.h" #include "tensorflow/core/common_runtime/function.h" -#include "tensorflow/core/common_runtime/gpu/gpu_tracer.h" #include "tensorflow/core/common_runtime/graph_optimizer.h" #include "tensorflow/core/common_runtime/memory_types.h" #include "tensorflow/core/common_runtime/simple_placer.h" @@ -57,6 +56,10 @@ limitations under the License. #include "tensorflow/core/platform/types.h" #include "tensorflow/core/util/device_name_utils.h" +#if GOOGLE_CUDA +#include "tensorflow/core/common_runtime/gpu/gpu_tracer.h" +#endif // GOOGLE_CUDA + namespace tensorflow { namespace { @@ -453,12 +456,14 @@ Status DirectSession::Run(const RunOptions& run_options, args.stats_collector = run_state.collector.get(); } +#if GOOGLE_CUDA std::unique_ptr tracer; if (run_options.trace_level() >= RunOptions::HARDWARE_TRACE) { tracer.reset(CreateGPUTracer()); // tracer will be NULL on non-GPU platforms. 
if (tracer) tracer->Start(); } +#endif // GOOGLE_CUDA for (const auto& item : executors_and_keys->items) { item.executor->RunAsync(args, barrier->Get()); @@ -468,10 +473,12 @@ Status DirectSession::Run(const RunOptions& run_options, ? run_options.timeout_in_ms() : operation_timeout_in_ms_); +#if GOOGLE_CUDA if (tracer) { tracer->Stop(); tracer->Collect(args.stats_collector); } +#endif // GOOGLE_CUDA { mutex_lock l(run_state.mu_); @@ -840,10 +847,11 @@ Status DirectSession::GetOrCreateExecutors( std::vector tn_sorted(target_nodes.begin(), target_nodes.end()); std::sort(tn_sorted.begin(), tn_sorted.end()); - const string key = strings::StrCat(str_util::Join(inputs_sorted, ","), "->", - str_util::Join(outputs_sorted, ","), "/", - str_util::Join(tn_sorted, ","), "/", - run_state_args->is_partial_run); + const string key = strings::StrCat( + str_util::Join(inputs_sorted, ","), "->", + str_util::Join(outputs_sorted, ","), "/", str_util::Join(tn_sorted, ","), + "/", run_state_args->is_partial_run, "/", + SummarizeDebugTensorWatches(run_state_args->debug_tensor_watches)); // Set the handle. 
run_state_args->handle = @@ -938,7 +946,7 @@ Status DirectSession::GetOrCreateExecutors( partition_graph = iter->second.release(); optimizer.Optimize(lib, options_.env, device, &partition_graph); - // EXPERIMENTAL: tfdb inserts debug nodes (i.e., probes) to the graph + // EXPERIMENTAL: tfdbg inserts debug nodes (i.e., probes) to the graph if (!run_state_args->debug_tensor_watches.empty()) { TF_RETURN_IF_ERROR( DebugNodeInserter::InsertNodes(run_state_args->debug_tensor_watches, diff --git a/tensorflow/core/common_runtime/direct_session.h b/tensorflow/core/common_runtime/direct_session.h index a4289112534..0e7203a4d86 100644 --- a/tensorflow/core/common_runtime/direct_session.h +++ b/tensorflow/core/common_runtime/direct_session.h @@ -291,7 +291,7 @@ class DirectSession : public Session { TF_DISALLOW_COPY_AND_ASSIGN(DirectSession); - // EXPERIMENTAL: debugger (tfdb) related + // EXPERIMENTAL: debugger (tfdbg) related friend class DebugGateway; }; diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc index c3cc11abb1b..390809b68a0 100644 --- a/tensorflow/core/common_runtime/executor.cc +++ b/tensorflow/core/common_runtime/executor.cc @@ -222,7 +222,7 @@ typedef gtl::InlinedVector AllocatorAttributeVec; class ExecutorImpl : public Executor { public: ExecutorImpl(const LocalExecutorParams& p, const Graph* g) - : params_(p), graph_(g), initial_pending_counts_(graph_->num_node_ids()) { + : params_(p), graph_(g) { CHECK(p.create_kernel != nullptr); CHECK(p.delete_kernel != nullptr); } @@ -231,6 +231,7 @@ class ExecutorImpl : public Executor { for (int i = 0; i < graph_->num_node_ids(); i++) { params_.delete_kernel(nodes_[i].kernel); } + delete[] frame_local_ids_; delete[] nodes_; delete graph_; } @@ -256,13 +257,39 @@ class ExecutorImpl : public Executor { private: friend class ExecutorState; - static void InitializePending(const Graph* graph, PendingCounts* counts); + struct ControlFlowInfo { + std::unordered_map 
frame_name_to_size; + std::vector frame_names; + }; + + struct FrameInfo { + // The total number of inputs to a frame. + int input_count; + + // The total number of input tensors of a frame. + // == sum(nodes[*].num_inputs()) where nodes are the nodes in the frame. + int total_inputs; + + // Each frame has its own PendingCounts only for the nodes in the frame. + PendingCounts* pending_counts; // Owned + + // The nodes in a frame. Used only for debugging. + std::vector* nodes; // Owned + + ~FrameInfo() { + delete pending_counts; + delete nodes; + } + }; + + static Status BuildControlFlowInfo(const Graph* graph, + ControlFlowInfo* cf_info); + void InitializePending(const Graph* graph, const ControlFlowInfo& cf_info); // Owned. LocalExecutorParams params_; const Graph* graph_; NodeItem* nodes_ = nullptr; // array of size "graph_.num_node_ids()" - int total_input_tensors_ = 0; // == sum(nodes_[*].num_inputs()) int total_output_tensors_ = 0; // == sum(nodes_[*].num_outputs()) // A cached value of params_ @@ -271,14 +298,17 @@ class ExecutorImpl : public Executor { // Root nodes (with no in edges) that should form the initial ready queue std::vector root_nodes_; - PendingCounts initial_pending_counts_; - - // The number of inputs for each frame in this graph. This is static - // information of the graph. - std::unordered_map frame_input_count_; - std::vector output_attrs_; + // Mapping from frame name to static information about the frame. + // TODO(yuanbyu): We could cache it along with the graph so to avoid + // the overhead of constructing it for each executor instance. + std::unordered_map frame_info_; + + // Mapping from a node's id to its index in the PendingCounts of the + // frame the node belongs to. 
+ int* frame_local_ids_ = nullptr; // Owned + TF_DISALLOW_COPY_AND_ASSIGN(ExecutorImpl); }; @@ -287,23 +317,31 @@ Status ExecutorImpl::Initialize() { delete[] nodes_; nodes_ = new NodeItem[num_nodes]; - Status s; - total_input_tensors_ = 0; total_output_tensors_ = 0; - InitializePending(graph_, &initial_pending_counts_); + // Build the information about frames in this subgraph. + ControlFlowInfo cf_info; + BuildControlFlowInfo(graph_, &cf_info); // Cache this value so we make this virtual function call once, rather // that O(# steps * # nodes per step) times. device_record_tensor_accesses_ = params_.device->RequiresRecordingAccessedTensors(); + for (auto& it : cf_info.frame_name_to_size) { + frame_info_[it.first].nodes = new std::vector; + } + frame_local_ids_ = new int[num_nodes]; + std::unordered_map frame_count; + // Preprocess every node in the graph to create an instance of op - // kernel for each node; + // kernel for each node. for (const Node* n : graph_->nodes()) { const int id = n->id(); + const string& frame_name = cf_info.frame_names[id]; + FrameInfo& frame_info = frame_info_[frame_name]; - // See if this node is a root node, and if so, add to root_nodes_ + // See if this node is a root node, and if so, add to root_nodes_. const int num_in_edges = n->in_edges().size(); if (num_in_edges == 0) { root_nodes_.push_back(n); @@ -321,18 +359,18 @@ Status ExecutorImpl::Initialize() { item->inlined_output_type[i] = n->output_type(i); } - item->input_start = total_input_tensors_; - total_input_tensors_ += n->num_inputs(); + item->input_start = frame_info.total_inputs; + frame_info.total_inputs += n->num_inputs(); item->output_attr_start = total_output_tensors_; total_output_tensors_ += n->num_outputs(); - s = params_.create_kernel(n->def(), &item->kernel); + Status s = params_.create_kernel(n->def(), &item->kernel); if (!s.ok()) { item->kernel = nullptr; s = AttachDef(s, n->def()); LOG(ERROR) << "Executor failed to create kernel. 
" << s; - break; + return s; } CHECK(item->kernel); item->kernel_is_expensive = item->kernel->IsExpensive(); @@ -340,14 +378,18 @@ Status ExecutorImpl::Initialize() { item->is_merge = IsMerge(n); // Initialize static information about the frames in the graph. + frame_local_ids_[id] = frame_count[frame_name]++; + frame_info.nodes->push_back(n); if (IsEnter(n)) { - string frame_name; - s = GetNodeAttr(n->def(), "frame_name", &frame_name); - if (!s.ok()) return s; - ++frame_input_count_[frame_name]; + string enter_name; + TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), "frame_name", &enter_name)); + ++frame_info_[enter_name].input_count; } } - if (!s.ok()) return s; + + // Initialize PendingCounts only after frame_local_ids_ is initialized. + InitializePending(graph_, cf_info); + return SetAllocAttrs(); } @@ -533,12 +575,13 @@ class ExecutorState { typedef gtl::InlinedVector EntryVector; struct IterationState { - explicit IterationState(const ExecutorImpl* impl) - : input_tensors(new Entry[impl->total_input_tensors_]), + explicit IterationState(const PendingCounts* pending_counts, + int total_input_tensors) + : input_tensors(new Entry[total_input_tensors]), outstanding_ops(0), outstanding_frame_count(0), - counts_(impl->graph_->num_node_ids()) { - counts_.InitializeFrom(impl->initial_pending_counts_); + counts_(pending_counts->num_nodes()) { + counts_.InitializeFrom(*pending_counts); } // The state of an iteration. @@ -668,9 +711,23 @@ class ExecutorState { // will only "execute" the dead exits of the final iteration. std::vector dead_exits GUARDED_BY(mu); + // Static information specific to this frame. + PendingCounts* pending_counts = nullptr; + int total_input_tensors = 0; + std::vector* nodes = nullptr; + // Lock ordering: ExecutorState.mu_ < mu. 
mutex mu; + void InitializeFrameInfo(const string& enter_name) { + auto it_frame_info = executor->frame_info_.find(enter_name); + DCHECK(it_frame_info != executor->frame_info_.end()); + pending_counts = it_frame_info->second.pending_counts; + total_input_tensors = it_frame_info->second.total_inputs; + num_pending_inputs = it_frame_info->second.input_count; + nodes = it_frame_info->second.nodes; + } + inline IterationState* GetIteration(int64 iter) EXCLUSIVE_LOCKS_REQUIRED(mu) { int index = iter % iterations.size(); @@ -889,13 +946,12 @@ class ExecutorState { inline void MaybeMarkCompleted(FrameState* frame, int64 iter, int64 id); // Provide debugging output about an outstanding node in the executor. - void DumpCompletedNodeState(const int node_id, const Entry* input_vector); void DumpPendingNodeState(const int node_id, const Entry* input_vector, bool show_nodes_with_no_ready_inputs); void DumpActiveNodeState(const int node_id, const Entry* input_vector); // Provide debugging output about an outstanding iteration in the executor. - void DumpIterationState(IterationState* iteration); + void DumpIterationState(const FrameState* frame, IterationState* iteration); // Provide debugging output of the state of the executor. void DumpState(); @@ -932,16 +988,16 @@ ExecutorState::ExecutorState(const Executor::Args& args, ExecutorImpl* impl) num_outstanding_ops_(0) { // We start the entire execution in iteration 0 of the root frame // so let us create the root frame and the state for iteration 0. - // Initialize the frame. + // We assume root_frame_->frame_name.empty(). root_frame_ = new FrameState(impl_, 1); - root_frame_->frame_name = "_root"; // assume to be unique root_frame_->frame_id = 0; // must be 0 - // Initialize the first iteration. 
- root_frame_->iterations.resize(root_frame_->max_parallel_iterations); - IterationState* iter_state = new IterationState(impl); - root_frame_->iterations[0] = iter_state; + root_frame_->InitializeFrameInfo(root_frame_->frame_name); + + // Initialize iteration 0. + root_frame_->iterations.resize(root_frame_->max_parallel_iterations); + root_frame_->iterations[0] = new IterationState( + root_frame_->pending_counts, root_frame_->total_input_tensors); - if (vlog_) VLOG(2) << "Create frame: " << root_frame_->frame_name; outstanding_frames_.insert({root_frame_->frame_name, root_frame_}); } @@ -949,21 +1005,88 @@ ExecutorState::~ExecutorState() { for (auto name_frame : outstanding_frames_) { delete name_frame.second; } - for (auto it : device_context_map_) { it->Unref(); } - delete slice_reader_cache_; } +Status ExecutorImpl::BuildControlFlowInfo(const Graph* g, + ControlFlowInfo* cf_info) { + const int num_nodes = g->num_node_ids(); + cf_info->frame_names.resize(num_nodes); + std::vector parent_nodes; + parent_nodes.resize(num_nodes); + std::vector visited; + visited.resize(num_nodes); + + string frame_name; + std::deque ready; + + // Initialize with the root nodes. + for (Node* n : g->nodes()) { + if (n->in_edges().empty()) { + visited[n->id()] = true; + ++cf_info->frame_name_to_size[frame_name]; + ready.push_back(n); + } + } + + while (!ready.empty()) { + Node* curr_node = ready.front(); + int curr_id = curr_node->id(); + ready.pop_front(); + + Node* parent = nullptr; + if (IsEnter(curr_node)) { + // Enter a child frame. + TF_RETURN_IF_ERROR( + GetNodeAttr(curr_node->def(), "frame_name", &frame_name)); + parent = curr_node; + } else if (IsExit(curr_node)) { + // Exit to the parent frame. 
+ parent = parent_nodes[curr_id]; + frame_name = cf_info->frame_names[parent->id()]; + parent = parent_nodes[parent->id()]; + } else { + parent = parent_nodes[curr_id]; + frame_name = cf_info->frame_names[curr_id]; + } + + for (const Edge* out_edge : curr_node->out_edges()) { + Node* out = out_edge->dst(); + int out_id = out->id(); + + // Add to ready queue if not visited. + bool is_visited = visited[out_id]; + if (!is_visited) { + ready.push_back(out); + visited[out_id] = true; + + // Process the node 'out'. + cf_info->frame_names[out_id] = frame_name; + parent_nodes[out_id] = parent; + ++cf_info->frame_name_to_size[frame_name]; + } + } + } + + return Status::OK(); +} + void ExecutorImpl::InitializePending(const Graph* graph, - PendingCounts* counts) { - for (int id = 0; id < graph->num_node_ids(); id++) { - counts->set_initial_count(id, 0, 0); // Make sure everything is initialized + const ControlFlowInfo& cf_info) { + for (auto& it : cf_info.frame_name_to_size) { + PendingCounts* counts = new PendingCounts(it.second); + frame_info_[it.first].pending_counts = counts; + // Make sure everything is initialized + for (int id = 0; id < it.second; id++) { + counts->set_initial_count(id, 0, 0); + } } for (const Node* n : graph->nodes()) { const int id = n->id(); + const int pending_id = frame_local_ids_[id]; const int num_in_edges = n->in_edges().size(); int initial_count; if (IsMerge(n)) { @@ -980,7 +1103,9 @@ void ExecutorImpl::InitializePending(const Graph* graph, } else { initial_count = num_in_edges; } - counts->set_initial_count(id, initial_count, num_in_edges); + const string& name = cf_info.frame_names[id]; + PendingCounts* counts = frame_info_[name].pending_counts; + counts->set_initial_count(pending_id, initial_count, num_in_edges); } } @@ -1104,8 +1229,9 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_usec) { // TODO(misard) Replace with a finer-grain enabling flag once we // add better optional debugging support. 
if (vlog_ && VLOG_IS_ON(1)) { + int pending_id = impl_->frame_local_ids_[id]; mutex_lock l(input_frame->mu); - input_frame->GetIteration(input_iter)->mark_started(id); + input_frame->GetIteration(input_iter)->mark_started(pending_id); } // Set the device_context for this node id, if it exists. @@ -1637,12 +1763,13 @@ void ExecutorState::ScheduleReady(const TaggedNodeSeq& ready, } inline void ExecutorState::MaybeMarkCompleted(FrameState* frame, int64 iter, - int64 id) { + int64 node_id) { // TODO(misard) Replace with a finer-grain enabling flag once we // add better optional debugging support. if (vlog_ && VLOG_IS_ON(1)) { + int pending_id = impl_->frame_local_ids_[node_id]; mutex_lock l(frame->mu); - frame->GetIteration(iter)->mark_completed(id); + frame->GetIteration(iter)->mark_completed(pending_id); } } @@ -1656,18 +1783,6 @@ const Tensor* ExecutorState::GetTensorValueForDump(const Entry& input) { } } -void ExecutorState::DumpCompletedNodeState(const int node_id, - const Entry* input_vector) { - const NodeItem& node_item = impl_->nodes_[node_id]; - const Node& node = *node_item.node; - LOG(WARNING) << " Completed Node: " << node.DebugString(); - const int input_base = node_item.input_start; - for (int i = 0; i < node.num_inputs(); ++i) { - const Entry& input = input_vector[input_base + i]; - CHECK(!GetTensorValueForDump(input)->IsInitialized()); - } -} - void ExecutorState::DumpPendingNodeState( const int node_id, const Entry* input_vector, const bool show_nodes_with_no_ready_inputs) { @@ -1723,23 +1838,30 @@ void ExecutorState::DumpActiveNodeState(const int node_id, } } -void ExecutorState::DumpIterationState(IterationState* iteration) { +void ExecutorState::DumpIterationState(const FrameState* frame, + IterationState* iteration) { + const std::vector* nodes = frame->nodes; // Dump any waiting nodes that are holding on to tensors. 
- for (int i = 0; i < impl_->graph_->num_node_ids(); ++i) { - if (iteration->node_state(i) == PendingCounts::PENDING_NOTREADY || - iteration->node_state(i) == PendingCounts::PENDING_READY) { - DumpPendingNodeState(i, iteration->input_tensors, false); + for (const Node* node : *nodes) { + int node_id = node->id(); + int pending_id = impl_->frame_local_ids_[node_id]; + if (iteration->node_state(pending_id) == PendingCounts::PENDING_NOTREADY || + iteration->node_state(pending_id) == PendingCounts::PENDING_READY) { + DumpPendingNodeState(node_id, iteration->input_tensors, false); } } // Then the active nodes. - for (int i = 0; i < impl_->graph_->num_node_ids(); ++i) { - if (iteration->node_state(i) == PendingCounts::STARTED) { - DumpActiveNodeState(i, iteration->input_tensors); + for (const Node* node : *nodes) { + int node_id = node->id(); + int pending_id = impl_->frame_local_ids_[node_id]; + if (iteration->node_state(pending_id) == PendingCounts::STARTED) { + DumpActiveNodeState(node_id, iteration->input_tensors); } } // Show all input tensors in use. + int total_input_tensors = frame->total_input_tensors; size_t total_bytes = 0; - for (int i = 0; i < impl_->total_input_tensors_; ++i) { + for (int i = 0; i < total_input_tensors; ++i) { const Entry& input = iteration->input_tensors[i]; const Tensor* tensor = GetTensorValueForDump(input); if (tensor->IsInitialized()) { @@ -1764,7 +1886,7 @@ void ExecutorState::DumpState() { mutex_lock frame_lock(frame_state->mu); for (IterationState* iteration : frame_state->iterations) { LOG(WARNING) << " Iteration:"; - DumpIterationState(iteration); + DumpIterationState(frame_state, iteration); } } dumped_on_error_ = true; @@ -1819,16 +1941,13 @@ void ExecutorState::FindOrCreateChildFrame(FrameState* frame, int64 iter, temp->frame_id = Hash64(child_name); temp->parent_frame = frame; temp->parent_iter = iter; + temp->InitializeFrameInfo(enter_name); // 'iterations' is a fixed-length circular buffer. 
temp->iterations.resize(temp->max_parallel_iterations + 1); - // Initialize the first iteration. - IterationState* iter_state = new IterationState(impl_); - temp->iterations[0] = iter_state; - - auto frame_pending = impl_->frame_input_count_.find(enter_name); - DCHECK(frame_pending != impl_->frame_input_count_.end()); - temp->num_pending_inputs = frame_pending->second; + // Initialize iteration 0. + temp->iterations[0] = + new IterationState(temp->pending_counts, temp->total_input_tensors); { mutex_lock executor_lock(mu_); @@ -1851,33 +1970,40 @@ void ExecutorState::DeleteFrame(FrameState* frame, TaggedNodeSeq* ready) { FrameState* parent_frame = frame->parent_frame; int64 parent_iter = frame->parent_iter; if (parent_frame != nullptr) { + const int* pending_ids = impl_->frame_local_ids_; mutex_lock paranet_frame_lock(parent_frame->mu); // Propagate all the dead exits to the parent frame. for (const Node* node : frame->dead_exits) { auto parent_iter_state = parent_frame->GetIteration(parent_iter); for (const Edge* e : node->out_edges()) { const Node* dst_node = e->dst(); - const int dst_id = dst_node->id(); + const int dst_pending_id = pending_ids[dst_node->id()]; + + // TODO(yuanbyu): We don't need this if we require the subgraph + // given to an executor not to contain a sink node. + if (dst_node->IsSink()) continue; bool dst_dead = true; bool dst_ready = false; // We know this is a dead input to dst. 
if (IsMerge(dst_node)) { if (e->IsControlEdge()) { - parent_iter_state->decrement_pending(dst_id, 2); - int count = parent_iter_state->pending(dst_id); - dst_dead = (parent_iter_state->dead_count(dst_id) == - dst_node->num_inputs()); + parent_iter_state->decrement_pending(dst_pending_id, 2); + int count = parent_iter_state->pending(dst_pending_id); + int dead_cnt = parent_iter_state->dead_count(dst_pending_id); + dst_dead = (dead_cnt == dst_node->num_inputs()); dst_ready = (count == 0) || ((count == 1) && dst_dead); } else { - parent_iter_state->increment_dead_count(dst_id); - const int dead_cnt = parent_iter_state->dead_count(dst_id); + parent_iter_state->increment_dead_count(dst_pending_id); + const int dead_cnt = parent_iter_state->dead_count(dst_pending_id); dst_dead = (dead_cnt == dst_node->num_inputs()); - dst_ready = (parent_iter_state->pending(dst_id) == 1) && dst_dead; + dst_ready = + (parent_iter_state->pending(dst_pending_id) == 1) && dst_dead; } } else { - parent_iter_state->increment_dead_count(dst_id); - dst_ready = (parent_iter_state->decrement_pending(dst_id, 1) == 0); + parent_iter_state->increment_dead_count(dst_pending_id); + dst_ready = + (parent_iter_state->decrement_pending(dst_pending_id, 1) == 0); } if (dst_ready) { ready->push_back( @@ -1923,12 +2049,18 @@ void ExecutorState::FrameState::ActivateNodes(const Node* node, const EntryVector& outputs, TaggedNodeSeq* ready) { const NodeItem* nodes = executor->nodes_; + const int* pending_ids = executor->frame_local_ids_; IterationState* iter_state = GetIteration(iter); for (const Edge* e : node->out_edges()) { const Node* dst_node = e->dst(); const int dst_id = dst_node->id(); + const int dst_pending_id = pending_ids[dst_id]; const int src_slot = e->src_output(); + // TODO(yuanbyu): We don't need this if we require the subgraph + // given to an executor not to contain a sink node. 
+ if (dst_node->IsSink()) continue; + bool dst_dead = false; bool dst_ready = false; // True iff this input for dst is needed. We only set this input for @@ -1940,15 +2072,16 @@ void ExecutorState::FrameState::ActivateNodes(const Node* node, // a) a live data input becomes available or b) all data inputs are dead. // For Merge, pending's LSB is set iff a live data input has arrived. if (e->IsControlEdge()) { - iter_state->decrement_pending(dst_id, 2); - int count = iter_state->pending(dst_id); - dst_dead = (iter_state->dead_count(dst_id) == dst_node->num_inputs()); + iter_state->decrement_pending(dst_pending_id, 2); + int count = iter_state->pending(dst_pending_id); + int dead_cnt = iter_state->dead_count(dst_pending_id); + dst_dead = (dead_cnt == dst_node->num_inputs()); dst_ready = (count == 0) || ((count == 1) && dst_dead); } else { if (outputs[src_slot].has_value) { // This is a live data input. - int count = iter_state->pending(dst_id); - iter_state->mark_live(dst_id); + int count = iter_state->pending(dst_pending_id); + iter_state->mark_live(dst_pending_id); // Only the first live edge sets the input and (potentially) // triggers execution. The low bit of count is set if and // only if no live input has been used yet (mark_live clears @@ -1962,10 +2095,10 @@ void ExecutorState::FrameState::ActivateNodes(const Node* node, // a dead enter. We need this to handle properly a while loop on // the untaken branch of a conditional. // TODO(yuanbyu): This is a bit hacky, but a good solution for now. 
- iter_state->increment_dead_count(dst_id); - const int dead_cnt = iter_state->dead_count(dst_id); + iter_state->increment_dead_count(dst_pending_id); + const int dead_cnt = iter_state->dead_count(dst_pending_id); dst_dead = (dead_cnt == dst_node->num_inputs()) || IsEnter(node); - dst_ready = (iter_state->pending(dst_id) == 1) && dst_dead; + dst_ready = (iter_state->pending(dst_pending_id) == 1) && dst_dead; dst_need_input = false; } } @@ -1974,10 +2107,10 @@ void ExecutorState::FrameState::ActivateNodes(const Node* node, // for all inputs to come in even if we know the node is dead. This // ensures that all input tensors get cleaned up. if (is_dead || (!e->IsControlEdge() && !outputs[src_slot].has_value)) { - iter_state->increment_dead_count(dst_id); + iter_state->increment_dead_count(dst_pending_id); } - dst_dead = iter_state->dead_count(dst_id) > 0; - dst_ready = (iter_state->decrement_pending(dst_id, 1) == 0); + dst_dead = iter_state->dead_count(dst_pending_id) > 0; + dst_ready = (iter_state->decrement_pending(dst_pending_id, 1) == 0); } if (dst_need_input) { @@ -2052,7 +2185,8 @@ void ExecutorState::FrameState::IncrementIteration(TaggedNodeSeq* ready) { int64 next_iter = iteration_count; // Initialize the next iteration. 
- IterationState* iter_state = new IterationState(executor); + IterationState* iter_state = + new IterationState(pending_counts, total_input_tensors); SetIteration(next_iter, iter_state); num_outstanding_iterations++; dead_exits.clear(); diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc index 89c870253f2..c868083efda 100644 --- a/tensorflow/core/common_runtime/function.cc +++ b/tensorflow/core/common_runtime/function.cc @@ -44,11 +44,7 @@ static const char* const kRetOp = "_Retval"; static const char* const kGradientOp = "SymbolicGradient"; static const char* const kNodeLabel = "Func"; static const char* const kFuncAttr = "f"; -// kNoinlineAttr must start with an "_" to avoid collisions with -// user-specified attrs. -static const char* const kNoinlineAttr = "_noinline"; -// Old graphs use no "_". -static const char* const kOldNoinlineAttr = "noinline"; +static const char* const kNoInlineAttr = "_noinline"; // Represents the index-th output of a node. 
struct Endpoint { @@ -168,6 +164,7 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime { Device* device() override { return device_; } Env* env() override { return env_; } + int graph_def_version() override { return graph_def_version_; } string DebugString(Handle h) override; @@ -290,6 +287,34 @@ const FunctionBody* FunctionLibraryRuntimeImpl::GetFunctionBody(Handle h) { return func_graphs_[h]; } +namespace { + +struct CustomCreatorSingleton { + mutex mu; + CustomKernelCreator custom_creator = nullptr; + + void Set(CustomKernelCreator cb) { + mutex_lock l(mu); + custom_creator = cb; + } + + CustomKernelCreator Get() { + mutex_lock l(mu); + return custom_creator; + } +}; + +CustomCreatorSingleton* GetCustomCreatorSingleton() { + static CustomCreatorSingleton* ccs = new CustomCreatorSingleton; + return ccs; +} + +} // end namespace + +void RegisterCustomKernelCreator(CustomKernelCreator cb) { + GetCustomCreatorSingleton()->Set(cb); +} + Status FunctionLibraryRuntimeImpl::CreateKernel(const NodeDef& ndef, OpKernel** kernel) { if (lib_def_->Find(ndef.op()) == nullptr) { @@ -318,8 +343,23 @@ Status FunctionLibraryRuntimeImpl::CreateKernel(const NodeDef& ndef, output_memory_types.push_back(t == DT_INT32 ? HOST_MEMORY : DEVICE_MEMORY); } - // Constructs a CallOp kernel for running the instantiated function. + // If a custom kernel creator is given, try that. + CustomKernelCreator custom_creator = GetCustomCreatorSingleton()->Get(); Status s; + if (custom_creator) { + std::unique_ptr ret; + s = custom_creator(this, ndef, &ret); + if (s.ok()) { + *kernel = ret.release(); + return s; + } else { + VLOG(2) << "Custom creator error: " << s; + // Falls through. + s = Status::OK(); + } + } + + // Constructs a CallOp kernel for running the instantiated function. 
auto device_type = DeviceType(device_->attributes().device_type()); OpKernelConstruction construction( device_type, device_, device_->GetAllocator(AllocatorAttributes()), &ndef, @@ -327,7 +367,7 @@ Status FunctionLibraryRuntimeImpl::CreateKernel(const NodeDef& ndef, fbody->ret_types, output_memory_types, graph_def_version_, &s); *kernel = new CallOp(handle, &construction); if (!s.ok()) { - delete kernel; + delete *kernel; } return s; } @@ -887,15 +927,11 @@ static void InlineFunctionBody(Graph* g, Node* caller, } // Given a node's NodeDef, returns false iff the node explicitly -// specified _noinline. This gives ExpandInlineFunctions a heuristic to -// decide whether to inline the function. -// `old` is true for GraphDef versions older than 12, when the -// `noinline` attr was renamed to `_noinline` to avoid conflicts with -// user-specified attrs. -bool ShouldInline(const NodeDef& ndef, bool old) { +// specified _noinline. This gives ExpandInlineFunctions a heuristic +// to decide whether to inline the function. +bool ShouldInline(const NodeDef& ndef) { bool noinline = false; - const char* const attr = old ? kOldNoinlineAttr : kNoinlineAttr; - if (GetNodeAttr(ndef, attr, &noinline).ok()) { + if (GetNodeAttr(ndef, kNoInlineAttr, &noinline).ok()) { // If the node specifies attribute '_noinline', returns accordingly. return !noinline; } @@ -914,7 +950,8 @@ bool ShouldInline(const NodeDef& ndef, bool old) { // continue and the runtime will error out. return false; } - s = GetNodeAttr(AttrSlice(&forward_func_attrs->attr()), attr, &noinline); + s = GetNodeAttr(AttrSlice(&forward_func_attrs->attr()), kNoInlineAttr, + &noinline); if (!s.ok()) { // The forward function doesn't specify '_noinline' attr, we should // be free to decide. 
@@ -926,11 +963,9 @@ bool ShouldInline(const NodeDef& ndef, bool old) { bool ExpandInlineFunctions(FunctionLibraryRuntime* lib, Graph* graph) { std::vector> candidates; - // Identify old graphs before the 'noinline' attr was renamed '_noinline'. - const bool old_inline_attr = graph->versions().producer() < 12; for (Node* node : graph->nodes()) { VLOG(3) << "Expanding " << node->DebugString(); - if (!ShouldInline(node->def(), old_inline_attr)) { + if (!ShouldInline(node->def())) { VLOG(3) << "noinline: " << node->DebugString(); continue; } diff --git a/tensorflow/core/common_runtime/function.h b/tensorflow/core/common_runtime/function.h index 196226214ba..73e99442388 100644 --- a/tensorflow/core/common_runtime/function.h +++ b/tensorflow/core/common_runtime/function.h @@ -123,6 +123,18 @@ void ToGraphDef(const Graph* g, GraphDef* gdef, bool pretty = false); // TODO(zhifengc): Asks math expert to say the comment again. FunctionBody* SymbolicGradient(const FunctionBody& f); +// Registers a customizable kernel creator for a function call. +// +// If 'cb()' returns a non-OK, we still fall back to an executor-based +// interpreter op kernel to execute a function. If 'cb()' returns OK, +// takes ownership of the returned OpKernel. 
+// +// TODO(zhifengc/phawkins): b/32379046 +typedef std::function<Status(FunctionLibraryRuntime*, const NodeDef&, + std::unique_ptr<OpKernel>*)> + CustomKernelCreator; +void RegisterCustomKernelCreator(CustomKernelCreator cb); + } // end namespace tensorflow #endif // TENSORFLOW_COMMON_RUNTIME_FUNCTION_H_ diff --git a/tensorflow/core/common_runtime/pending_counts.h b/tensorflow/core/common_runtime/pending_counts.h index be2dc2418ed..cfc40324710 100644 --- a/tensorflow/core/common_runtime/pending_counts.h +++ b/tensorflow/core/common_runtime/pending_counts.h @@ -71,6 +71,7 @@ class PendingCounts { } } + inline int num_nodes() const { return num_nodes_; } NodeState node_state(int id) { if (IsLarge(id)) { return NodeStateLarge(id); @@ -185,12 +186,7 @@ class PendingCounts { // use one byte to hold both the pending and dead count for a node // where these together can fit in one byte, and we use a hash table // to handle the rare node ids that need larger counts than this. - - // TODO(yuanbyu): We current use O(# of nodes in partition) space - // even for nested iterations where only a small fraction of the - // nodes are involved. This is not efficient if the subgraph for - // the frame is only a small subset of the partition. We should make - // the vector size to be only the size of the frame subgraph. + // Each frame in this subgraph has its own PendingCounts. // We use 3 bits each for dead_count and pending. static const int kMaxCountForPackedCounts = 7; diff --git a/tensorflow/core/common_runtime/shape_refiner.cc b/tensorflow/core/common_runtime/shape_refiner.cc index 4752be41ff1..1ddd4830761 100644 --- a/tensorflow/core/common_runtime/shape_refiner.cc +++ b/tensorflow/core/common_runtime/shape_refiner.cc @@ -27,6 +27,10 @@ limitations under the License. 
namespace tensorflow { +using shape_inference::DimensionHandle; +using shape_inference::InferenceContext; +using shape_inference::ShapeHandle; + ShapeRefiner::ShapeRefiner(const OpRegistryInterface* ops) : ops_registry_(ops) {} @@ -37,7 +41,7 @@ Status ShapeRefiner::AddNode(const Node* node) { // from 'input's InferenceContext, and store into a vector // indexed by 'node's input. std::vector input_nodes(node->num_inputs()); - std::vector input_shapes(node->num_inputs()); + std::vector input_shapes(node->num_inputs()); for (const Edge* e : node->in_edges()) { if (e->IsControlEdge()) continue; @@ -49,7 +53,7 @@ Status ShapeRefiner::AddNode(const Node* node) { node->name(), "' was not previously added to ShapeRefiner."); } - shape_inference::InferenceContext* c = it->second; + InferenceContext* c = it->second; DCHECK_GE(e->dst_input(), 0); input_nodes[e->dst_input()] = input; input_shapes[e->dst_input()] = c->output(e->src_output()); @@ -68,11 +72,13 @@ Status ShapeRefiner::AddNode(const Node* node) { std::vector input_tensors(node->num_inputs()); std::vector real_tensors(node->num_inputs()); std::vector attempted_materialization(node->num_inputs()); + std::vector attempted_tensor_as_shape_conversion(node->num_inputs()); + std::vector input_tensors_as_shapes; // Create the inference context for this node with the existing input shapes. - std::unique_ptr c( - new shape_inference::InferenceContext(&node->def(), node->op_def(), - input_shapes, input_tensors)); + std::unique_ptr c( + new InferenceContext(&node->def(), node->op_def(), input_shapes, + input_tensors, input_tensors_as_shapes)); if (!c->construction_status().ok()) { return c->construction_status(); } @@ -101,63 +107,44 @@ Status ShapeRefiner::AddNode(const Node* node) { // subgraph once. for (int i = 0; i < c->num_inputs(); ++i) { + if (!c->requested_input_tensor(i)) { + continue; + } // Check if we have not already filled in the requested input, // and if not, try to materialize the tensors. 
- if (c->requested_input_tensor(i) && !attempted_materialization[i]) { + if (!attempted_materialization[i]) { attempted_materialization[i] = true; - const Edge* input_edge; - TF_RETURN_IF_ERROR(node->input_edge(i, &input_edge)); - - bool is_constant_graph = false; - Graph subgraph(ops_registry_); - - // We identify the possibly constant subgraph to evaluate by - // recursively iterating backwards through the inputs to 'node' - // until we either 1) find an already existing input to our subgraph - // (filled in `const_inputs`), 2) Discover our graph is not constant, - // or 3) Hit a root node. - std::vector> const_inputs; - TF_RETURN_IF_ERROR(ExtractConstantSubgraph( - input_nodes[i], &subgraph, &is_constant_graph, &const_inputs)); - if (is_constant_graph) { - const string output_tensor_name = strings::StrCat( - input_nodes[i]->name(), ":", input_edge->src_output()); - std::vector outputs; - // NOTE; we should pass in a function library runtime if we want - // to support constant-expression evaluation on functions. - Status s = GraphRunner::Run(&subgraph, nullptr /* function_library */, - Env::Default(), const_inputs, - {output_tensor_name}, &outputs); - - // If all kernels in the constant graph are not registered - // in the process, GraphRunner::Run may fail, in which case - // we cannot propagate constants, so this is best-effort. - if (s.ok()) { - real_tensors[i] = outputs[0]; - input_tensors[i] = &real_tensors[i]; - - // We have more concrete information about a shape, - // so re-run shape inference. - rerun_shape_fn = true; - - // We memoize (small) constants evaluated so far, so - // ExtractConstantSubgraph can avoid extracting the full - // subgraph. As we build up large graphs, this avoids - // repeated computation of the early parts of a constant - // graph. 
- if (outputs[0].TotalBytes() <= kMaxTensorSize) { - const_tensor_map_[output_tensor_name] = outputs[0]; - } - } + Tensor result; + bool evaluated = false; + TF_RETURN_IF_ERROR( + EvaluateConstantTensorForEdge(node, i, &evaluated, &result)); + if (evaluated) { + real_tensors[i] = result; + input_tensors[i] = &real_tensors[i]; + // We have more concrete information about a shape, + // so re-run shape inference. + rerun_shape_fn = true; } } + if (c->requested_input_tensor_as_partial_shape(i) && + !attempted_tensor_as_shape_conversion[i]) { + attempted_tensor_as_shape_conversion[i] = true; + if (i >= input_tensors_as_shapes.size()) { + input_tensors_as_shapes.resize(i + 1); + } + ShapeHandle s; + TF_RETURN_IF_ERROR(ConstantPartialShape(c.get(), node, i, &s)); + input_tensors_as_shapes[i] = s; + rerun_shape_fn = true; + } } if (rerun_shape_fn) { // We have more information about the shapes on this pass, // so re-run shape inference. c->set_input_tensors(input_tensors); + c->set_input_tensors_as_shapes(input_tensors_as_shapes); TF_RETURN_IF_ERROR(op_reg_data->shape_inference_fn(c.get())); } } while (rerun_shape_fn); @@ -169,7 +156,7 @@ Status ShapeRefiner::AddNode(const Node* node) { } Status ShapeRefiner::SetShape(const Node* node, int output_port, - shape_inference::ShapeHandle shape) { + ShapeHandle shape) { auto c = GetContext(node); if (c == nullptr) { return errors::Internal("Could not find context for ", node->name()); @@ -182,7 +169,7 @@ Status ShapeRefiner::SetShape(const Node* node, int output_port, } // Check compatibility, and merge the shapes. 
- shape_inference::ShapeHandle existing_shape = c->output(output_port); + ShapeHandle existing_shape = c->output(output_port); TF_RETURN_IF_ERROR(c->Merge(existing_shape, shape, &shape)); c->set_output(output_port, shape); @@ -196,6 +183,55 @@ Status ShapeRefiner::SetShape(const Node* node, int output_port, return Status::OK(); } +Status ShapeRefiner::EvaluateConstantTensorForEdge(const Node* node, + int dst_idx, bool* evaluated, + Tensor* result) { + *evaluated = false; + const Edge* input_edge; + TF_RETURN_IF_ERROR(node->input_edge(dst_idx, &input_edge)); + + bool is_constant_graph = false; + Graph subgraph(ops_registry_); + + // We identify the possibly constant subgraph to evaluate by + // recursively iterating backwards through the inputs to 'node' + // until we either 1) find an already existing input to our subgraph + // (filled in `const_inputs`), 2) Discover our graph is not constant, + // or 3) Hit a root node. + std::vector> const_inputs; + TF_RETURN_IF_ERROR(ExtractConstantSubgraph( + input_edge->src(), &subgraph, &is_constant_graph, &const_inputs)); + if (!is_constant_graph) { + return Status::OK(); + } + const string output_tensor_name = + strings::StrCat(input_edge->src()->name(), ":", input_edge->src_output()); + std::vector outputs; + // NOTE; we should pass in a function library runtime if we want + // to support constant-expression evaluation on functions. + Status s = GraphRunner::Run(&subgraph, nullptr /* function_library */, + Env::Default(), const_inputs, + {output_tensor_name}, &outputs); + + // If all kernels in the constant graph are not registered + // in the process, GraphRunner::Run may fail, in which case + // we cannot propagate constants, so this is best-effort. + if (s.ok()) { + *result = outputs[0]; + *evaluated = true; + + // We memoize (small) constants evaluated so far, so + // ExtractConstantSubgraph can avoid extracting the full + // subgraph. 
As we build up large graphs, this avoids + // repeated computation of the early parts of a constant + // graph. + if (outputs[0].TotalBytes() <= kMaxTensorSize) { + const_tensor_map_[output_tensor_name] = outputs[0]; + } + } + return Status::OK(); +} + Status ShapeRefiner::ExtractConstantSubgraph( Node* target_node, Graph* out_graph, bool* is_constant_graph, std::vector>* const_inputs) { @@ -308,4 +344,75 @@ Status ShapeRefiner::ExtractConstantSubgraph( return Status::OK(); } +Status ShapeRefiner::ConstantPartialShape(InferenceContext* target_context, + const Node* node, int dst_idx, + ShapeHandle* result) { + const Edge* input_edge; + TF_RETURN_IF_ERROR(node->input_edge(dst_idx, &input_edge)); + + InferenceContext* src_context = GetContext(input_edge->src()); + if (src_context == nullptr) return errors::Internal("Missing src context"); + ShapeHandle src_shape = src_context->output(input_edge->src_output()); + TF_RETURN_IF_ERROR(src_context->WithRank(src_shape, 1, &src_shape)); + + const string& src_op = input_edge->src()->type_string(); + if (src_context->Value(src_context->Dim(src_shape, 0)) == 0) { + // Source tensor is a vector of length 0, so the shape it + // represents is as scalar. + *result = target_context->Scalar(); + } else if (src_op == "Shape") { + *result = src_context->input(0); + } else if (src_op == "Pack") { + std::vector dims; + // Pack is concatenating its input scalars to form the shape tensor vector. + for (int i = 0; i < src_context->num_inputs(); ++i) { + Tensor scalar; + bool evaluated = false; + TF_RETURN_IF_ERROR(EvaluateConstantTensorForEdge(input_edge->src(), i, + &evaluated, &scalar)); + if (evaluated) { + int64 size; + if (scalar.dtype() == DT_INT32) { + size = scalar.scalar()(); + } else if (scalar.dtype() == DT_INT64) { + size = scalar.scalar()(); + } else { + return errors::InvalidArgument("Pack input must be int32 or int64"); + } + dims.push_back(size < 0 ? 
target_context->UnknownDim() + : target_context->MakeDim(size)); + } else { + dims.push_back(target_context->UnknownDim()); + } + } + *result = target_context->MakeShape(dims); + } else if (src_op == "Concat") { + *result = target_context->Scalar(); + // Concat is concatenating its input shape vectors. + // input 0 is ignored as it is the concat dim and will always be 0. + for (int i = 1; i < src_context->num_inputs(); ++i) { + ShapeHandle sub_result; + TF_RETURN_IF_ERROR(ConstantPartialShape(target_context, input_edge->src(), + i, &sub_result)); + if (!target_context->RankKnown(sub_result)) { + // Failed to evaluate. Treat the output as completely unknown. + // TODO(cwhipkey): we could rely on all inputs being the same size, so + // figure that size out and append the right number of unknown dims. + *result = target_context->UnknownShape(); + return Status::OK(); + } + TF_RETURN_IF_ERROR( + target_context->Concatenate(*result, sub_result, result)); + } + } else { + Tensor t; + bool evaluated = false; + TF_RETURN_IF_ERROR( + EvaluateConstantTensorForEdge(node, dst_idx, &evaluated, &t)); + TF_RETURN_IF_ERROR(target_context->MakeShapeFromTensor( + evaluated ? &t : nullptr, src_shape, result)); + } + return Status::OK(); +} + } // namespace tensorflow diff --git a/tensorflow/core/common_runtime/shape_refiner.h b/tensorflow/core/common_runtime/shape_refiner.h index b72001ddd21..6ce5ddb3661 100644 --- a/tensorflow/core/common_runtime/shape_refiner.h +++ b/tensorflow/core/common_runtime/shape_refiner.h @@ -71,6 +71,34 @@ class ShapeRefiner { Node* node, Graph* out_graph, bool* is_constant_graph, std::vector>* const_inputs) TF_MUST_USE_RESULT; + Status EvaluateConstantTensorForEdge(const Node* node, int dst_idx, + bool* evaluated, Tensor* result); + + // This function tries to materialize as much information about the 'node''s + // dst_idx input as a statically computable shape, and the result may be + // partially known, depending on what is statically inferable. 
+ // + // This is called when node.input[dst_idx] is a tensor that is used to define + // the shape of some other tensor (e.g., the second argument to Reshape is a + // tensor, where each element of the shape tensor is a dimension of + // the target tensor). It returns in a shape for that input. + // + // Unlike simply resolving node.input[dst_idx] to a constant and then + // converting that to a shape, this function can return a partial shape. This + // is useful for cases where the shape tensor is only partially defined, such + // as with calls for: reshape(x, shape(y)) where shape(y) is partially + // defined. + // + // The implementation has op implementations for ops commonly called on shape + // tensors, and the implementations are specialized to shape tensors (namely, + // the output is a vector). + // + // is used when creating new DimensionHandle and ShapeHandle + // objects. + Status ConstantPartialShape(shape_inference::InferenceContext* target_context, + const Node* node, int dst_idx, + shape_inference::ShapeHandle* result); + const OpRegistryInterface* ops_registry_ = nullptr; // Stores a map from a node to its InferenceContext. diff --git a/tensorflow/core/common_runtime/shape_refiner_test.cc b/tensorflow/core/common_runtime/shape_refiner_test.cc index 164fa6afb0b..420594d98a5 100644 --- a/tensorflow/core/common_runtime/shape_refiner_test.cc +++ b/tensorflow/core/common_runtime/shape_refiner_test.cc @@ -398,5 +398,347 @@ TEST(ShapeRefinerTest, ConstantValueVisitNodeTwice) { EXPECT_EQ("[1,4,7]", ctx->DebugString(ctx->output(0))); } +namespace { + +Status TensorAsShapeShapeFn(shape_inference::InferenceContext* c) { + shape_inference::ShapeHandle out; + TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0 /* input_idx */, &out)); + c->set_output(0, out); + return Status::OK(); +} + +// Register ops used by the ConstantValueAsShape* tests. 
+ +REGISTER_OP("TensorAsShapeInt32") + .Input("a: int32") + .Output("o: int32") + .SetShapeFn(TensorAsShapeShapeFn); + +REGISTER_OP("TensorAsShapeInt64") + .Input("a: int64") + .Output("o: int64") + .SetShapeFn(TensorAsShapeShapeFn); + +REGISTER_OP("NonConstScalarInt32") + .Output("o: int32") + .SetIsStateful() // prevents constant folding + .SetShapeFn(shape_inference::ScalarShape); + +REGISTER_OP("NonConstScalarInt64") + .Output("o: int64") + .SetIsStateful() // prevents constant folding + .SetShapeFn(shape_inference::ScalarShape); + +REGISTER_OP("WithEmptyVectorShape") + .Output("o: int32") + .SetIsStateful() // prevents constant folding + .SetShapeFn([](shape_inference::InferenceContext* c) { + c->set_output(0, c->Vector(0)); + return Status::OK(); + }); + +REGISTER_OP("WithPartialShape") + .Output("o: int32") + .SetIsStateful() // prevents constant folding + .SetShapeFn([](shape_inference::InferenceContext* c) { + c->set_output( + 0, c->MakeShape({1, shape_inference::InferenceContext::kUnknownDim, 3, + shape_inference::InferenceContext::kUnknownDim, 5})); + return Status::OK(); + }); + +REGISTER_OP("WithPartialShape2") + .Output("o: int32") + .SetIsStateful() // prevents constant folding + .SetShapeFn([](shape_inference::InferenceContext* c) { + c->set_output( + 0, + c->MakeShape({6, shape_inference::InferenceContext::kUnknownDim, 8})); + return Status::OK(); + }); + +REGISTER_OP("WithUnknownShape") + .Output("o: int32") + .SetIsStateful() // prevents constant folding + .SetShapeFn([](shape_inference::InferenceContext* c) { + c->set_output(0, c->UnknownShape()); + return Status::OK(); + }); + +} // namespace + +TEST(ShapeRefinerTest, ConstantValueAsShape_EmptyVector) { + Scope root = Scope::NewRootScope(); + Node* input; + TF_ASSERT_OK( + NodeBuilder("in", "WithEmptyVectorShape").Finalize(root.graph(), &input)); + Node* result; + TF_ASSERT_OK(NodeBuilder("test", "TensorAsShapeInt32") + .Input(input) + .Finalize(root.graph(), &result)); + + ShapeRefiner 
m(OpRegistry::Global()); + TF_ASSERT_OK(m.AddNode(input)); + TF_ASSERT_OK(m.AddNode(result)); + + shape_inference::InferenceContext* ctx = m.GetContext(result); + EXPECT_EQ("[]", ctx->DebugString(ctx->output(0))); +} + +TEST(ShapeRefinerTest, ConstantValueAsShape_Shape) { + for (int pass = 0; pass < 2; ++pass) { + Scope root = Scope::NewRootScope(); + Node* input; + TF_ASSERT_OK( + NodeBuilder("in", pass == 0 ? "WithPartialShape" : "WithUnknownShape") + .Finalize(root.graph(), &input)); + auto shape = ops::Shape(root, ops::Output(input)); + Node* result; + TF_ASSERT_OK(NodeBuilder("test", "TensorAsShapeInt32") + .Input(shape.node()) + .Finalize(root.graph(), &result)); + + ShapeRefiner m(OpRegistry::Global()); + TF_ASSERT_OK(m.AddNode(input)); + TF_ASSERT_OK(m.AddNode(shape.node())); + TF_ASSERT_OK(m.AddNode(result)); + + shape_inference::InferenceContext* ctx = m.GetContext(result); + if (pass == 0) { + EXPECT_EQ("[1,?,3,?,5]", ctx->DebugString(ctx->output(0))); + } else { + EXPECT_EQ("?", ctx->DebugString(ctx->output(0))); + } + } +} + +TEST(ShapeRefinerTest, ConstantValueAsShape_PackInt32) { + Scope root = Scope::NewRootScope(); + Node* scalar_non_const; + TF_ASSERT_OK(NodeBuilder("in", "NonConstScalarInt32") + .Finalize(root.graph(), &scalar_non_const)); + + ops::InputList inputs{ + ops::Input(ops::Const(root, 10)), + ops::Input(ops::Const(root, 20)), + ops::Input(ops::Output(scalar_non_const)), + ops::Input(ops::Const(root, 40)), + }; + auto pack = ops::Pack(root, inputs); + TF_ASSERT_OK(root.status()); + + Node* result; + TF_ASSERT_OK(NodeBuilder("test", "TensorAsShapeInt32") + .Input(pack.node()) + .Finalize(root.graph(), &result)); + + ShapeRefiner m(OpRegistry::Global()); + for (auto input : inputs) { + TF_ASSERT_OK(m.AddNode(input.node())); + } + TF_ASSERT_OK(m.AddNode(pack.node())); + TF_ASSERT_OK(m.AddNode(result)); + + shape_inference::InferenceContext* ctx = m.GetContext(result); + EXPECT_EQ("[10,20,?,40]", ctx->DebugString(ctx->output(0))); +} + 
+TEST(ShapeRefinerTest, ConstantValueAsShape_PackInt64) { + Scope root = Scope::NewRootScope(); + Node* scalar_non_const; + TF_ASSERT_OK(NodeBuilder("in", "NonConstScalarInt64") + .Finalize(root.graph(), &scalar_non_const)); + + ops::InputList inputs{ + ops::Input(ops::Const(root, 10LL)), + ops::Input(ops::Const(root, 20LL)), + ops::Input(ops::Output(scalar_non_const)), + ops::Input(ops::Const(root, 1LL << 40)), + }; + auto pack = ops::Pack(root, inputs); + TF_ASSERT_OK(root.status()); + + Node* result; + TF_ASSERT_OK(NodeBuilder("test", "TensorAsShapeInt64") + .Input(pack.node()) + .Finalize(root.graph(), &result)); + + ShapeRefiner m(OpRegistry::Global()); + for (const auto& input : inputs) { + TF_ASSERT_OK(m.AddNode(input.node())); + } + TF_ASSERT_OK(m.AddNode(pack.node())); + TF_ASSERT_OK(m.AddNode(result)); + + shape_inference::InferenceContext* ctx = m.GetContext(result); + EXPECT_EQ("[10,20,?,1099511627776]", ctx->DebugString(ctx->output(0))); +} + +TEST(ShapeRefinerTest, ConstantValueAsShape_PackUnknownDim) { + Scope root = Scope::NewRootScope(); + + ops::InputList inputs{ + ops::Input(ops::Const(root, 10LL)), + ops::Input(ops::Const(root, -1LL)), + }; + auto pack = ops::Pack(root, inputs); + TF_ASSERT_OK(root.status()); + + Node* result; + TF_ASSERT_OK(NodeBuilder("test", "TensorAsShapeInt64") + .Input(pack.node()) + .Finalize(root.graph(), &result)); + + ShapeRefiner m(OpRegistry::Global()); + for (const auto& input : inputs) { + TF_ASSERT_OK(m.AddNode(input.node())); + } + TF_ASSERT_OK(m.AddNode(pack.node())); + TF_ASSERT_OK(m.AddNode(result)); + + shape_inference::InferenceContext* ctx = m.GetContext(result); + EXPECT_EQ("[10,?]", ctx->DebugString(ctx->output(0))); +} + +TEST(ShapeRefinerTest, ConstantValueAsShape_PackInvalidInput) { + Scope root = Scope::NewRootScope(); + + // Inputs are length 2 vectors instead of scalars. 
+ ops::InputList inputs{ + ops::Input(ops::Const(root, {10LL, 20LL})), + ops::Input(ops::Const(root, {10LL, 21LL})), + }; + auto pack = ops::Pack(root, inputs); + TF_ASSERT_OK(root.status()); + + Node* result; + TF_ASSERT_OK(NodeBuilder("test", "TensorAsShapeInt64") + .Input(pack.node()) + .Finalize(root.graph(), &result)); + + ShapeRefiner m(OpRegistry::Global()); + for (const auto& input : inputs) { + TF_ASSERT_OK(m.AddNode(input.node())); + } + TF_ASSERT_OK(m.AddNode(pack.node())); + EXPECT_TRUE( + StringPiece(m.AddNode(result).error_message()).contains("but is rank 2")); +} + +TEST(ShapeRefinerTest, ConstantValueAsShape_Concat) { + Scope root = Scope::NewRootScope(); + Graph* g = root.graph(); + Node* partial_1; + Node* partial_2; + TF_ASSERT_OK(NodeBuilder("in", "WithPartialShape").Finalize(g, &partial_1)); + TF_ASSERT_OK(NodeBuilder("in", "WithPartialShape2").Finalize(g, &partial_2)); + auto const_input = ops::Const(root, {9, 10, 11}); + ops::OutputList concat_inputs{ + ops::Shape(root, ops::Output(partial_1)), + ops::Shape(root, ops::Output(partial_2)), const_input, + }; + auto concat_dim = ops::Const(root, 0); + auto concat = ops::Concat(root, concat_dim, concat_inputs); + TF_ASSERT_OK(root.status()); + + Node* result; + TF_ASSERT_OK(NodeBuilder("test", "TensorAsShapeInt32") + .Input(concat.node()) + .Finalize(g, &result)); + + ShapeRefiner m(OpRegistry::Global()); + TF_ASSERT_OK(m.AddNode(partial_1)); + TF_ASSERT_OK(m.AddNode(partial_2)); + for (const auto& o : concat_inputs) { + TF_ASSERT_OK(m.AddNode(o.node())); + } + TF_ASSERT_OK(m.AddNode(concat_dim.node())); + TF_ASSERT_OK(m.AddNode(concat.node())); + TF_ASSERT_OK(m.AddNode(result)); + + shape_inference::InferenceContext* ctx = m.GetContext(result); + EXPECT_EQ("[1,?,3,?,5,6,?,8,9,10,11]", ctx->DebugString(ctx->output(0))); +} + +TEST(ShapeRefinerTest, ConstantValueAsShape_ConcatWithUnknown) { + Scope root = Scope::NewRootScope(); + Graph* g = root.graph(); + Node* scalar_non_const; + 
TF_ASSERT_OK(NodeBuilder("in", "NonConstScalarInt32") + .Finalize(root.graph(), &scalar_non_const)); + + Node* partial_1; + Node* partial_2; + Node* unknown; + TF_ASSERT_OK(NodeBuilder("in", "WithPartialShape").Finalize(g, &partial_1)); + TF_ASSERT_OK(NodeBuilder("in", "WithPartialShape2").Finalize(g, &partial_2)); + TF_ASSERT_OK(NodeBuilder("in", "WithUnknownShape").Finalize(g, &unknown)); + ops::OutputList concat_inputs{ + ops::Shape(root, ops::Output(partial_1)), + ops::Shape(root, ops::Output(partial_2)), + ops::Shape(root, ops::Output(unknown)), + }; + auto concat_dim = ops::Const(root, 0); + auto concat = ops::Concat(root, concat_dim, concat_inputs); + TF_ASSERT_OK(root.status()); + + Node* result; + TF_ASSERT_OK(NodeBuilder("test", "TensorAsShapeInt32") + .Input(concat.node()) + .Finalize(g, &result)); + + ShapeRefiner m(OpRegistry::Global()); + TF_ASSERT_OK(m.AddNode(partial_1)); + TF_ASSERT_OK(m.AddNode(partial_2)); + TF_ASSERT_OK(m.AddNode(unknown)); + for (const auto& o : concat_inputs) { + TF_ASSERT_OK(m.AddNode(o.node())); + } + TF_ASSERT_OK(m.AddNode(concat_dim.node())); + TF_ASSERT_OK(m.AddNode(concat.node())); + TF_ASSERT_OK(m.AddNode(result)); + + shape_inference::InferenceContext* ctx = m.GetContext(result); + EXPECT_EQ("?", ctx->DebugString(ctx->output(0))); +} + +TEST(ShapeRefinerTest, ConstantValueAsShape_ConcatInvalidDimValue) { + Scope root = Scope::NewRootScope(); + Graph* g = root.graph(); + Node* scalar_non_const; + TF_ASSERT_OK(NodeBuilder("in", "NonConstScalarInt32") + .Finalize(root.graph(), &scalar_non_const)); + + Node* partial_1; + Node* partial_2; + TF_ASSERT_OK(NodeBuilder("in", "WithPartialShape").Finalize(g, &partial_1)); + TF_ASSERT_OK(NodeBuilder("in", "WithPartialShape2").Finalize(g, &partial_2)); + auto const_input = ops::Const(root, {9, -2, 11}); + ops::OutputList concat_inputs{ + ops::Shape(root, ops::Output(partial_1)), + ops::Shape(root, ops::Output(partial_2)), // + const_input, + }; + auto concat_dim = ops::Const(root, 
0); + auto concat = ops::Concat(root, concat_dim, concat_inputs); + TF_ASSERT_OK(root.status()); + + Node* result; + TF_ASSERT_OK(NodeBuilder("test", "TensorAsShapeInt32") + .Input(concat.node()) + .Finalize(g, &result)); + + ShapeRefiner m(OpRegistry::Global()); + TF_ASSERT_OK(m.AddNode(partial_1)); + TF_ASSERT_OK(m.AddNode(partial_2)); + for (const auto& o : concat_inputs) { + TF_ASSERT_OK(m.AddNode(o.node())); + } + TF_ASSERT_OK(m.AddNode(concat_dim.node())); + TF_ASSERT_OK(m.AddNode(concat.node())); + EXPECT_EQ("Invalid value in tensor used for shape: -2", + m.AddNode(result).error_message()); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/core/common_runtime/simple_graph_execution_state.cc b/tensorflow/core/common_runtime/simple_graph_execution_state.cc index ff00ad5cfda..82d36b51b5a 100644 --- a/tensorflow/core/common_runtime/simple_graph_execution_state.cc +++ b/tensorflow/core/common_runtime/simple_graph_execution_state.cc @@ -274,16 +274,6 @@ Status SimpleGraphExecutionState::InitBaseGraph( return Status::OK(); } -void SimpleGraphExecutionState::UpdateCostsFromStats(const StepStats& ss) { - mutex_lock l(mu_); - costs_.MergeFromStats(node_name_to_cost_id_map_, ss); -} - -void SimpleGraphExecutionState::MergeCostsFromGlobal(CostModel* costs) { - mutex_lock l(mu_); - costs->MergeFromGlobal(costs_); -} - Status SimpleGraphExecutionState::BuildGraph( const BuildGraphOptions& options, std::unique_ptr* out) { VLOG(1) << "BuildGraph"; diff --git a/tensorflow/core/common_runtime/simple_graph_execution_state.h b/tensorflow/core/common_runtime/simple_graph_execution_state.h index 2a33d9e298c..3b6ce23c754 100644 --- a/tensorflow/core/common_runtime/simple_graph_execution_state.h +++ b/tensorflow/core/common_runtime/simple_graph_execution_state.h @@ -133,22 +133,6 @@ class SimpleGraphExecutionState { Status BuildGraph(const BuildGraphOptions& options, std::unique_ptr* out); - // Sums execution statistics in "ss" into the CostModel. 
- void UpdateCostsFromStats(const StepStats& ss); - - Microseconds TimeEstimate(const Node* n) { - mutex_lock l(mu_); // could use reader lock - return costs_.TimeEstimate(n); - } - - Bytes SizeEstimate(const Node* n, int output_slot) { - mutex_lock l(mu_); // could use reader lock - return costs_.SizeEstimate(n, output_slot); - } - - // Merge the cost model maintained by this graph_execution_state to 'costs'. - void MergeCostsFromGlobal(CostModel* costs); - // The graph returned by BuildGraph may contain only the pruned // graph, whereas some clients may want access to the full graph. const Graph* full_graph() { diff --git a/tensorflow/core/debug/debug_gateway_test.cc b/tensorflow/core/debug/debug_gateway_test.cc index bba8299e6d5..1fab9a56a35 100644 --- a/tensorflow/core/debug/debug_gateway_test.cc +++ b/tensorflow/core/debug/debug_gateway_test.cc @@ -335,7 +335,9 @@ TEST_F(SessionDebugMinusAXTest, RunSimpleNetworkWithTwoDebugNodesInserted) { } TEST_F(SessionDebugMinusAXTest, - RunSimpleNetworkConcurrentlyWithDebugNodesInserted) { + RunSimpleNetworkConcurrentlyWithDifferentDebugTensorWatches) { + // Test concurrent Run() calls on a graph with different debug watches. 
+ Initialize({3, 2, -1, 0}); std::unique_ptr session(CreateSession()); ASSERT_TRUE(session != nullptr); @@ -351,33 +353,39 @@ TEST_F(SessionDebugMinusAXTest, mutex mu; DebugGateway debug_gateway(session.get()); - std::vector debug_identity_tensor_vals; + std::unordered_map debug_identity_tensor_vals; const string debug_identity = "DebugIdentity"; - const string debug_identity_node_name = DebugNodeInserter::GetDebugNodeName( + + const string a_debug_identity_node_name = DebugNodeInserter::GetDebugNodeName( + strings::StrCat(a_, ":", 0), 0, debug_identity); + const string x_debug_identity_node_name = DebugNodeInserter::GetDebugNodeName( + strings::StrCat(x_, ":", 0), 0, debug_identity); + const string y_debug_identity_node_name = DebugNodeInserter::GetDebugNodeName( strings::StrCat(y_, ":", 0), 0, debug_identity); Notification callbacks_done; - int comp_callback_count = 0; - int val_callback_count = 0; - debug_gateway.SetNodeCompletionCallback( - [&mu, &callbacks_done, &comp_callback_count, &debug_identity_node_name]( - const string& node_name, const bool any_output) { - mutex_lock l(mu); - if (node_name == debug_identity_node_name) { - comp_callback_count++; - } - }); + volatile int val_callback_count = 0; debug_gateway.SetNodeValueCallback( - [this, &mu, &val_callback_count, &debug_identity_node_name, + [this, &mu, &val_callback_count, &a_debug_identity_node_name, + &x_debug_identity_node_name, &y_debug_identity_node_name, &debug_identity_tensor_vals, &callbacks_done](const string& node_name, const int output_slot, const Tensor& tensor_value, const bool is_ref) { mutex_lock l(mu); - if (node_name == debug_identity_node_name && output_slot == 0) { + + if (node_name == a_debug_identity_node_name && output_slot == 0) { + debug_identity_tensor_vals["a"] = tensor_value; + val_callback_count++; + } else if (node_name == x_debug_identity_node_name && + output_slot == 0) { // output_slot == 0 carries the debug signal. 
- debug_identity_tensor_vals.push_back(tensor_value); + debug_identity_tensor_vals["x"] = tensor_value; + val_callback_count++; + } else if (node_name == y_debug_identity_node_name && + output_slot == 0) { + debug_identity_tensor_vals["y"] = tensor_value; val_callback_count++; } @@ -389,19 +397,41 @@ TEST_F(SessionDebugMinusAXTest, } }); + int run_counter = 0; + mutex run_lock; + // Function to be executed concurrently. - auto fn = [this, &session, output_names, target_nodes, &debug_identity]() { - // Create unique debug tensor watch options for each of the two concurrent + auto fn = [this, &run_lock, &run_counter, &session, output_names, + target_nodes, &debug_identity]() { + // Create unique debug tensor watch options for each of the concurrent + // run calls. RunOptions run_opts; run_opts.set_output_partition_graphs(true); + DebugTensorWatch* tensor_watch_opts = run_opts.add_debug_tensor_watch_opts(); - - tensor_watch_opts->set_node_name(y_); tensor_watch_opts->set_output_slot(0); tensor_watch_opts->add_debug_ops(debug_identity); + { + // Let the concurrent runs watch different tensors. + + mutex_lock l(run_lock); + + if (run_counter == 0) { + // Let the 1st concurrent run watch a. + tensor_watch_opts->set_node_name(a_); + } else if (run_counter == 1) { + // Let the 2nd concurrent run watch x. + tensor_watch_opts->set_node_name(x_); + } else if (run_counter == 2) { + // Let the 3rd concurrent run watch y. + tensor_watch_opts->set_node_name(y_); + } + + run_counter++; + } + // Run the graph. 
RunMetadata run_metadata; std::vector> inputs; @@ -436,15 +466,26 @@ TEST_F(SessionDebugMinusAXTest, { mutex_lock l(mu); - ASSERT_EQ(kConcurrentRuns, comp_callback_count); + ASSERT_EQ(kConcurrentRuns, val_callback_count); ASSERT_EQ(kConcurrentRuns, debug_identity_tensor_vals.size()); - for (int i = 0; i < kConcurrentRuns; ++i) { - ASSERT_EQ(TensorShape({2, 1}), debug_identity_tensor_vals[i].shape()); - auto mat_identity = debug_identity_tensor_vals[i].matrix(); - ASSERT_EQ(5.0, mat_identity(0, 0)); - ASSERT_EQ(-1.0, mat_identity(1, 0)); - } + + ASSERT_EQ(TensorShape({2, 2}), debug_identity_tensor_vals["a"].shape()); + auto a_mat_identity = debug_identity_tensor_vals["a"].matrix(); + ASSERT_EQ(3.0, a_mat_identity(0, 0)); + ASSERT_EQ(2.0, a_mat_identity(0, 1)); + ASSERT_EQ(-1.0, a_mat_identity(1, 0)); + ASSERT_EQ(0.0, a_mat_identity(1, 1)); + + ASSERT_EQ(TensorShape({2, 1}), debug_identity_tensor_vals["x"].shape()); + auto x_mat_identity = debug_identity_tensor_vals["x"].matrix(); + ASSERT_EQ(1.0, x_mat_identity(0, 0)); + ASSERT_EQ(1.0, x_mat_identity(1, 0)); + + ASSERT_EQ(TensorShape({2, 1}), debug_identity_tensor_vals["y"].shape()); + auto y_mat_identity = debug_identity_tensor_vals["y"].matrix(); + ASSERT_EQ(5.0, y_mat_identity(0, 0)); + ASSERT_EQ(-1.0, y_mat_identity(1, 0)); } } @@ -499,25 +540,22 @@ TEST_F(SessionDebugOutputSlotWithoutOngoingEdgeTest, Notification callbacks_done; - debug_gateway.SetNodeCompletionCallback( - [&mu, &callbacks_done](const string& node_name, const bool any_output) { - mutex_lock l(mu); - if (node_name == "_SINK" && !callbacks_done.HasBeenNotified()) { - callbacks_done.Notify(); - } - }); - std::vector debug_identity_tensor_vals; - debug_gateway.SetNodeValueCallback( - [this, &mu, &debug_identity_node_name, &debug_identity_tensor_vals]( - const string& node_name, const int output_slot, - const Tensor& tensor_value, const bool is_ref) { - mutex_lock l(mu); + debug_gateway.SetNodeValueCallback([this, &mu, &callbacks_done, + 
&debug_identity_node_name, + &debug_identity_tensor_vals]( + const string& node_name, const int output_slot, + const Tensor& tensor_value, const bool is_ref) { + mutex_lock l(mu); - if (node_name == debug_identity_node_name && output_slot == 0) { - debug_identity_tensor_vals.push_back(tensor_value); - } - }); + if (node_name == debug_identity_node_name && output_slot == 0) { + debug_identity_tensor_vals.push_back(tensor_value); + + if (!callbacks_done.HasBeenNotified()) { + callbacks_done.Notify(); + } + } + }); // Add DebugIdentity watch on c:0, which does not have an outgoing edge. RunOptions run_opts; diff --git a/tensorflow/core/debug/debug_graph_utils.cc b/tensorflow/core/debug/debug_graph_utils.cc index b4b0ca810b4..bd0625fec34 100644 --- a/tensorflow/core/debug/debug_graph_utils.cc +++ b/tensorflow/core/debug/debug_graph_utils.cc @@ -24,6 +24,30 @@ limitations under the License. namespace tensorflow { +const string SummarizeDebugTensorWatches( + const protobuf::RepeatedPtrField& watches) { + std::ostringstream oss; + + for (const DebugTensorWatch& watch : watches) { + string tensor_name = + strings::StrCat(watch.node_name(), ":", watch.output_slot()); + oss << tensor_name << "|"; + + for (const string& debug_op : watch.debug_ops()) { + oss << debug_op << ","; + } + + oss << "@"; + for (const string& debug_url : watch.debug_urls()) { + oss << debug_url << ","; + } + + oss << ";"; + } + + return oss.str(); +} + // static Status DebugNodeInserter::InsertNodes( const protobuf::RepeatedPtrField& watches, Graph* graph, diff --git a/tensorflow/core/debug/debug_graph_utils.h b/tensorflow/core/debug/debug_graph_utils.h index ea61dee4d08..e01af00bdd4 100644 --- a/tensorflow/core/debug/debug_graph_utils.h +++ b/tensorflow/core/debug/debug_graph_utils.h @@ -27,6 +27,10 @@ limitations under the License. namespace tensorflow { +// Returns a summary string for RepeatedPtrFields of DebugTensorWatches. 
+const string SummarizeDebugTensorWatches( + const protobuf::RepeatedPtrField& watches); + class DebugNodeInserter { public: // EXPERIMENTAL: Insert special debug ops (e.g., DebugIdentity) to graph for diff --git a/tensorflow/core/distributed_runtime/graph_mgr.cc b/tensorflow/core/distributed_runtime/graph_mgr.cc index f77bc0b6b7a..577f6617f79 100644 --- a/tensorflow/core/distributed_runtime/graph_mgr.cc +++ b/tensorflow/core/distributed_runtime/graph_mgr.cc @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/core/common_runtime/graph_optimizer.h" #include "tensorflow/core/common_runtime/memory_types.h" #include "tensorflow/core/common_runtime/process_util.h" +#include "tensorflow/core/common_runtime/step_stats_collector.h" #include "tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h" #include "tensorflow/core/framework/cancellation.h" #include "tensorflow/core/framework/log_memory.h" @@ -207,6 +208,11 @@ Status GraphMgr::InitItem(const string& session, const GraphDef& gdef, if (!s.ok()) { break; } + unit->graph = subgraph; + unit->build_cost_model = graph_options.build_cost_model(); + if (unit->build_cost_model > 0) { + skip_cost_models_ = false; + } } return s; } @@ -319,6 +325,7 @@ Status GraphMgr::RecvOutputs(const int64 step_id, NamedTensors* out) { void GraphMgr::ExecuteAsync(const string& handle, const int64 step_id, const ExecutorOpts& opts, StepStatsCollector* collector, + CostGraphDef* cost_graph, CancellationManager* cancellation_manager, const NamedTensors& in, StatusCallback done) { // Lookup an item. Holds one ref while executing. 
@@ -348,7 +355,7 @@ void GraphMgr::ExecuteAsync(const string& handle, const int64 step_id, return; } - StartParallelExecutors(handle, item, rendezvous, collector, + StartParallelExecutors(handle, item, rendezvous, collector, cost_graph, cancellation_manager, [this, item, rendezvous, done](const Status& s) { done(s); @@ -360,6 +367,7 @@ void GraphMgr::ExecuteAsync(const string& handle, const int64 step_id, void GraphMgr::StartParallelExecutors(const string& handle, Item* item, Rendezvous* rendezvous, StepStatsCollector* collector, + CostGraphDef* cost_graph, CancellationManager* cancellation_manager, StatusCallback done) { const int num_units = item->units.size(); @@ -367,7 +375,9 @@ void GraphMgr::StartParallelExecutors(const string& handle, Item* item, ResourceMgr* step_resource_manager = new ResourceMgr; // NOTE: Transfer one ref of rendezvous and item. ExecutorBarrier* barrier = new ExecutorBarrier( - num_units, rendezvous, [step_resource_manager, done](const Status& s) { + num_units, rendezvous, [this, item, collector, cost_graph, + step_resource_manager, done](const Status& s) { + BuildCostModel(item, collector, cost_graph); done(s); delete step_resource_manager; }); @@ -393,4 +403,24 @@ void GraphMgr::StartParallelExecutors(const string& handle, Item* item, } } +void GraphMgr::BuildCostModel(Item* item, StepStatsCollector* collector, + CostGraphDef* cost_graph) { + if (collector && !skip_cost_models_) { + // Build the cost model + std::unordered_map device_to_graph; + for (const auto& unit : item->units) { + if (unit.build_cost_model > 0) { + device_to_graph[unit.device->name()] = unit.graph; + } + } + collector->BuildCostModel(&cost_model_manager_, device_to_graph); + + if (cost_graph != nullptr) { + for (const auto& unit : item->units) { + cost_model_manager_.AddToCostGraphDef(unit.graph, cost_graph); + } + } + } +} + } // end namespace tensorflow diff --git a/tensorflow/core/distributed_runtime/graph_mgr.h b/tensorflow/core/distributed_runtime/graph_mgr.h 
index bb4b3f2c8c6..a8994f14834 100644 --- a/tensorflow/core/distributed_runtime/graph_mgr.h +++ b/tensorflow/core/distributed_runtime/graph_mgr.h @@ -19,9 +19,11 @@ limitations under the License. #include #include +#include "tensorflow/core/common_runtime/costmodel_manager.h" #include "tensorflow/core/common_runtime/executor.h" #include "tensorflow/core/distributed_runtime/worker_env.h" #include "tensorflow/core/framework/cancellation.h" +#include "tensorflow/core/framework/cost_graph.pb.h" #include "tensorflow/core/lib/core/refcount.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" @@ -73,6 +75,7 @@ class GraphMgr { typedef std::function StatusCallback; void ExecuteAsync(const string& handle, const int64 step_id, const ExecutorOpts& opts, StepStatsCollector* collector, + CostGraphDef* cost_graph, CancellationManager* cancellation_manager, const NamedTensors& in, StatusCallback done); @@ -89,9 +92,12 @@ class GraphMgr { typedef GraphMgr ME; struct ExecutionUnit { + Graph* graph = nullptr; Device* device = nullptr; Executor* root = nullptr; FunctionLibraryRuntime* lib = nullptr; + // Build the cost model if this value is strictly positive. + int64 build_cost_model = 0; }; struct Item : public core::RefCounted { @@ -117,6 +123,8 @@ class GraphMgr { // Not owned. const WorkerEnv* worker_env_; + CostModelManager cost_model_manager_; + // Owned. mutex mu_; int64 next_id_ GUARDED_BY(mu_) = 0; @@ -131,9 +139,17 @@ class GraphMgr { void StartParallelExecutors(const string& handle, Item* item, Rendezvous* rendezvous, StepStatsCollector* collector, + CostGraphDef* cost_graph, CancellationManager* cancellation_manager, StatusCallback done); + // Don't attempt to process cost models unless explicitly requested for at + least one of the items. 
+ bool skip_cost_models_ = true; + + void BuildCostModel(Item* item, StepStatsCollector* collector, + CostGraphDef* cost_graph); + Status SendInputsToRendezvous(Rendezvous* rendezvous, const NamedTensors& in); Status RecvOutputsFromRendezvous(Rendezvous* rendezvous, NamedTensors* out); diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc index 5537e3f2eff..6f3b7841785 100644 --- a/tensorflow/core/distributed_runtime/master_session.cc +++ b/tensorflow/core/distributed_runtime/master_session.cc @@ -25,6 +25,7 @@ limitations under the License. #include "tensorflow/core/distributed_runtime/scheduler.h" #include "tensorflow/core/distributed_runtime/worker_cache.h" #include "tensorflow/core/distributed_runtime/worker_interface.h" +#include "tensorflow/core/framework/cost_graph.pb.h" #include "tensorflow/core/framework/function.pb.h" #include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/framework/tensor.h" @@ -58,6 +59,7 @@ struct PerStepState { Microseconds end_micros = Microseconds(0); std::vector step_stats; // per partition StepStats rpc_stats; // for RPC layer + CostGraphDef cost_graph; }; // MasterSession wraps SimpleClientGraph in a reference counted object. @@ -178,7 +180,8 @@ class MasterSession::ReffedClientGraph : public core::RefCounted { // Post-processing of any runtime statistics gathered during execution. 
void ProcessStats(const MasterEnv* env, int64 step_id, PerStepState* pss, SimpleGraphExecutionState* execution_state, - ProfileHandler* ph, RunStepResponse* resp); + ProfileHandler* ph, const RunStepRequest& req, + RunStepResponse* resp); void ProcessDeviceStats(ProfileHandler* ph, const SimpleGraphExecutionState* execution_state, const DeviceStepStats& ds, bool is_rpc); @@ -480,17 +483,6 @@ class RunManyGraphs { TF_DISALLOW_COPY_AND_ASSIGN(RunManyGraphs); }; -int64 CostFrequency(int64 x) { - if (x < 10) { - return 1; // 100% - } else if (x < 100) { - return 10; // 10% - } else if (x < 1000) { - return 100; // 1% - } else { - return 1000; // 0.1% - } -} Status MasterSession::ReffedClientGraph::RunPartitions( const MasterEnv* env, int64 step_id, int64 execution_count, @@ -604,6 +596,12 @@ Status MasterSession::ReffedClientGraph::RunPartitions( if (pss->collect_timeline && calls.get(i)->resp.has_step_stats()) { pss->step_stats[i].Swap(calls.get(i)->resp.mutable_step_stats()); } + if (pss->collect_costs && calls.get(i)->resp.has_cost_graph()) { + for (int j = 0; j < calls.get(i)->resp.cost_graph().node_size(); ++j) { + resp->mutable_metadata()->mutable_cost_graph()->add_node()->Swap( + calls.get(i)->resp.mutable_cost_graph()->mutable_node(j)); + } + } } } return status; @@ -679,7 +677,7 @@ void MasterSession::ReffedClientGraph::CleanupPartitionsAsync( void MasterSession::ReffedClientGraph::ProcessStats( const MasterEnv* env, int64 step_id, PerStepState* pss, SimpleGraphExecutionState* execution_state, ProfileHandler* ph, - RunStepResponse* resp) { + const RunStepRequest& req, RunStepResponse* resp) { if (!pss->collect_costs && !pss->collect_timeline) return; // Out-of-band logging data is collected now, during post-processing. 
@@ -689,9 +687,6 @@ void MasterSession::ReffedClientGraph::ProcessStats( } for (size_t i = 0; i < partitions_.size(); ++i) { const StepStats& ss = pss->step_stats[i]; - if (pss->collect_costs) { - execution_state->UpdateCostsFromStats(ss); - } if (ph) { for (const auto& ds : ss.dev_stats()) { ProcessDeviceStats(ph, execution_state, ds, false /*is_rpc*/); @@ -717,7 +712,7 @@ void MasterSession::ReffedClientGraph::ProcessStats( stats_publisher_->PublishStatsProto(step_stats_proto); // Copy the stats back, but only for on-demand profiling to avoid slowing // down calls that trigger the automatic profiling. - if (session_opts_.config.graph_options().timeline_step() <= 0) { + if (req.options().trace_level() == RunOptions::FULL_TRACE) { resp->mutable_metadata()->mutable_step_stats()->Swap(&step_stats_proto); } } @@ -1063,7 +1058,17 @@ Status MasterSession::DoRunWithLocalExecution(CallOptions* opts, std::unique_ptr ph; pss.collect_timeline = req->options().trace_level() == RunOptions::FULL_TRACE; - pss.collect_costs = (0 == (count % CostFrequency(count))); + + // Build the cost model every 'build_cost_model_every' steps after skipping an + // initial 'build_cost_model_after' steps. + const int64 build_cost_model_after = + session_opts_.config.graph_options().build_cost_model_after(); + const int64 build_cost_model_every = + session_opts_.config.graph_options().build_cost_model(); + pss.collect_costs = + build_cost_model_every > 0 && + ((count + 1 - build_cost_model_after) % build_cost_model_every == 0); + ph = rcg->GetProfileHandler(step_id, count, req->options()); if (ph) { pss.collect_timeline = true; @@ -1078,7 +1083,7 @@ Status MasterSession::DoRunWithLocalExecution(CallOptions* opts, // Schedule post-processing and cleanup to be done asynchronously. 
rcg->Ref(); - rcg->ProcessStats(env_, step_id, &pss, execution_state_.get(), ph.get(), + rcg->ProcessStats(env_, step_id, &pss, execution_state_.get(), ph.get(), *req, resp); rcg->CleanupPartitionsAsync(step_id, [rcg](const Status& s) { if (!s.ok()) { diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc index 2ae5dcebe6b..ec8c06abb49 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc @@ -329,7 +329,8 @@ class GrpcWorkerService : public AsyncServiceInterface { return; } StepStatsCollector* collector = nullptr; - if (call->request.exec_opts().record_timeline()) { + if (call->request.exec_opts().record_timeline() || + call->request.exec_opts().record_costs()) { collector = new StepStatsCollector(call->response.mutable_step_stats()); // TODO(mrry,pbar): GPU tracing for distributed steps. } @@ -345,9 +346,10 @@ class GrpcWorkerService : public AsyncServiceInterface { cancellation_manager_->RegisterCallback(token, [cm]() { cm->StartCancel(); }); } + CostGraphDef* cost_graph = call->response.mutable_cost_graph(); env_->graph_mgr->ExecuteAsync( call->request.graph_handle(), step_id, call->request.exec_opts(), - collector, cm, in, + collector, cost_graph, cm, in, [this, step_id, call, cm, out, token, collector](Status s) { if (s.ok()) { env_->graph_mgr->RecvOutputs(step_id, out); diff --git a/tensorflow/core/framework/common_shape_fns_test.cc b/tensorflow/core/framework/common_shape_fns_test.cc index a4efc04467c..7196bc83042 100644 --- a/tensorflow/core/framework/common_shape_fns_test.cc +++ b/tensorflow/core/framework/common_shape_fns_test.cc @@ -56,7 +56,7 @@ TEST(CommonShapeFnsTest, NoOutputShapeTest) { .Input({{"data", 0, DT_FLOAT}}) .Finalize(&def)); - InferenceContext c(&def, op_def, {S({}), S({10})}, {}); + InferenceContext c(&def, op_def, {S({}), S({10})}, {}, {}); 
TF_EXPECT_OK(NoOutputs(&c)); EXPECT_EQ(0, c.num_outputs()); } @@ -74,14 +74,14 @@ TEST(CommonShapeFnsTest, ScalarShapeTest) { NodeDefBuilder("test", "L2Loss").Input("t", 0, DT_FLOAT).Finalize(&def)); { - InferenceContext c(&def, op_def, {S({})}, {}); + InferenceContext c(&def, op_def, {S({})}, {}, {}); TF_EXPECT_OK(ScalarShape(&c)); ShapeHandle output = c.output(0); EXPECT_EQ(0, c.Rank(output)); } { - InferenceContext c(&def, op_def, {S({1, 23, 4, 4, 2})}, {}); + InferenceContext c(&def, op_def, {S({1, 23, 4, 4, 2})}, {}, {}); TF_EXPECT_OK(ScalarShape(&c)); ShapeHandle output = c.output(0); EXPECT_EQ(0, c.Rank(output)); @@ -108,7 +108,7 @@ TEST(CommonShapeFnsTest, MatMulShapeTest) { .Finalize(&def)); { - InferenceContext c(&def, op_def, {S({2, 3}), S({3, 4})}, {}); + InferenceContext c(&def, op_def, {S({2, 3}), S({3, 4})}, {}, {}); TF_EXPECT_OK(MatMulShape(&c)); ShapeHandle output = c.output(0); EXPECT_EQ(2, c.Value(c.Dim(output, 0))); @@ -117,7 +117,7 @@ TEST(CommonShapeFnsTest, MatMulShapeTest) { { // Unknown inner dimension for one - InferenceContext c(&def, op_def, {S({2, -1}), S({3, 4})}, {}); + InferenceContext c(&def, op_def, {S({2, -1}), S({3, 4})}, {}, {}); TF_EXPECT_OK(MatMulShape(&c)); ShapeHandle output = c.output(0); EXPECT_EQ(2, c.Value(c.Dim(output, 0))); @@ -126,7 +126,7 @@ TEST(CommonShapeFnsTest, MatMulShapeTest) { { // Invalid rank. 
- InferenceContext c(&def, op_def, {S({2}), S({3, 4})}, {}); + InferenceContext c(&def, op_def, {S({2}), S({3, 4})}, {}, {}); auto s = MatMulShape(&c); EXPECT_FALSE(s.ok()); EXPECT_TRUE( @@ -136,7 +136,7 @@ TEST(CommonShapeFnsTest, MatMulShapeTest) { { // Unknown outer dimension - InferenceContext c(&def, op_def, {S({2, 3}), S({3, -1})}, {}); + InferenceContext c(&def, op_def, {S({2, 3}), S({3, -1})}, {}, {}); TF_EXPECT_OK(MatMulShape(&c)); ShapeHandle output = c.output(0); EXPECT_EQ(2, c.Value(c.Dim(output, 0))); @@ -145,7 +145,7 @@ TEST(CommonShapeFnsTest, MatMulShapeTest) { { // Inner shapes not compatible - InferenceContext c(&def, op_def, {S({2, 5}), S({3, 4})}, {}); + InferenceContext c(&def, op_def, {S({2, 5}), S({3, 4})}, {}, {}); auto s = MatMulShape(&c); EXPECT_FALSE(s.ok()); EXPECT_TRUE( @@ -156,7 +156,7 @@ TEST(CommonShapeFnsTest, MatMulShapeTest) { { // Inner shapes not compatible - InferenceContext c(&def, op_def, {S({2, 5, 3}), S({3, 5, 4})}, {}); + InferenceContext c(&def, op_def, {S({2, 5, 3}), S({3, 5, 4})}, {}, {}); auto s = MatMulShape(&c); EXPECT_FALSE(s.ok()); EXPECT_TRUE( @@ -174,7 +174,7 @@ TEST(CommonShapeFnsTest, MatMulShapeTest) { .Attr("type", DT_FLOAT) .Finalize(&def)); - InferenceContext c(&def, op_def, {S({3, 2}), S({3, 4})}, {}); + InferenceContext c(&def, op_def, {S({3, 2}), S({3, 4})}, {}, {}); auto s = MatMulShape(&c); ShapeHandle output = c.output(0); EXPECT_EQ(2, c.Value(c.Dim(output, 0))); @@ -191,7 +191,7 @@ TEST(CommonShapeFnsTest, MatMulShapeTest) { .Attr("type", DT_FLOAT) .Finalize(&def)); - InferenceContext c(&def, op_def, {S({2, 3}), S({4, 3})}, {}); + InferenceContext c(&def, op_def, {S({2, 3}), S({4, 3})}, {}, {}); auto s = MatMulShape(&c); ShapeHandle output = c.output(0); EXPECT_EQ(2, c.Value(c.Dim(output, 0))); @@ -215,7 +215,7 @@ TEST(CommonShapeFnsTest, BiasAddShapeTest) { .Finalize(&def)); { - InferenceContext c(&def, op_def, {S({2, 10}), S({10})}, {}); + InferenceContext c(&def, op_def, {S({2, 10}), S({10})}, {}, 
{}); TF_EXPECT_OK(BiasAddShape(&c)); ShapeHandle output = c.output(0); EXPECT_EQ(2, c.Value(c.Dim(output, 0))); @@ -224,7 +224,7 @@ TEST(CommonShapeFnsTest, BiasAddShapeTest) { { // Unknown ranks. - InferenceContext c(&def, op_def, {Unknown(), Unknown()}, {}); + InferenceContext c(&def, op_def, {Unknown(), Unknown()}, {}, {}); TF_EXPECT_OK(BiasAddShape(&c)); ShapeHandle output = c.output(0); EXPECT_FALSE(c.RankKnown(output)); @@ -232,7 +232,7 @@ TEST(CommonShapeFnsTest, BiasAddShapeTest) { { // Rank > 2 - InferenceContext c(&def, op_def, {S({4, 3, 4, 2, 15}), S({15})}, {}); + InferenceContext c(&def, op_def, {S({4, 3, 4, 2, 15}), S({15})}, {}, {}); TF_EXPECT_OK(BiasAddShape(&c)); ShapeHandle output = c.output(0); EXPECT_EQ("[4,3,4,2,15]", c.DebugString(output)); @@ -245,7 +245,7 @@ TEST(CommonShapeFnsTest, BiasAddShapeTest) { .Input("b", 0, DT_FLOAT) .Attr("data_format", "NCHW") .Finalize(&def)); - InferenceContext c(&def, op_def, {S({2, 3, 4, 5}), S({3})}, {}); + InferenceContext c(&def, op_def, {S({2, 3, 4, 5}), S({3})}, {}, {}); TF_EXPECT_OK(BiasAddShape(&c)); ShapeHandle output = c.output(0); EXPECT_EQ("[2,3,4,5]", c.DebugString(output)); @@ -258,7 +258,8 @@ TEST(CommonShapeFnsTest, BiasAddShapeTest) { .Input("b", 0, DT_FLOAT) .Attr("data_format", "NCHW") .Finalize(&def)); - InferenceContext c(&def, op_def, {S({8, 6, 4, 2, 3, 4, 5}), S({3})}, {}); + InferenceContext c(&def, op_def, {S({8, 6, 4, 2, 3, 4, 5}), S({3})}, {}, + {}); TF_EXPECT_OK(BiasAddShape(&c)); ShapeHandle output = c.output(0); EXPECT_EQ("[8,6,4,2,3,4,5]", c.DebugString(output)); @@ -271,7 +272,7 @@ TEST(CommonShapeFnsTest, BiasAddShapeTest) { .Input("b", 0, DT_FLOAT) .Attr("data_format", "NCHW") .Finalize(&def)); - InferenceContext c(&def, op_def, {S({10, 11, 12}), S({10})}, {}); + InferenceContext c(&def, op_def, {S({10, 11, 12}), S({10})}, {}, {}); TF_EXPECT_OK(BiasAddShape(&c)); ShapeHandle output = c.output(0); EXPECT_EQ("[10,11,12]", c.DebugString(output)); @@ -279,7 +280,7 @@ 
TEST(CommonShapeFnsTest, BiasAddShapeTest) { { // Input rank not high enough - InferenceContext c(&def, op_def, {S({3}), S({3})}, {}); + InferenceContext c(&def, op_def, {S({3}), S({3})}, {}, {}); EXPECT_FALSE(BiasAddShape(&c).ok()); } @@ -291,7 +292,7 @@ TEST(CommonShapeFnsTest, BiasAddShapeTest) { .Attr("data_format", "NCHW") .Finalize(&def)); // NCHW format - InferenceContext c(&def, op_def, {S({2, 3}), S({3})}, {}); + InferenceContext c(&def, op_def, {S({2, 3}), S({3})}, {}, {}); EXPECT_FALSE(BiasAddShape(&c).ok()); } } @@ -310,7 +311,7 @@ TEST(CommonShapeFnsTest, BiasAddGradShapeTest) { .Finalize(&def)); { - InferenceContext c(&def, op_def, {S({2, 10})}, {}); + InferenceContext c(&def, op_def, {S({2, 10})}, {}, {}); TF_EXPECT_OK(BiasAddGradShape(&c)); ShapeHandle output = c.output(0); EXPECT_EQ(10, c.Value(c.Dim(output, 0))); @@ -318,7 +319,7 @@ TEST(CommonShapeFnsTest, BiasAddGradShapeTest) { { // Rank > 2 - InferenceContext c(&def, op_def, {S({5, 7, 2, 10})}, {}); + InferenceContext c(&def, op_def, {S({5, 7, 2, 10})}, {}, {}); TF_EXPECT_OK(BiasAddGradShape(&c)); ShapeHandle output = c.output(0); EXPECT_EQ(10, c.Value(c.Dim(output, 0))); @@ -330,7 +331,7 @@ TEST(CommonShapeFnsTest, BiasAddGradShapeTest) { .Input("a", 0, DT_FLOAT) .Attr("data_format", "NCHW") .Finalize(&def)); - InferenceContext c(&def, op_def, {S({2, 3, 4, 5})}, {}); + InferenceContext c(&def, op_def, {S({2, 3, 4, 5})}, {}, {}); TF_EXPECT_OK(BiasAddGradShape(&c)); ShapeHandle output = c.output(0); EXPECT_EQ(3, c.Value(c.Dim(output, 0))); @@ -342,7 +343,7 @@ TEST(CommonShapeFnsTest, BiasAddGradShapeTest) { .Input("a", 0, DT_FLOAT) .Attr("data_format", "NCHW") .Finalize(&def)); - InferenceContext c(&def, op_def, {S({8, 6, 4, 2, 3, 4, 5})}, {}); + InferenceContext c(&def, op_def, {S({8, 6, 4, 2, 3, 4, 5})}, {}, {}); TF_EXPECT_OK(BiasAddGradShape(&c)); ShapeHandle output = c.output(0); EXPECT_EQ(3, c.Value(c.Dim(output, 0))); @@ -354,7 +355,7 @@ TEST(CommonShapeFnsTest, BiasAddGradShapeTest) { 
.Input("a", 0, DT_FLOAT) .Attr("data_format", "NCHW") .Finalize(&def)); - InferenceContext c(&def, op_def, {S({10, 11, 12})}, {}); + InferenceContext c(&def, op_def, {S({10, 11, 12})}, {}, {}); TF_EXPECT_OK(BiasAddGradShape(&c)); ShapeHandle output = c.output(0); EXPECT_EQ(10, c.Value(c.Dim(output, 0))); @@ -362,7 +363,7 @@ TEST(CommonShapeFnsTest, BiasAddGradShapeTest) { { // Input rank not high enough - InferenceContext c(&def, op_def, {S({3})}, {}); + InferenceContext c(&def, op_def, {S({3})}, {}, {}); EXPECT_FALSE(BiasAddGradShape(&c).ok()); } @@ -373,7 +374,7 @@ TEST(CommonShapeFnsTest, BiasAddGradShapeTest) { .Attr("data_format", "NCHW") .Finalize(&def)); // NCHW format - InferenceContext c(&def, op_def, {S({2, 3})}, {}); + InferenceContext c(&def, op_def, {S({2, 3})}, {}, {}); EXPECT_FALSE(BiasAddGradShape(&c).ok()); } } diff --git a/tensorflow/core/framework/function.h b/tensorflow/core/framework/function.h index 109df5d4f56..67c71be46c3 100644 --- a/tensorflow/core/framework/function.h +++ b/tensorflow/core/framework/function.h @@ -400,6 +400,9 @@ class FunctionLibraryRuntime { // Returns a debug string showing the definition of the function of // 'handle'. virtual string DebugString(Handle handle) = 0; + + // Returns the graph version number. 
+ virtual int graph_def_version() = 0; }; // To register a gradient function for a builtin op, one should use diff --git a/tensorflow/core/framework/shape_inference.cc b/tensorflow/core/framework/shape_inference.cc index 77a433ddcb5..da88b6a7ca6 100644 --- a/tensorflow/core/framework/shape_inference.cc +++ b/tensorflow/core/framework/shape_inference.cc @@ -30,9 +30,10 @@ constexpr int64 InferenceContext::kUnknownDim; InferenceContext::InferenceContext( const NodeDef* node_def, const OpDef& op_def, const std::vector& input_shapes, - const std::vector& input_tensors) + const std::vector& input_tensors, + const std::vector& input_tensors_as_shapes) : node_def_(*CHECK_NOTNULL(node_def)) { - PreInputInit(op_def, input_tensors); + PreInputInit(op_def, input_tensors, input_tensors_as_shapes); if (!construction_status_.ok()) return; for (const TensorShapeProto& p : input_shapes) { ShapeHandle shape; @@ -48,9 +49,10 @@ InferenceContext::InferenceContext( InferenceContext::InferenceContext( const NodeDef* node_def, const OpDef& op_def, const std::vector& input_shapes, - const std::vector& input_tensors) + const std::vector& input_tensors, + const std::vector& input_tensors_as_shapes) : node_def_(*CHECK_NOTNULL(node_def)) { - PreInputInit(op_def, input_tensors); + PreInputInit(op_def, input_tensors, input_tensors_as_shapes); if (!construction_status_.ok()) return; inputs_ = input_shapes; PostInputInit(); @@ -106,8 +108,10 @@ Status InferenceContext::output(StringPiece output_name, } void InferenceContext::PreInputInit( - const OpDef& op_def, const std::vector& input_tensors) { + const OpDef& op_def, const std::vector& input_tensors, + const std::vector& input_tensors_as_shapes) { input_tensors_ = input_tensors; + input_tensors_as_shapes_ = input_tensors_as_shapes; construction_status_ = NameRangesForNode(node_def_, op_def, &input_name_map_, &output_name_map_); @@ -139,6 +143,7 @@ void InferenceContext::PostInputInit() { CHECK_LE(input_tensors_.size(), inputs_.size()); 
input_tensors_.resize(inputs_.size()); requested_input_tensor_.resize(inputs_.size()); + requested_input_tensor_as_partial_shape_.resize(inputs_.size()); } bool InferenceContext::FullyDefined(ShapeHandle s) { @@ -470,11 +475,24 @@ Status InferenceContext::MakeShapeFromShapeTensor(int input_idx, ShapeHandle input_shape; TF_RETURN_IF_ERROR(WithRank(input(input_idx), 1, &input_shape)); - const Tensor* t = input_tensor(input_idx); + if (input_idx < input_tensors_as_shapes_.size() && + input_tensors_as_shapes_[input_idx].IsSet() && + RankKnown(input_tensors_as_shapes_[input_idx])) { + *out = input_tensors_as_shapes_[input_idx]; + return Status::OK(); + } + requested_input_tensor_as_partial_shape_[input_idx] = true; + + return MakeShapeFromTensor(input_tensor(input_idx), input_shape, out); +} + +Status InferenceContext::MakeShapeFromTensor(const Tensor* t, + ShapeHandle tensor_shape, + ShapeHandle* out) { if (t == nullptr) { // Shape tensor is not known, but if the shape of the shape tensor is then // the right number of unknown dims can be created. - DimensionHandle shape_dim = Dim(input_shape, 0); + DimensionHandle shape_dim = Dim(tensor_shape, 0); if (!ValueKnown(shape_dim)) { return ReturnUnknownShape(out); } @@ -493,12 +511,24 @@ Status InferenceContext::MakeShapeFromShapeTensor(int input_idx, if (t->dtype() == DataType::DT_INT32) { auto flat_t = t->flat(); for (int i = 0; i < flat_t.size(); ++i) { - dims.push_back(MakeDim(flat_t(i))); + const int32 val = flat_t(i); + if (val < -1) { + return errors::InvalidArgument( + "Invalid value in tensor used for shape: ", val); + } + // -1 will become an unknown dim. + dims.push_back(MakeDim(val)); } } else if (t->dtype() == DataType::DT_INT64) { auto flat_t = t->flat(); for (int i = 0; i < flat_t.size(); ++i) { - dims.push_back(MakeDim(flat_t(i))); + const int64 val = flat_t(i); + if (val < -1) { + return errors::InvalidArgument( + "Invalid value in tensor used for shape: ", val); + } + // -1 will become an unknown dim. 
+ dims.push_back(MakeDim(val)); } } else { *out = nullptr; @@ -558,24 +588,27 @@ Status InferenceContext::MakeDimForScalarInput(int idx, DimensionHandle* out) { return Status::OK(); } -Status InferenceContext::Divide(DimensionHandle dividend, int64 divisor, +Status InferenceContext::Divide(DimensionHandle dividend, + DimensionOrConstant divisor, bool evenly_divisible, DimensionHandle* out) { - if (divisor == 1) { + const int64 divisor_value = Value(divisor); + if (divisor_value == 1) { *out = dividend; - } else if (!ValueKnown(dividend)) { + } else if (!ValueKnown(dividend) || + (divisor.dim.IsSet() && !ValueKnown(divisor.dim))) { *out = UnknownDim(); } else { const int64 v = Value(dividend); - if (divisor <= 0) { + if (divisor_value <= 0) { return errors::InvalidArgument("Divisor must be positive but is ", - divisor); + divisor_value); } - if (evenly_divisible && (v % divisor) != 0) { + if (evenly_divisible && (v % divisor_value) != 0) { return errors::InvalidArgument( - "Dimension size must be evenly divisible by ", divisor, " but is ", - v); + "Dimension size must be evenly divisible by ", divisor_value, + " but is ", v); } - *out = MakeDim(v / divisor); + *out = MakeDim(v / divisor_value); } return Status::OK(); } diff --git a/tensorflow/core/framework/shape_inference.h b/tensorflow/core/framework/shape_inference.h index 1dfb9af0a47..f5befc15a11 100644 --- a/tensorflow/core/framework/shape_inference.h +++ b/tensorflow/core/framework/shape_inference.h @@ -136,17 +136,33 @@ class InferenceContext { // is NULL-padded to be the same size as . // + // Elements of are used for when a shape function + // makes a call to MakeShapeFromShapeTensor; in particular, when the + // input_tensors[i] is nullptr but the shape represented by it is partially + // known from analysis of the graph. + // can have fewer elements than . + // Values of do not need to outlive the context. + // // REQUIRES: is not NULL, and must outlive the InferenceContext. 
InferenceContext(const NodeDef* node_def, const OpDef& op_def, const std::vector& input_shapes, - const std::vector& input_tensors); + const std::vector& input_tensors, + const std::vector& input_tensors_as_shapes); // is NULL-padded to be the same size as . // + // Elements of are used for when a shape function + // makes a call to MakeShapeFromShapeTensor; in particular, when the + // input_tensors[i] is nullptr but the shape represented by it is partially + // known from analysis of the graph. + // can have fewer elements than . + // Values of do not need to outlive the context. + // // REQUIRES: is not NULL, and must outlive the InferenceContext. InferenceContext(const NodeDef* node_def, const OpDef& op_def, const std::vector& input_shapes, - const std::vector& input_tensors); + const std::vector& input_tensors, + const std::vector& input_tensors_as_shapes); ~InferenceContext(); @@ -180,10 +196,21 @@ class InferenceContext { return requested_input_tensor_[idx]; } + // Returns true if MakeShapeFromInputTensor was called but the constant + // input_tensor was not present. + bool requested_input_tensor_as_partial_shape(int idx) const { + return requested_input_tensor_as_partial_shape_[idx]; + } + void set_input_tensors(const std::vector& input_tensors) { input_tensors_ = input_tensors; } + void set_input_tensors_as_shapes( + const std::vector& input_tensors_as_shapes) { + input_tensors_as_shapes_ = input_tensors_as_shapes; + } + void set_output(int idx, ShapeHandle shape) { outputs_[idx] = shape; } Status set_output(StringPiece output_name, const std::vector& shapes); @@ -336,8 +363,8 @@ class InferenceContext { // Returns in the result of dividing by . // Returns an error if is not positive or if // and does not evenly divide . 
- Status Divide(DimensionHandle dividend, int64 divisor, bool evenly_divisible, - DimensionHandle* out); + Status Divide(DimensionHandle dividend, DimensionOrConstant divisor, + bool evenly_divisible, DimensionHandle* out); // Returns in the sum of and . Status Add(DimensionHandle first, DimensionOrConstant second, @@ -408,6 +435,15 @@ class InferenceContext { return Status::OK(); } + // Note that shape functions should usually call MakeShapeFromShapeTensor, + // as it does more analysis to provide partial shapes. + // + // Returns in a new shape whose dimension sizes come from tensor . + // The tensor must be a 1-dimensional int32 or int64 tensor. If is NULL, + // then an unknown shape is returned. + Status MakeShapeFromTensor(const Tensor* t, ShapeHandle tensor_shape, + ShapeHandle* out); + private: // Creates and stores shapes for use in InferenceContext. class ShapeManager { @@ -443,7 +479,8 @@ class InferenceContext { // Shared initialization across the two constructors. Remove // once we get rid of one of them. void PreInputInit(const OpDef& op_def, - const std::vector& input_tensors); + const std::vector& input_tensors, + const std::vector& input_tensors_as_shapes); void PostInputInit(); DimensionHandle GetDimension(const DimensionOrConstant& d); @@ -463,11 +500,15 @@ class InferenceContext { ShapeManager shape_manager_; - // inputs_ and outputs_ refer to values from `shape_manager_`. + // inputs_, outputs_, and input_tensors_as_shapes_ refer to values from + // `shape_manager_`. std::vector inputs_; std::vector input_tensors_; std::vector requested_input_tensor_; std::vector outputs_; + // Can have fewer elements than inputs_. 
+ std::vector input_tensors_as_shapes_; + std::vector requested_input_tensor_as_partial_shape_; const NodeDef& node_def_; NameRangeMap input_name_map_; diff --git a/tensorflow/core/framework/shape_inference_test.cc b/tensorflow/core/framework/shape_inference_test.cc index 76a485c678f..06096bfdcc7 100644 --- a/tensorflow/core/framework/shape_inference_test.cc +++ b/tensorflow/core/framework/shape_inference_test.cc @@ -71,7 +71,7 @@ TEST_F(ShapeInferenceTest, InputOutputByName) { .Attr("N", 3) .Input(FakeInput(DT_FLOAT)) .Finalize(&def); - InferenceContext c(&def, op_def, {S({1, 5}), S({2, 5}), S({1, 3})}, {}); + InferenceContext c(&def, op_def, {S({1, 5}), S({2, 5}), S({1, 3})}, {}, {}); EXPECT_EQ("5", c.DebugString(c.NumElements(c.input(0)))); EXPECT_EQ("10", c.DebugString(c.NumElements(c.input(1)))); @@ -107,7 +107,7 @@ static OpDef MakeOpDef(int num_inputs, int num_outputs) { TEST_F(ShapeInferenceTest, DimensionOrConstant) { NodeDef def; - InferenceContext c(&def, MakeOpDef(1, 1), {Unknown()}, {}); + InferenceContext c(&def, MakeOpDef(1, 1), {Unknown()}, {}, {}); EXPECT_EQ(InferenceContext::kUnknownDim, c.Value(InferenceContext::kUnknownDim)); EXPECT_EQ(1, c.Value(1)); @@ -122,7 +122,7 @@ TEST_F(ShapeInferenceTest, Run) { NodeDef def; def.set_name("foo"); def.set_op("foo_op"); - InferenceContext c(&def, MakeOpDef(3, 2), {S({1})}, {}); + InferenceContext c(&def, MakeOpDef(3, 2), {S({1})}, {}, {}); { auto fn = [](InferenceContext* c) { @@ -154,7 +154,7 @@ TEST_F(ShapeInferenceTest, Run) { TEST_F(ShapeInferenceTest, RankAndDimInspection) { NodeDef def; InferenceContext c(&def, MakeOpDef(3, 2), {Unknown(), S({1, -1, 3}), S({})}, - {}); + {}, {}); EXPECT_EQ(3, c.num_inputs()); EXPECT_EQ(2, c.num_outputs()); @@ -195,7 +195,7 @@ TEST_F(ShapeInferenceTest, RankAndDimInspection) { TEST_F(ShapeInferenceTest, NumElements) { NodeDef def; InferenceContext c(&def, MakeOpDef(3, 2), - {Unknown(), S({1, -1, 3}), S({5, 4, 3, 2})}, {}); + {Unknown(), S({1, -1, 3}), S({5, 4, 3, 
2})}, {}, {}); EXPECT_EQ("?", c.DebugString(c.NumElements(c.input(0)))); EXPECT_EQ("?", c.DebugString(c.NumElements(c.input(1)))); @@ -208,7 +208,7 @@ TEST_F(ShapeInferenceTest, NumElements) { TEST_F(ShapeInferenceTest, WithRank) { NodeDef def; - InferenceContext c(&def, MakeOpDef(2, 2), {Unknown(), S({1, -1, 3})}, {}); + InferenceContext c(&def, MakeOpDef(2, 2), {Unknown(), S({1, -1, 3})}, {}, {}); auto in0 = c.input(0); auto in1 = c.input(1); @@ -246,7 +246,7 @@ TEST_F(ShapeInferenceTest, WithRank) { TEST_F(ShapeInferenceTest, WithRankAtMost) { NodeDef def; - InferenceContext c(&def, MakeOpDef(2, 2), {Unknown(), S({1, -1, 3})}, {}); + InferenceContext c(&def, MakeOpDef(2, 2), {Unknown(), S({1, -1, 3})}, {}, {}); auto in0 = c.input(0); auto in1 = c.input(1); @@ -284,7 +284,7 @@ TEST_F(ShapeInferenceTest, WithRankAtMost) { TEST_F(ShapeInferenceTest, WithRankAtLeast) { NodeDef def; - InferenceContext c(&def, MakeOpDef(2, 2), {Unknown(), S({1, -1, 3})}, {}); + InferenceContext c(&def, MakeOpDef(2, 2), {Unknown(), S({1, -1, 3})}, {}, {}); auto in0 = c.input(0); auto in1 = c.input(1); @@ -322,7 +322,7 @@ TEST_F(ShapeInferenceTest, WithRankAtLeast) { TEST_F(ShapeInferenceTest, WithValue) { NodeDef def; - InferenceContext c(&def, MakeOpDef(1, 2), {S({1, -1})}, {}); + InferenceContext c(&def, MakeOpDef(1, 2), {S({1, -1})}, {}, {}); auto d0 = c.Dim(c.input(0), 0); auto d1 = c.Dim(c.input(0), 1); @@ -363,7 +363,7 @@ TEST_F(ShapeInferenceTest, WithValue) { TEST_F(ShapeInferenceTest, MergeDim) { NodeDef def; - InferenceContext c(&def, MakeOpDef(1, 2), {S({2, -1, 2, 1, -1})}, {}); + InferenceContext c(&def, MakeOpDef(1, 2), {S({2, -1, 2, 1, -1})}, {}, {}); auto d2 = c.Dim(c.input(0), 0); auto d_unknown = c.Dim(c.input(0), 1); @@ -412,7 +412,7 @@ TEST_F(ShapeInferenceTest, MergeShape) { InferenceContext c(&def, MakeOpDef(7, 2), {Unknown(), S({1, 2}), S({-1, 2}), S({1, -1}), S({1, 3}), Unknown(), S({1})}, - {}); + {}, {}); auto s_unknown = c.input(0); auto s_1_2 = c.input(1); @@ 
-483,7 +483,7 @@ TEST_F(ShapeInferenceTest, MergePrefix) { { Unknown(), S({-1, 2}), S({1, -1, 3}), S({2, 4}), }, - {}); + {}, {}); auto s_unknown = c.input(0); auto s_u_2 = c.input(1); @@ -536,7 +536,7 @@ TEST_F(ShapeInferenceTest, MergePrefix) { TEST_F(ShapeInferenceTest, Subshape) { NodeDef def; InferenceContext c(&def, MakeOpDef(2, 2), {S({1, 2, 3, -1, 5}), Unknown()}, - {}); + {}, {}); ShapeHandle unknown = c.input(1); ShapeHandle out; @@ -611,7 +611,7 @@ TEST_F(ShapeInferenceTest, Subshape) { TEST_F(ShapeInferenceTest, Concatenate) { NodeDef def; InferenceContext c(&def, MakeOpDef(3, 2), - {S({1, -1, 3}), S({4, 5}), Unknown()}, {}); + {S({1, -1, 3}), S({4, 5}), Unknown()}, {}, {}); auto in0 = c.input(0); auto in1 = c.input(1); @@ -637,7 +637,7 @@ TEST_F(ShapeInferenceTest, Concatenate) { TEST_F(ShapeInferenceTest, ReplaceDim) { NodeDef def; - InferenceContext c(&def, MakeOpDef(2, 0), {S({1, 2, 3}), Unknown()}, {}); + InferenceContext c(&def, MakeOpDef(2, 0), {S({1, 2, 3}), Unknown()}, {}, {}); auto in = c.input(0); auto unknown = c.input(1); @@ -668,7 +668,7 @@ TEST_F(ShapeInferenceTest, ReplaceDim) { TEST_F(ShapeInferenceTest, MakeShape) { NodeDef def; - InferenceContext c(&def, MakeOpDef(1, 2), {S({1, 2, 3, -1, 5})}, {}); + InferenceContext c(&def, MakeOpDef(1, 2), {S({1, 2, 3, -1, 5})}, {}, {}); std::vector dims; auto in0 = c.input(0); @@ -693,7 +693,7 @@ TEST_F(ShapeInferenceTest, MakeShape) { TEST_F(ShapeInferenceTest, UnknownShape) { NodeDef def; std::vector empty; - InferenceContext c(&def, MakeOpDef(0, 2), empty, {}); + InferenceContext c(&def, MakeOpDef(0, 2), empty, {}, {}); auto u0 = c.UnknownShape(); auto u1 = c.UnknownShape(); @@ -705,7 +705,7 @@ TEST_F(ShapeInferenceTest, UnknownShape) { TEST_F(ShapeInferenceTest, Scalar) { NodeDef def; std::vector empty; - InferenceContext c(&def, MakeOpDef(0, 2), empty, {}); + InferenceContext c(&def, MakeOpDef(0, 2), empty, {}, {}); auto s0 = c.Scalar(); EXPECT_EQ("[]", c.DebugString(s0)); @@ -716,7 +716,7 @@ 
TEST_F(ShapeInferenceTest, Scalar) { TEST_F(ShapeInferenceTest, Vector) { NodeDef def; std::vector empty; - InferenceContext c(&def, MakeOpDef(0, 2), empty, {}); + InferenceContext c(&def, MakeOpDef(0, 2), empty, {}, {}); auto s0 = c.Vector(1); EXPECT_EQ("[1]", c.DebugString(s0)); @@ -732,7 +732,7 @@ TEST_F(ShapeInferenceTest, Vector) { TEST_F(ShapeInferenceTest, Matrix) { NodeDef def; std::vector empty; - InferenceContext c(&def, MakeOpDef(0, 2), empty, {}); + InferenceContext c(&def, MakeOpDef(0, 2), empty, {}, {}); auto s0 = c.Matrix(1, 2); EXPECT_EQ("[1,2]", c.DebugString(s0)); @@ -754,7 +754,7 @@ TEST_F(ShapeInferenceTest, Matrix) { TEST_F(ShapeInferenceTest, MakeShapeFromShapeTensor) { auto create = [&](Tensor* t) { NodeDef def; - InferenceContext c(&def, MakeOpDef(1, 0), {Unknown()}, {t}); + InferenceContext c(&def, MakeOpDef(1, 0), {Unknown()}, {t}, {}); ShapeHandle out; Status s = c.MakeShapeFromShapeTensor(0, &out); if (s.ok()) { @@ -774,6 +774,9 @@ TEST_F(ShapeInferenceTest, MakeShapeFromShapeTensor) { t = ::tensorflow::test::AsTensor({3, 2, 1}); EXPECT_EQ("[3,2,1]", create(&t)); + t = ::tensorflow::test::AsTensor({3, -1, 1}); + EXPECT_EQ("[3,?,1]", create(&t)); + t = ::tensorflow::test::AsTensor({}); EXPECT_EQ("[]", create(&t)); @@ -790,10 +793,20 @@ TEST_F(ShapeInferenceTest, MakeShapeFromShapeTensor) { EXPECT_TRUE(StringPiece(create(&t)) .contains("Input tensor must be rank 1, but was rank 2")); + // Test negative values for the dims. + t = ::tensorflow::test::AsTensor({3, -2, 1}); + EXPECT_TRUE(StringPiece(create(&t)) + .contains("Invalid value in tensor used for shape: -2")); + + // Test negative values for the dims. + t = ::tensorflow::test::AsTensor({3, -2, 1}); + EXPECT_TRUE(StringPiece(create(&t)) + .contains("Invalid value in tensor used for shape: -2")); + // Test when the input shape is wrong. 
{ NodeDef def; - InferenceContext c(&def, MakeOpDef(1, 0), {S({1, -1})}, {nullptr}); + InferenceContext c(&def, MakeOpDef(1, 0), {S({1, -1})}, {nullptr}, {}); ShapeHandle out; EXPECT_EQ("Shape must be rank 1 but is rank 2", c.MakeShapeFromShapeTensor(0, &out).error_message()); @@ -803,7 +816,7 @@ TEST_F(ShapeInferenceTest, MakeShapeFromShapeTensor) { TEST_F(ShapeInferenceTest, MakeShapeFromShapeProto) { NodeDef def; std::vector empty; - InferenceContext c(&def, MakeOpDef(0, 2), empty, {}); + InferenceContext c(&def, MakeOpDef(0, 2), empty, {}, {}); TensorShapeProto proto; // With a set unknown rank. @@ -839,7 +852,7 @@ TEST_F(ShapeInferenceTest, MakeShapeFromShapeProto) { TEST_F(ShapeInferenceTest, MakeDim) { NodeDef def; std::vector empty; - InferenceContext c(&def, MakeOpDef(0, 2), empty, {}); + InferenceContext c(&def, MakeOpDef(0, 2), empty, {}, {}); auto d0 = c.MakeDim(1); auto d1 = c.MakeDim(1); @@ -853,7 +866,7 @@ TEST_F(ShapeInferenceTest, MakeDim) { TEST_F(ShapeInferenceTest, UnknownDim) { NodeDef def; std::vector empty; - InferenceContext c(&def, MakeOpDef(0, 2), empty, {}); + InferenceContext c(&def, MakeOpDef(0, 2), empty, {}, {}); auto d0 = c.UnknownDim(); auto d1 = c.UnknownDim(); @@ -865,7 +878,7 @@ TEST_F(ShapeInferenceTest, UnknownDim) { TEST_F(ShapeInferenceTest, UnknownShapeOfRank) { NodeDef def; std::vector empty; - InferenceContext c(&def, MakeOpDef(0, 2), empty, {}); + InferenceContext c(&def, MakeOpDef(0, 2), empty, {}, {}); auto unknown_shape_of_rank_3 = c.UnknownShapeOfRank(3); EXPECT_EQ("[?,?,?]", c.DebugString(unknown_shape_of_rank_3)); @@ -879,7 +892,7 @@ TEST_F(ShapeInferenceTest, InputTensors) { const Tensor t2 = tensorflow::test::AsTensor({20, 30}); NodeDef def; InferenceContext c(&def, MakeOpDef(3, 2), {S({1}), S({2}), S({3})}, - {&t1, &t2}); + {&t1, &t2}, {}); EXPECT_TRUE(c.input_tensor(0) == &t1); EXPECT_TRUE(c.input_tensor(1) == &t2); @@ -890,7 +903,7 @@ TEST_F(ShapeInferenceTest, MakeDimForScalarInput) { Tensor t1 = 
tensorflow::test::AsScalar(20); Tensor t2 = tensorflow::test::AsScalar(-1); NodeDef def; - InferenceContext c(&def, MakeOpDef(2, 2), {S({}), S({})}, {&t1, &t2}); + InferenceContext c(&def, MakeOpDef(2, 2), {S({}), S({})}, {&t1, &t2}, {}); DimensionHandle d; EXPECT_TRUE(c.MakeDimForScalarInput(0, &d).ok()); @@ -921,7 +934,7 @@ TEST_F(ShapeInferenceTest, GetAttr) { .ok()); std::vector empty; - InferenceContext c(&def, op_reg_data.op_def, empty, {}); + InferenceContext c(&def, op_reg_data.op_def, empty, {}, {}); string value; EXPECT_TRUE(c.GetAttr("foo", &value).ok()); EXPECT_EQ("bar", value); @@ -929,11 +942,14 @@ TEST_F(ShapeInferenceTest, GetAttr) { TEST_F(ShapeInferenceTest, Divide) { NodeDef def; - InferenceContext c(&def, MakeOpDef(1, 2), {S({6, -1})}, {}); + InferenceContext c(&def, MakeOpDef(1, 2), {S({6, -1, 1, 2, 0})}, {}, {}); auto s = c.input(0); auto d_6 = c.Dim(s, 0); auto d_unknown = c.Dim(s, 1); + auto d_1 = c.Dim(s, 2); + auto d_2 = c.Dim(s, 3); + auto d_0 = c.Dim(s, 4); bool evenly_divisible = true; // Dividing unknown by non-1 gives new unknown. 
@@ -947,9 +963,15 @@ TEST_F(ShapeInferenceTest, Divide) { EXPECT_TRUE(SameHandle(out, d_unknown)); EXPECT_TRUE(c.Divide(d_6, 1, evenly_divisible, &out).ok()); EXPECT_TRUE(SameHandle(out, d_6)); + EXPECT_TRUE(c.Divide(d_unknown, d_1, evenly_divisible, &out).ok()); + EXPECT_TRUE(SameHandle(out, d_unknown)); + EXPECT_TRUE(c.Divide(d_6, d_1, evenly_divisible, &out).ok()); + EXPECT_TRUE(SameHandle(out, d_6)); EXPECT_TRUE(c.Divide(d_6, 2, evenly_divisible, &out).ok()); EXPECT_EQ("3", c.DebugString(out)); + EXPECT_TRUE(c.Divide(d_6, d_2, evenly_divisible, &out).ok()); + EXPECT_EQ("3", c.DebugString(out)); EXPECT_TRUE( StringPiece(c.Divide(d_6, 5, evenly_divisible, &out).error_message()) @@ -958,6 +980,9 @@ TEST_F(ShapeInferenceTest, Divide) { EXPECT_TRUE( StringPiece(c.Divide(d_6, 0, evenly_divisible, &out).error_message()) .contains("Divisor must be positive but is 0")); + EXPECT_TRUE( + StringPiece(c.Divide(d_6, d_0, evenly_divisible, &out).error_message()) + .contains("Divisor must be positive but is 0")); EXPECT_TRUE( StringPiece(c.Divide(d_6, -1, evenly_divisible, &out).error_message()) @@ -979,7 +1004,7 @@ TEST_F(ShapeInferenceTest, Divide) { TEST_F(ShapeInferenceTest, Add) { NodeDef def; - InferenceContext c(&def, MakeOpDef(1, 2), {S({6, -1, 0})}, {}); + InferenceContext c(&def, MakeOpDef(1, 2), {S({6, -1, 0})}, {}, {}); auto s = c.input(0); auto d_6 = c.Dim(s, 0); @@ -1030,7 +1055,7 @@ TEST_F(ShapeInferenceTest, Add) { TEST_F(ShapeInferenceTest, Subtract) { NodeDef def; - InferenceContext c(&def, MakeOpDef(1, 2), {S({6, -1, 0, 5})}, {}); + InferenceContext c(&def, MakeOpDef(1, 2), {S({6, -1, 0, 5})}, {}, {}); auto s = c.input(0); auto d_6 = c.Dim(s, 0); @@ -1079,7 +1104,7 @@ TEST_F(ShapeInferenceTest, Subtract) { TEST_F(ShapeInferenceTest, Multiply) { NodeDef def; - InferenceContext c(&def, MakeOpDef(1, 2), {S({6, -1, 0, 1})}, {}); + InferenceContext c(&def, MakeOpDef(1, 2), {S({6, -1, 0, 1})}, {}, {}); auto s = c.input(0); auto d_6 = c.Dim(s, 0); @@ -1132,7 
+1157,7 @@ TEST_F(ShapeInferenceTest, Multiply) { TEST_F(ShapeInferenceTest, FullyDefined) { NodeDef def; std::vector empty; - InferenceContext c(&def, MakeOpDef(0, 2), empty, {}); + InferenceContext c(&def, MakeOpDef(0, 2), empty, {}, {}); // No rank or missing dimension information should return false. EXPECT_FALSE(c.FullyDefined(c.UnknownShape())); @@ -1145,7 +1170,7 @@ TEST_F(ShapeInferenceTest, FullyDefined) { TEST_F(ShapeInferenceTest, Min) { NodeDef def; - InferenceContext c(&def, MakeOpDef(1, 2), {S({1, 2, -1, 0})}, {}); + InferenceContext c(&def, MakeOpDef(1, 2), {S({1, 2, -1, 0})}, {}, {}); auto s = c.input(0); auto d_1 = c.Dim(s, 0); @@ -1193,7 +1218,7 @@ TEST_F(ShapeInferenceTest, Min) { TEST_F(ShapeInferenceTest, Max) { NodeDef def; - InferenceContext c(&def, MakeOpDef(1, 2), {S({1, 2, -1})}, {}); + InferenceContext c(&def, MakeOpDef(1, 2), {S({1, 2, -1})}, {}, {}); auto s = c.input(0); auto d_1 = c.Dim(s, 0); @@ -1231,7 +1256,7 @@ TEST_F(ShapeInferenceTest, Max) { TEST_F(ShapeInferenceTest, ValidateSparseTensor_UnknownShapes) { NodeDef def; InferenceContext c(&def, MakeOpDef(3, 1), {Unknown(), Unknown(), Unknown()}, - {}); + {}, {}); EXPECT_EQ(3, c.num_inputs()); EXPECT_EQ(1, c.num_outputs()); @@ -1243,7 +1268,7 @@ TEST_F(ShapeInferenceTest, ValidateSparseTensor_UnknownShapes) { TEST_F(ShapeInferenceTest, ValidateSparseTensor_UnknownDims) { NodeDef def; - InferenceContext c(&def, MakeOpDef(3, 1), {S({-1, -1}), S({-1}), S({-1})}, + InferenceContext c(&def, MakeOpDef(3, 1), {S({-1, -1}), S({-1}), S({-1})}, {}, {}); EXPECT_EQ(3, c.num_inputs()); EXPECT_EQ(1, c.num_outputs()); @@ -1256,7 +1281,8 @@ TEST_F(ShapeInferenceTest, ValidateSparseTensor_UnknownDims) { TEST_F(ShapeInferenceTest, ValidateSparseTensor_InvalidIndicesRank) { NodeDef def; - InferenceContext c(&def, MakeOpDef(3, 1), {S({-1}), S({-1}), S({-1})}, {}); + InferenceContext c(&def, MakeOpDef(3, 1), {S({-1}), S({-1}), S({-1})}, {}, + {}); EXPECT_EQ(3, c.num_inputs()); EXPECT_EQ(1, 
c.num_outputs()); @@ -1269,7 +1295,8 @@ TEST_F(ShapeInferenceTest, ValidateSparseTensor_InvalidIndicesRank) { TEST_F(ShapeInferenceTest, ValidateSparseTensor_InvalidNumElements) { NodeDef def; - InferenceContext c(&def, MakeOpDef(3, 1), {S({5, 3}), S({4}), S({3})}, {}); + InferenceContext c(&def, MakeOpDef(3, 1), {S({5, 3}), S({4}), S({3})}, {}, + {}); EXPECT_EQ(3, c.num_inputs()); EXPECT_EQ(1, c.num_outputs()); @@ -1282,7 +1309,8 @@ TEST_F(ShapeInferenceTest, ValidateSparseTensor_InvalidNumElements) { TEST_F(ShapeInferenceTest, ValidateSparseTensor_InvalidRank) { NodeDef def; - InferenceContext c(&def, MakeOpDef(3, 1), {S({5, 3}), S({5}), S({4})}, {}); + InferenceContext c(&def, MakeOpDef(3, 1), {S({5, 3}), S({5}), S({4})}, {}, + {}); EXPECT_EQ(3, c.num_inputs()); EXPECT_EQ(1, c.num_outputs()); @@ -1295,7 +1323,8 @@ TEST_F(ShapeInferenceTest, ValidateSparseTensor_InvalidRank) { TEST_F(ShapeInferenceTest, ValidateSparseTensor_UnknownNumIndexElements) { NodeDef def; - InferenceContext c(&def, MakeOpDef(3, 1), {S({-1, 3}), S({5}), S({3})}, {}); + InferenceContext c(&def, MakeOpDef(3, 1), {S({-1, 3}), S({5}), S({3})}, {}, + {}); EXPECT_EQ(3, c.num_inputs()); EXPECT_EQ(1, c.num_outputs()); @@ -1307,7 +1336,8 @@ TEST_F(ShapeInferenceTest, ValidateSparseTensor_UnknownNumIndexElements) { TEST_F(ShapeInferenceTest, ValidateSparseTensor_UnknownNumValueElements) { NodeDef def; - InferenceContext c(&def, MakeOpDef(3, 1), {S({5, 3}), S({-1}), S({3})}, {}); + InferenceContext c(&def, MakeOpDef(3, 1), {S({5, 3}), S({-1}), S({3})}, {}, + {}); EXPECT_EQ(3, c.num_inputs()); EXPECT_EQ(1, c.num_outputs()); @@ -1319,7 +1349,8 @@ TEST_F(ShapeInferenceTest, ValidateSparseTensor_UnknownNumValueElements) { TEST_F(ShapeInferenceTest, ValidateSparseTensor_UnknownIndexRank) { NodeDef def; - InferenceContext c(&def, MakeOpDef(3, 1), {S({5, -1}), S({5}), S({3})}, {}); + InferenceContext c(&def, MakeOpDef(3, 1), {S({5, -1}), S({5}), S({3})}, {}, + {}); EXPECT_EQ(3, c.num_inputs()); EXPECT_EQ(1, 
c.num_outputs()); @@ -1331,7 +1362,8 @@ TEST_F(ShapeInferenceTest, ValidateSparseTensor_UnknownIndexRank) { TEST_F(ShapeInferenceTest, ValidateSparseTensor_UnknownShapeRank) { NodeDef def; - InferenceContext c(&def, MakeOpDef(3, 1), {S({5, 3}), S({5}), S({-1})}, {}); + InferenceContext c(&def, MakeOpDef(3, 1), {S({5, 3}), S({5}), S({-1})}, {}, + {}); EXPECT_EQ(3, c.num_inputs()); EXPECT_EQ(1, c.num_outputs()); @@ -1343,7 +1375,8 @@ TEST_F(ShapeInferenceTest, ValidateSparseTensor_UnknownShapeRank) { TEST_F(ShapeInferenceTest, ValidateSparseTensor) { NodeDef def; - InferenceContext c(&def, MakeOpDef(3, 1), {S({5, 3}), S({5}), S({3})}, {}); + InferenceContext c(&def, MakeOpDef(3, 1), {S({5, 3}), S({5}), S({3})}, {}, + {}); EXPECT_EQ(3, c.num_inputs()); EXPECT_EQ(1, c.num_outputs()); diff --git a/tensorflow/core/framework/shape_inference_testutil.cc b/tensorflow/core/framework/shape_inference_testutil.cc index 6cad1f8efaa..ed1d3ec5201 100644 --- a/tensorflow/core/framework/shape_inference_testutil.cc +++ b/tensorflow/core/framework/shape_inference_testutil.cc @@ -44,7 +44,8 @@ Status ShapeInferenceTestutil::InferShapes(ShapeInferenceTestOp op, } shape_inference::InferenceContext c(&op.node_def, op_reg_data->op_def, - in_shapes, op.input_tensors); + in_shapes, op.input_tensors, + {} /* input_tensors_as_shapes */); TF_RETURN_IF_ERROR(c.construction_status()); if (op_reg_data->shape_inference_fn == nullptr) { return errors::InvalidArgument( diff --git a/tensorflow/core/graph/costmodel.cc b/tensorflow/core/graph/costmodel.cc index 023014671c1..f6429806fe8 100644 --- a/tensorflow/core/graph/costmodel.cc +++ b/tensorflow/core/graph/costmodel.cc @@ -243,6 +243,11 @@ void CostModel::RecordMaxMemorySize(const Node* node, int output_slot, if (id < 0) return; Ensure(id); auto& current_max = max_mem_usage_[id].output_port_mem[output_slot]; + // If the memory allocator doesn't track memory usage, let's infer a lower + // bound from the tensor shape and its data type. 
+ if (bytes.value() < 0) { + bytes = MinTensorMemoryUsage(tensor_shape, dtype); + } if (bytes.value() > current_max.value()) { current_max = bytes.value(); max_mem_usage_[id].output_port_shape[output_slot] = tensor_shape; @@ -476,4 +481,18 @@ void CostModel::WriteSummaryToLog() const { } } +Bytes CostModel::MinTensorMemoryUsage(const TensorShapeProto& tensor_shape, + const DataType& dtype) { + if (tensor_shape.unknown_rank()) { + return Bytes(-1); + } + + size_t num_coefficients = 1; + for (const TensorShapeProto::Dim& dim : tensor_shape.dim()) { + // If the dimension is unknown, it has to be at least 1 + num_coefficients *= std::max(dim.size(), 1); + } + return Bytes(num_coefficients * DataTypeSize(dtype)); +} + } // namespace tensorflow diff --git a/tensorflow/core/graph/costmodel.h b/tensorflow/core/graph/costmodel.h index 95bd0b9da17..0d942338b08 100644 --- a/tensorflow/core/graph/costmodel.h +++ b/tensorflow/core/graph/costmodel.h @@ -159,6 +159,9 @@ class CostModel { void WriteSummaryToLog() const; private: + static Bytes MinTensorMemoryUsage(const TensorShapeProto& tensor_shape, + const DataType& dtype); + const bool is_global_; // Resizes vectors so that they are large enough for "id". diff --git a/tensorflow/core/graph/graph_constructor.cc b/tensorflow/core/graph/graph_constructor.cc index 7acdfaa70a2..92d35977f9f 100644 --- a/tensorflow/core/graph/graph_constructor.cc +++ b/tensorflow/core/graph/graph_constructor.cc @@ -355,33 +355,19 @@ Status GraphConstructor::ValidateShape(Node* node) { // functions that are not critical to correct execution but // would cause graphs to fail if imported after correcting. // - // This can be removed after 2017/03/08. 
const string& op = node->def().op(); - const std::vector whitelist = {"RandomShuffleQueue", - "PaddingFIFOQueue", - "FIFOQueue", - "PriorityQueue", - "QueueSize", - "Stack", - "Barrier", - "BarrierReadySize", - "BarrierIncompleteSize", - "HashTable", - "MutableHashTable", - "MutableHashTableOfTensors", - "Mutex", - "CuckooTable", - "IndexTable", - "WholeFileReader", - "TextLineReader", - "FixedLengthRecordReader", - "TFRecordReader", - "IdentityReader", - "RefSwitch", - "RefEnter", - "RefNextIteration", - "RefMerge", - "RefIdentity"}; + const std::vector whitelist = { + // To be removed after 2017/03/08. + "RandomShuffleQueue", "PaddingFIFOQueue", "FIFOQueue", + "PriorityQueue", "QueueSize", "Stack", "Barrier", "BarrierReadySize", + "BarrierIncompleteSize", "HashTable", "MutableHashTable", + "MutableHashTableOfTensors", "Mutex", "CuckooTable", "IndexTable", + "WholeFileReader", "TextLineReader", "FixedLengthRecordReader", + "TFRecordReader", "IdentityReader", "RefSwitch", "RefEnter", + "RefNextIteration", "RefMerge", "RefIdentity", + // To be removed after 2017/04/24. 
+ "ConditionalAccumulator", "SparseConditionalAccumulator", "Table", + }; if (std::find(whitelist.begin(), whitelist.end(), op) == whitelist.end()) { return errors::InvalidArgument( diff --git a/tensorflow/core/graph/graph_partition.cc b/tensorflow/core/graph/graph_partition.cc index 454cb2aa615..3275cde762c 100644 --- a/tensorflow/core/graph/graph_partition.cc +++ b/tensorflow/core/graph/graph_partition.cc @@ -77,7 +77,6 @@ struct ControlFlowInfo { const Node* frame = nullptr; // frame of a node const Node* parent_frame = nullptr; // parent frame of a node string frame_name; // frame name of a node - int iter_level = -1; // level of a node }; struct PairIntHash { @@ -365,11 +364,13 @@ Status BuildControlFlowInfo(Graph* g, std::vector* info) { info->clear(); info->resize(g->num_node_ids()); + std::vector parent_nodes; + parent_nodes.resize(g->num_node_ids()); + Node* src_node = g->source_node(); ControlFlowInfo& src_info = (*info)[src_node->id()]; src_info.frame = src_node; src_info.parent_frame = src_node; - src_info.iter_level = 0; string frame_name; std::deque ready; @@ -381,7 +382,6 @@ Status BuildControlFlowInfo(Graph* g, std::vector* info) { const Node* frame = curr_info.frame; const Node* parent = curr_info.parent_frame; frame_name = curr_info.frame_name; - int iter_level = curr_info.iter_level; if (IsExit(curr_node)) { // Exit to the parent frame. @@ -389,7 +389,6 @@ Status BuildControlFlowInfo(Graph* g, std::vector* info) { frame = parent_info.frame; parent = parent_info.parent_frame; frame_name = parent_info.frame_name; - iter_level = parent_info.iter_level; } // Optimize colocation for control flow nodes. @@ -400,23 +399,29 @@ Status BuildControlFlowInfo(Graph* g, std::vector* info) { int out_id = out->id(); ControlFlowInfo* out_info = &(*info)[out_id]; const Node* out_parent = out_info->parent_frame; - bool is_visited = (out_info->iter_level != -1); + bool is_visited = (parent_nodes[out_id] != nullptr); // Skip Sink/Source nodes. 
if (!out->IsOp()) continue; // Add to ready queue if not seen. if (!is_visited) { + parent_nodes[out->id()] = curr_node; ready.push_back(out); } // Process the node 'out'. if (IsEnter(out)) { if (is_visited) { - const string& parent_name = (*info)[out_parent->id()].frame_name; - if (parent_name != frame_name || iter_level != out_info->iter_level) { - return errors::InvalidArgument("All inputs to node ", out->name(), - " must be from the same frame."); + const string& parent_frame = (*info)[out_parent->id()].frame_name; + if (parent_frame != frame_name) { + return errors::InvalidArgument( + "The node '", out->name(), + "' has inputs from different " + "frames. The input '", + curr_node->name(), "' is in frame '", frame_name, + "'. The input '", parent_nodes[out->id()]->name(), + "' is in frame '", parent_frame, "'."); } } else { out_info->frame = out; @@ -427,36 +432,26 @@ Status BuildControlFlowInfo(Graph* g, std::vector* info) { return errors::InvalidArgument("The Enter node ", out->name(), " must have a frame name."); } - out_info->iter_level = 0; - } - } else if (IsNextIteration(out)) { - if (is_visited) { - if (out_info->frame_name != frame_name) { - return errors::InvalidArgument("All inputs to node ", out->name(), - " must be from the same frame."); - } - } else { - out_info->frame = frame; - out_info->parent_frame = parent; - out_info->frame_name = frame_name; - out_info->iter_level = iter_level + 1; } } else { if (is_visited) { if (out_info->frame_name != frame_name) { - return errors::InvalidArgument("All inputs to node ", out->name(), - " must be from the same frame."); + return errors::InvalidArgument( + "The node '", out->name(), + "' has inputs from different " + "frames. The input '", + curr_node->name(), "' is in frame '", frame_name, + "'. 
The input '", parent_nodes[out->id()]->name(), + "' is in frame '", out_info->frame_name, "'."); } } else { out_info->frame = frame; out_info->parent_frame = parent; out_info->frame_name = frame_name; - out_info->iter_level = iter_level; } } } } - return Status::OK(); } @@ -559,7 +554,6 @@ void AddControlFlowInfo(const Node* node, const Node* src, info->frame = src_info.frame; info->parent_frame = src_info.parent_frame; info->frame_name = src_info.frame_name; - info->iter_level = src_info.iter_level; } // Constructs a control loop. Returns a struct containing the newly created diff --git a/tensorflow/core/graph/node_builder.cc b/tensorflow/core/graph/node_builder.cc index 27d89295958..46e54c9eabe 100644 --- a/tensorflow/core/graph/node_builder.cc +++ b/tensorflow/core/graph/node_builder.cc @@ -129,7 +129,7 @@ Status NodeBuilder::Finalize(Graph* graph, Node** created_node) const { void NodeBuilder::AddIndexError(Node* node, int i) { if (node == nullptr) { errors_.emplace_back( - strings::StrCat("Attempt to add nullptr Node to node with type", + strings::StrCat("Attempt to add nullptr Node to node with type ", def_builder_.op_def().name())); } else { errors_.emplace_back( diff --git a/tensorflow/core/graph/types.h b/tensorflow/core/graph/types.h index accd2cd888b..c7078099277 100644 --- a/tensorflow/core/graph/types.h +++ b/tensorflow/core/graph/types.h @@ -24,6 +24,9 @@ namespace tensorflow { // We model running time in microseconds. TF_LIB_GTL_DEFINE_INT_TYPE(Microseconds, int64); +// We can also model running time in nanoseconds for more accuracy. +TF_LIB_GTL_DEFINE_INT_TYPE(Nanoseconds, int64); + // We model size in bytes. 
TF_LIB_GTL_DEFINE_INT_TYPE(Bytes, int64); diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 94e685731c8..34954f00664 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -57,6 +57,7 @@ tf_kernel_library( name = "strided_slice_op", srcs = [ "strided_slice_op.cc", + "strided_slice_op_inst_0.cc", "strided_slice_op_inst_1.cc", "strided_slice_op_inst_2.cc", "strided_slice_op_inst_3.cc", @@ -404,7 +405,6 @@ ARRAY_DEPS = [ "//tensorflow/core:array_ops_op_lib", "//tensorflow/core:core_cpu", "//tensorflow/core:framework", - "//tensorflow/core:gpu_runtime", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:proto_text", @@ -419,7 +419,9 @@ tf_kernel_libraries( "debug_ops", "immutable_constant_op", ], - deps = ARRAY_DEPS, + deps = ARRAY_DEPS + [ + "//tensorflow/core:gpu_runtime", + ], ) tf_kernel_libraries( @@ -563,6 +565,24 @@ tf_cc_test( ], ) +tf_cc_test( + name = "fake_quant_ops_test", + size = "small", + srcs = ["fake_quant_ops_test.cc"], + deps = [ + ":fake_quant_ops", + ":ops_testutil", + ":ops_util", + "//tensorflow/core:core_cpu", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + ], +) + tf_cc_test( name = "fused_batch_norm_op_test", size = "small", @@ -1058,6 +1078,7 @@ tf_kernel_libraries( ":image_resizer_state", "//tensorflow/core:framework", "//tensorflow/core:image_ops_op_lib", + "//tensorflow/core:jpeg", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", @@ -1710,6 +1731,22 @@ tf_kernel_library( ], ) +tf_kernel_library( + name = "fake_quant_ops", + srcs = ["fake_quant_ops.cc"], + hdrs = ["fake_quant_ops_functor.h"], + gpu_srcs = [ + "fake_quant_ops_gpu.cu.cc", + "fake_quant_ops_functor.h", + ], + deps = [ + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//third_party/eigen3", + ], + 
alwayslink = 1, +) + tf_kernel_library( name = "fused_batch_norm_util", gpu_srcs = [ @@ -2226,6 +2263,7 @@ filegroup( "strided_slice_op.cc", "strided_slice_op.h", "strided_slice_op_impl.h", + "strided_slice_op_inst_0.cc", "strided_slice_op_inst_1.cc", "strided_slice_op_inst_2.cc", "strided_slice_op_inst_3.cc", @@ -2393,6 +2431,8 @@ filegroup( name = "android_quantized_ops", srcs = [ "dequantize_op.cc", + "meta_support.cc", + "meta_support.h", "quantization_utils.cc", "quantization_utils.h", "quantize_down_and_shrink_range.cc", @@ -2406,6 +2446,7 @@ filegroup( "quantized_pooling_ops.cc", "quantized_reshape_op.cc", "reference_gemm.h", + "requantization_range_op.cc", "requantize.cc", "reshape_op.h", ], @@ -2493,6 +2534,7 @@ tf_kernel_library( name = "quantized_ops", srcs = [ "dequantize_op.cc", + "meta_support.cc", "quantization_utils.cc", "quantize_down_and_shrink_range.cc", "quantize_op.cc", @@ -2504,10 +2546,12 @@ tf_kernel_library( "quantized_matmul_op.cc", "quantized_pooling_ops.cc", "quantized_reshape_op.cc", + "requantization_range_op.cc", "requantize.cc", "reshape_op.h", ], hdrs = [ + "meta_support.h", "quantization_utils.h", "reference_gemm.h", ], @@ -2528,6 +2572,22 @@ tf_kernel_library( ], ) +tf_cc_test( + name = "requantization_range_op_test", + size = "small", + srcs = ["requantization_range_op_test.cc"], + deps = [ + ":quantized_ops", + "//tensorflow/core:framework", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + "//tensorflow/core/kernels:ops_testutil", + "//tensorflow/core/kernels:ops_util", + ], +) + tf_cc_test( name = "quantize_down_and_shrink_range_op_test", size = "small", @@ -2621,6 +2681,7 @@ tf_cc_test( name = "quantized_conv_ops_test", size = "small", srcs = ["quantized_conv_ops_test.cc"], + tags = ["nomsan"], # http://b/32242946 deps = [ ":quantized_ops", "//tensorflow/core:array_ops_op_lib", @@ -2659,6 +2720,7 @@ tf_cc_test( name = "quantized_matmul_op_test", 
size = "small", srcs = ["quantized_matmul_op_test.cc"], + tags = ["nomsan"], # http://b/32242946 deps = [ ":quantized_ops", "//tensorflow/core:array_ops_op_lib", diff --git a/tensorflow/core/kernels/constant_op.cc b/tensorflow/core/kernels/constant_op.cc index a743be66124..dba37ca396d 100644 --- a/tensorflow/core/kernels/constant_op.cc +++ b/tensorflow/core/kernels/constant_op.cc @@ -209,6 +209,7 @@ TF_CALL_ALL_TYPES(REGISTER_CPU); #undef REGISTER_CPU #if GOOGLE_CUDA +REGISTER_KERNEL(bool, GPU); REGISTER_KERNEL(Eigen::half, GPU); REGISTER_KERNEL(float, GPU); REGISTER_KERNEL(double, GPU); diff --git a/tensorflow/core/kernels/constant_op_gpu.cu.cc b/tensorflow/core/kernels/constant_op_gpu.cu.cc index 29f39a72f39..f12cf3fe7fd 100644 --- a/tensorflow/core/kernels/constant_op_gpu.cu.cc +++ b/tensorflow/core/kernels/constant_op_gpu.cu.cc @@ -89,6 +89,7 @@ struct SetZeroFunctor { }; #define DEFINE_SETZERO_GPU(T) template struct SetZeroFunctor +DEFINE_SETZERO_GPU(bool); DEFINE_SETZERO_GPU(Eigen::half); DEFINE_SETZERO_GPU(float); DEFINE_SETZERO_GPU(double); diff --git a/tensorflow/core/kernels/cwise_ops.h b/tensorflow/core/kernels/cwise_ops.h index e7d4c4778ea..572a729b34b 100644 --- a/tensorflow/core/kernels/cwise_ops.h +++ b/tensorflow/core/kernels/cwise_ops.h @@ -237,7 +237,7 @@ struct functor_traits> { }; // TODO(b/32239616): This kernel should be moved into Eigen and vectorized. 
-template +template struct google_floor_div { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(const T& x, const T& y) const { @@ -251,6 +251,15 @@ struct google_floor_div { } }; +template +struct google_floor_div< + T, typename std::enable_if::value>::type> { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(const T& x, + const T& y) const { + return x / y; + } +}; + template struct functor_traits> { enum { diff --git a/tensorflow/core/kernels/dequantize_op.cc b/tensorflow/core/kernels/dequantize_op.cc index 375287000eb..c28909e03ba 100644 --- a/tensorflow/core/kernels/dequantize_op.cc +++ b/tensorflow/core/kernels/dequantize_op.cc @@ -17,11 +17,12 @@ limitations under the License. #define EIGEN_USE_THREADS -#include "tensorflow/core/kernels/quantization_utils.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/type_traits.h" #include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/meta_support.h" +#include "tensorflow/core/kernels/quantization_utils.h" #include "tensorflow/core/lib/core/errors.h" namespace { @@ -75,9 +76,15 @@ class DequantizeOp : public OpKernel { scale_factor) + min_range; } else if (mode_ == QUANTIZE_MODE_MIN_FIRST) { - QuantizedTensorToFloatInPlaceUsingEigen( - ctx->template eigen_device(), input, min_range, max_range, - output); + if (meta::IsSupportedAndEnabled() && std::is_same()) { + auto input_ui8_array = input.flat(); + meta::Dequantize(ctx, input_ui8_array.data(), input_ui8_array.size(), + min_range, max_range, output->flat().data()); + } else { + QuantizedTensorToFloatInPlaceUsingEigen( + ctx->template eigen_device(), input, min_range, max_range, + output); + } } } diff --git a/tensorflow/core/kernels/example_parsing_ops_test.cc b/tensorflow/core/kernels/example_parsing_ops_test.cc index 187d72685ec..67ac4777130 100644 --- a/tensorflow/core/kernels/example_parsing_ops_test.cc +++ 
b/tensorflow/core/kernels/example_parsing_ops_test.cc @@ -33,66 +33,83 @@ limitations under the License. namespace tensorflow { -typedef std::map, Tensor> ExampleTensorMap; +typedef std::map, Tensor> ExampleTensorMap; // Fillers to fill the underlying repeated array in protobuf. class BytesFiller { public: - BytesFiller() : dense_default(DT_STRING, TensorShape()) {} - void operator()(Feature* f) const { - f->mutable_bytes_list()->add_value("abcd1234abcd1234abcd1234abcd1234!"); + BytesFiller() {} + void operator()(Feature* f, int feature_size) const { + for (int i = 0; i < feature_size; ++i) { + f->mutable_bytes_list()->add_value("abcd1234abcd1234abcd1234abcd1234!"); + } + } + Tensor make_dense_default(int feature_size) { + return Tensor(dtype, TensorShape({feature_size})); } - Tensor dense_default; DataType dtype = DT_STRING; }; class Int64Filler { public: - Int64Filler() : dense_default(DT_INT64, TensorShape()) {} - void operator()(Feature* f) const { - f->mutable_int64_list()->add_value(1729); + Int64Filler() {} + void operator()(Feature* f, int feature_size) const { + for (int i = 0; i < feature_size; ++i) { + f->mutable_int64_list()->add_value(1729); + } + } + Tensor make_dense_default(int feature_size) { + return Tensor(dtype, TensorShape({feature_size})); } - Tensor dense_default; DataType dtype = DT_INT64; }; class FloatFiller { public: - FloatFiller() : dense_default(DT_FLOAT, TensorShape()) {} - void operator()(Feature* f) const { - f->mutable_float_list()->add_value(1.729); + FloatFiller() {} + void operator()(Feature* f, int feature_size) const { + for (int i = 0; i < feature_size; ++i) { + f->mutable_float_list()->add_value(1.729); + } + } + Tensor make_dense_default(int feature_size) { + return Tensor(dtype, TensorShape({feature_size})); } - Tensor dense_default; DataType dtype = DT_FLOAT; }; template struct ExampleStore { typedef T Filler; - static ExampleTensorMap GetSerializedExamples() { - ExampleTensorMap examples; - int keys[] = {10, 100, 1000}; 
- int batch_sizes[] = {128, 512}; + static void AddExample(ExampleTensorMap* examples, int num_keys, + int batch_size, int feature_size) { Example example; Filler fill; - for (int num_keys : keys) { - for (int batch_size : batch_sizes) { - Tensor record_string(DT_STRING, TensorShape({batch_size})); - auto string_t = record_string.vec(); - example.Clear(); - for (int b = 0; b < batch_size; ++b) { - for (int k = 0; k < num_keys; ++k) { - string k_str = strings::Printf("feature_%d", k); - Feature f; - fill(&f); - Features* features = example.mutable_features(); - (*features->mutable_feature())[k_str] = f; - } - CHECK(example.SerializeToString(&string_t(b))); - } - examples[std::make_pair(batch_size, num_keys)] = record_string; + Tensor record_string(DT_STRING, TensorShape({batch_size})); + auto string_t = record_string.vec(); + example.Clear(); + for (int b = 0; b < batch_size; ++b) { + for (int k = 0; k < num_keys; ++k) { + string k_str = strings::Printf("feature_%d", k); + Feature f; + fill(&f, feature_size); + Features* features = example.mutable_features(); + (*features->mutable_feature())[k_str] = f; } + CHECK(example.SerializeToString(&string_t(b))); } + (*examples)[std::make_tuple(batch_size, num_keys, feature_size)] = + record_string; + } + static ExampleTensorMap GetSerializedExamples() { + ExampleTensorMap examples; + AddExample(&examples, 10, 128, 1); + AddExample(&examples, 100, 128, 1); + AddExample(&examples, 1000, 128, 1); + AddExample(&examples, 10, 512, 1); + AddExample(&examples, 100, 512, 1); + AddExample(&examples, 1000, 512, 1); + AddExample(&examples, 1, 1, 1000000); return examples; } static ExampleTensorMap serialized_example; @@ -118,10 +135,10 @@ struct BenchmarkOptions { }; template -static Graph* ParseExample(int batch_size, int num_keys) { +static Graph* ParseExample(int batch_size, int num_keys, int feature_size) { Graph* g = new Graph(OpRegistry::Global()); - Tensor& serialized = - 
Options::Store::serialized_example[std::make_pair(batch_size, num_keys)]; + Tensor& serialized = Options::Store::serialized_example[std::make_tuple( + batch_size, num_keys, feature_size)]; Tensor names(DT_STRING, TensorShape({batch_size})); std::vector sparse_keys; @@ -135,9 +152,9 @@ static Graph* ParseExample(int batch_size, int num_keys) { key.scalar()() = strings::Printf("feature_%d", i); if (opt.benchmark_dense) { dense_keys.emplace_back(test::graph::Constant(g, key)); - dense_defaults.emplace_back( - test::graph::Constant(g, opt.filler.dense_default)); - dense_shapes.push_back(TensorShape()); + dense_defaults.emplace_back(test::graph::Constant( + g, opt.filler.make_dense_default(feature_size))); + dense_shapes.push_back(TensorShape({feature_size})); } else { sparse_keys.emplace_back(test::graph::Constant(g, key)); sparse_types.push_back(opt.filler.dtype); @@ -166,23 +183,25 @@ typedef BenchmarkOptions, true> DenseInt64; typedef BenchmarkOptions, false> SparseFloat; typedef BenchmarkOptions, true> DenseFloat; -// B == batch_size, K == num_keys. K must be one of 10, 100, 1000 -#define BM_ParseExample(TYPE, B, K) \ - static void BM_ParseExample##_##TYPE##_##B##_##K(int iters) { \ - int64 items_per_iter = static_cast(B) * K; \ +// B == batch_size, K == num_keys. F == feature_size. 
+// K must be one of 10, 100, 1000 +#define BM_ParseExample(TYPE, B, K, F) \ + static void BM_ParseExample##_##TYPE##_##B##_##K##_##F(int iters) { \ + int64 items_per_iter = static_cast(B) * K * F; \ testing::UseRealTime(); \ testing::ItemsProcessed(static_cast(iters) * items_per_iter); \ - test::Benchmark("cpu", ParseExample(B, K)).Run(iters); \ + test::Benchmark("cpu", ParseExample(B, K, F)).Run(iters); \ } \ - BENCHMARK(BM_ParseExample##_##TYPE##_##B##_##K); + BENCHMARK(BM_ParseExample##_##TYPE##_##B##_##K##_##F); -#define BM_AllParseExample(Type) \ - BM_ParseExample(Type, 128, 10); \ - BM_ParseExample(Type, 512, 10); \ - BM_ParseExample(Type, 128, 100); \ - BM_ParseExample(Type, 512, 100); \ - BM_ParseExample(Type, 128, 1000); \ - BM_ParseExample(Type, 512, 1000); +#define BM_AllParseExample(Type) \ + BM_ParseExample(Type, 128, 10, 1); \ + BM_ParseExample(Type, 512, 10, 1); \ + BM_ParseExample(Type, 128, 100, 1); \ + BM_ParseExample(Type, 512, 100, 1); \ + BM_ParseExample(Type, 128, 1000, 1); \ + BM_ParseExample(Type, 512, 1000, 1); \ + BM_ParseExample(Type, 1, 1, 1000000); BM_AllParseExample(SparseString); BM_AllParseExample(DenseString); diff --git a/tensorflow/core/kernels/fake_quant_ops.cc b/tensorflow/core/kernels/fake_quant_ops.cc new file mode 100644 index 00000000000..41f9c218437 --- /dev/null +++ b/tensorflow/core/kernels/fake_quant_ops.cc @@ -0,0 +1,580 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#define EIGEN_USE_THREADS + +#ifdef GOOGLE_CUDA +#define EIGEN_USE_GPU +#endif // GOOGLE_CUDA + +#define FAKE_QUANT_NO_DEBUG + +#include "tensorflow/core/kernels/fake_quant_ops_functor.h" + +#include "tensorflow/core/framework/numeric_op.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/protobuf.h" + +using tensorflow::BinaryElementWiseOp; +using tensorflow::DEVICE_CPU; +#if GOOGLE_CUDA +using tensorflow::DEVICE_GPU; +#endif +using tensorflow::DT_BOOL; +using tensorflow::OpKernel; +using tensorflow::OpKernelConstruction; +using tensorflow::OpKernelContext; +using tensorflow::PersistentTensor; +using tensorflow::Tensor; +using tensorflow::TensorShape; +using tensorflow::TTypes; // NOLINT This is needed in CUDA mode, do not remove. +using tensorflow::UnaryElementWiseOp; +using tensorflow::errors::InvalidArgument; + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; + +// ----------------------------------------------------------------------------- +// Implementation of FakeQuantWithMinMaxArgsOp, see its documentation in +// core/ops/array_ops.cc. 
+template +class FakeQuantWithMinMaxArgsOp + : public UnaryElementWiseOp> { + public: + typedef UnaryElementWiseOp> Base; + explicit FakeQuantWithMinMaxArgsOp(OpKernelConstruction* context) + : Base::UnaryElementWiseOp(context) { + OP_REQUIRES_OK(context, context->GetAttr("min", &min_)); + OP_REQUIRES_OK(context, context->GetAttr("max", &max_)); + OP_REQUIRES(context, min_ < max_, + InvalidArgument("min has to be smaller than max, was: ", min_, + " >= ", max_)); + } + + void Operate(OpKernelContext* context, const Tensor& input, Tensor* output) { + FakeQuantWithMinMaxArgsFunctor functor; + functor(context->eigen_device(), input.flat(), min_, max_, + output->flat()); + } + private: + float min_; + float max_; +}; + +// Implementation of FakeQuantWithMinMaxArgsGradientOp, see its documentation in +// core/ops/array_ops.cc. +template +class FakeQuantWithMinMaxArgsGradientOp + : public BinaryElementWiseOp> { + public: + typedef BinaryElementWiseOp> + Base; + explicit FakeQuantWithMinMaxArgsGradientOp(OpKernelConstruction* context) + : Base::BinaryElementWiseOp(context) { + OP_REQUIRES_OK(context, context->GetAttr("min", &min_)); + OP_REQUIRES_OK(context, context->GetAttr("max", &max_)); + OP_REQUIRES(context, min_ < max_, + InvalidArgument("min has to be smaller than max, was: ", min_, + " >= ", max_)); + } + + template + void Operate(OpKernelContext* context, const Tensor& gradient, + const Tensor& input, Tensor* output) { + OperateNoTemplate(context, gradient, input, output); + } + + void OperateNoTemplate(OpKernelContext* context, const Tensor& gradient, + const Tensor& input, Tensor* output) { + OP_REQUIRES(context, input.IsSameSize(gradient), + InvalidArgument("gradient and input must be the same size")); + FakeQuantWithMinMaxArgsGradientFunctor functor; + functor(context->eigen_device(), gradient.flat(), + input.flat(), min_, max_, output->flat()); + } + private: + float min_; + float max_; +}; + 
+REGISTER_KERNEL_BUILDER(Name("FakeQuantWithMinMaxArgs").Device(DEVICE_CPU), + FakeQuantWithMinMaxArgsOp); +REGISTER_KERNEL_BUILDER( + Name("FakeQuantWithMinMaxArgsGradient").Device(DEVICE_CPU), + FakeQuantWithMinMaxArgsGradientOp); + +#if GOOGLE_CUDA +typedef Eigen::GpuDevice GPUDevice; + +// Forward declarations for functor specializations for GPU. +template <> +void FakeQuantWithMinMaxArgsFunctor::operator()( + const GPUDevice& d, + typename TTypes::ConstFlat inputs, + const float min, const float max, + typename TTypes::Flat outputs); +extern template struct FakeQuantWithMinMaxArgsFunctor; +REGISTER_KERNEL_BUILDER(Name("FakeQuantWithMinMaxArgs").Device(DEVICE_GPU), + FakeQuantWithMinMaxArgsOp); + +template <> +void FakeQuantWithMinMaxArgsGradientFunctor::operator()( + const GPUDevice& d, + typename TTypes::ConstFlat gradients, + typename TTypes::ConstFlat inputs, + const float min, const float max, + typename TTypes::Flat backprops); +REGISTER_KERNEL_BUILDER( + Name("FakeQuantWithMinMaxArgsGradient").Device(DEVICE_GPU), + FakeQuantWithMinMaxArgsGradientOp); +#endif // GOOGLE_CUDA + +// ----------------------------------------------------------------------------- +// Implementation of FakeQuantWithMinMaxVarsOp, see its documentation in +// core/ops/array_ops.cc. 
+template +class FakeQuantWithMinMaxVarsOp : public OpKernel { + public: + explicit FakeQuantWithMinMaxVarsOp(OpKernelConstruction* context) + : OpKernel::OpKernel(context) { +#ifndef FAKE_QUANT_NO_DEBUG + OP_REQUIRES_OK(context, + context->allocate_persistent(DT_BOOL, {}, + &check_min_max_handle_, + nullptr)); +#endif + } + + void Compute(OpKernelContext* context) override { + CHECK_EQ(3, context->num_inputs()); + const Tensor& input = context->input(0); + const Tensor& min = context->input(1); + const Tensor& max = context->input(2); +#ifndef FAKE_QUANT_NO_DEBUG + Tensor* check_min_max = check_min_max_handle_.AccessTensor(context); +#endif + + Tensor* output; + OP_REQUIRES_OK(context, + context->allocate_output(0, input.shape(), &output)); + + FakeQuantWithMinMaxVarsFunctor functor; + functor(context->eigen_device(), input.flat(), + min.scalar(), max.scalar(), +#ifndef FAKE_QUANT_NO_DEBUG + check_min_max->scalar(), +#endif + output->flat()); + } + + private: +#ifndef FAKE_QUANT_NO_DEBUG + PersistentTensor check_min_max_handle_; +#endif +}; + +// Implementation of FakeQuantWithMinMaxVarsGradientOp, see its documentation in +// core/ops/array_ops.cc. 
+template +class FakeQuantWithMinMaxVarsGradientOp : public OpKernel { + public: + explicit FakeQuantWithMinMaxVarsGradientOp(OpKernelConstruction* context) + : OpKernel::OpKernel(context) { +#ifndef FAKE_QUANT_NO_DEBUG + OP_REQUIRES_OK(context, + context->allocate_persistent(DT_BOOL, {}, + &check_min_max_handle_, + nullptr)); +#endif + } + + void Compute(OpKernelContext* context) override { + CHECK_EQ(4, context->num_inputs()); + const Tensor& gradient = context->input(0); + const Tensor& input = context->input(1); + OP_REQUIRES(context, input.IsSameSize(gradient), + InvalidArgument("gradient and input must be the same size")); + const Tensor& min = context->input(2); + const Tensor& max = context->input(3); +#ifndef FAKE_QUANT_NO_DEBUG + Tensor* check_min_max = check_min_max_handle_.AccessTensor(context); +#endif + + Tensor* grad_wrt_input; + OP_REQUIRES_OK(context, + context->allocate_output(0, input.shape(), &grad_wrt_input)); + + TensorShape scalar_shape; + Tensor* grad_wrt_min; + OP_REQUIRES_OK(context, + context->allocate_output(1, scalar_shape, &grad_wrt_min)); + + Tensor* grad_wrt_max; + OP_REQUIRES_OK(context, + context->allocate_output(2, scalar_shape, &grad_wrt_max)); + + FakeQuantWithMinMaxVarsGradientFunctor functor; + functor(context->eigen_device(), gradient.flat(), + input.flat(), min.scalar(), max.scalar(), +#ifndef FAKE_QUANT_NO_DEBUG + check_min_max->scalar(), +#endif + grad_wrt_input->flat(), grad_wrt_min->scalar(), + grad_wrt_max->scalar()); + } + + private: +#ifndef FAKE_QUANT_NO_DEBUG + PersistentTensor check_min_max_handle_; +#endif +}; + +REGISTER_KERNEL_BUILDER(Name("FakeQuantWithMinMaxVars").Device(DEVICE_CPU), + FakeQuantWithMinMaxVarsOp); +REGISTER_KERNEL_BUILDER( + Name("FakeQuantWithMinMaxVarsGradient").Device(DEVICE_CPU), + FakeQuantWithMinMaxVarsGradientOp); + +#if GOOGLE_CUDA +template <> +void FakeQuantWithMinMaxVarsFunctor::operator()( + const GPUDevice& d, + typename TTypes::ConstFlat inputs, + typename TTypes::ConstScalar min, 
+ typename TTypes::ConstScalar max, +#ifndef FAKE_QUANT_NO_DEBUG + typename TTypes::Scalar check_min_max, +#endif + typename TTypes::Flat output); +extern template struct FakeQuantWithMinMaxVarsFunctor; +REGISTER_KERNEL_BUILDER(Name("FakeQuantWithMinMaxVars") + .Device(DEVICE_GPU) + .HostMemory("min") + .HostMemory("max"), + FakeQuantWithMinMaxVarsOp); + +template <> +void FakeQuantWithMinMaxVarsGradientFunctor::operator()( + const GPUDevice& d, + typename TTypes::ConstFlat gradients, + typename TTypes::ConstFlat inputs, + typename TTypes::ConstScalar min, + typename TTypes::ConstScalar max, +#ifndef FAKE_QUANT_NO_DEBUG + typename TTypes::Scalar check_min_max, +#endif + typename TTypes::Flat backprops_wrt_input, + typename TTypes::Scalar backprop_wrt_min, + typename TTypes::Scalar backprop_wrt_max); +extern template struct FakeQuantWithMinMaxVarsGradientFunctor; +REGISTER_KERNEL_BUILDER(Name("FakeQuantWithMinMaxVarsGradient") + .Device(DEVICE_GPU) + .HostMemory("min") + .HostMemory("max"), + FakeQuantWithMinMaxVarsGradientOp); +#endif // GOOGLE_CUDA + +// ----------------------------------------------------------------------------- +// Implementation of FakeQuantWithMinMaxVarsPerChannelOp, see its documentation +// in core/ops/array_ops.cc. +template +class FakeQuantWithMinMaxVarsPerChannelOp : public OpKernel { + public: + explicit FakeQuantWithMinMaxVarsPerChannelOp(OpKernelConstruction* context) + : OpKernel::OpKernel(context) { +#ifndef FAKE_QUANT_NO_DEBUG + OP_REQUIRES_OK(context, + context->allocate_persistent(DT_BOOL, {}, + &check_min_max_handle_, + nullptr)); +#endif + } + + void Compute(OpKernelContext* context) override { + CHECK_EQ(3, context->num_inputs()); + const Tensor& input = context->input(0); + const int depth = input.dim_size(input.dims() - 1); // last dimension size. 
+ const Tensor& min = context->input(1); + OP_REQUIRES(context, min.dim_size(0) == depth, + InvalidArgument("min has incorrect size, expected ", depth, + " was ", min.dim_size(0))); + const Tensor& max = context->input(2); + OP_REQUIRES(context, max.dim_size(0) == depth, + InvalidArgument("max has incorrect size, expected ", depth, + " was ", max.dim_size(0))); +#ifndef FAKE_QUANT_NO_DEBUG + Tensor* check_min_max = check_min_max_handle_.AccessTensor(context); +#endif + + Tensor* output; + OP_REQUIRES_OK(context, + context->allocate_output(0, input.shape(), &output)); + + switch (input.dims()) { + case 4: { + FakeQuant4WithMinMaxVarsPerChannelFunctor functor; + functor(context->eigen_device(), input.dim_size(0), + input.dim_size(1), input.dim_size(2), input.dim_size(3), + input.flat(), min.vec(), max.vec(), +#ifndef FAKE_QUANT_NO_DEBUG + check_min_max->scalar(), +#endif + output->flat()); + break; + } + case 2: { + FakeQuant2WithMinMaxVarsPerChannelFunctor functor; + functor(context->eigen_device(), + input.dim_size(0), input.dim_size(1), + input.flat(), min.vec(), max.vec(), +#ifndef FAKE_QUANT_NO_DEBUG + check_min_max->scalar(), +#endif + output->flat()); + break; + } + case 1: { + FakeQuant1WithMinMaxVarsPerChannelFunctor functor; + functor(context->eigen_device(), + input.vec(), min.vec(), max.vec(), +#ifndef FAKE_QUANT_NO_DEBUG + check_min_max->scalar(), +#endif + output->vec()); + break; + } + default: + context->SetStatus(InvalidArgument("Only inputs of dimensions 1, 2 or " + "4 supported, was: ", input.dims())); + break; + } + } + + private: +#ifndef FAKE_QUANT_NO_DEBUG + PersistentTensor check_min_max_handle_; +#endif +}; + +// Implementation of FakeQuantWithMinMaxVarsPerChannelGradientOp, see its +// documentation in core/ops/array_ops.cc. 
+template +class FakeQuantWithMinMaxVarsPerChannelGradientOp : public OpKernel { + public: + explicit FakeQuantWithMinMaxVarsPerChannelGradientOp( + OpKernelConstruction* context) : OpKernel::OpKernel(context) { +#ifndef FAKE_QUANT_NO_DEBUG + OP_REQUIRES_OK(context, + context->allocate_persistent(DT_BOOL, {}, + &check_min_max_handle_, + nullptr)); +#endif + } + + void Compute(OpKernelContext* context) override { + CHECK_EQ(4, context->num_inputs()); + const Tensor& gradient = context->input(0); + const Tensor& input = context->input(1); + OP_REQUIRES(context, input.IsSameSize(gradient), + InvalidArgument("gradient and input must be the same size")); + const int depth = input.dim_size(input.dims() - 1); // last dimension size. + const Tensor& min = context->input(2); + OP_REQUIRES(context, min.dim_size(0) == depth, + InvalidArgument("min has incorrect size, expected ", depth, + " was ", min.dim_size(0))); + const Tensor& max = context->input(3); + OP_REQUIRES(context, max.dim_size(0) == depth, + InvalidArgument("max has incorrect size, expected ", depth, + " was ", max.dim_size(0))); +#ifndef FAKE_QUANT_NO_DEBUG + Tensor* check_min_max = check_min_max_handle_.AccessTensor(context); +#endif + + Tensor* grad_wrt_input; + OP_REQUIRES_OK(context, + context->allocate_output(0, input.shape(), &grad_wrt_input)); + + TensorShape min_max_shape({input.dim_size(input.dims() - 1)}); + Tensor* grad_wrt_min; + OP_REQUIRES_OK(context, + context->allocate_output(1, min_max_shape, &grad_wrt_min)); + + Tensor* grad_wrt_max; + OP_REQUIRES_OK(context, + context->allocate_output(2, min_max_shape, &grad_wrt_max)); + + switch (input.dims()) { + case 4: { + FakeQuant4WithMinMaxVarsPerChannelGradientFunctor functor; + functor(context->eigen_device(), input.dim_size(0), + input.dim_size(1), input.dim_size(2), input.dim_size(3), + gradient.flat(), input.flat(), + min.vec(), max.vec(), +#ifndef FAKE_QUANT_NO_DEBUG + check_min_max->scalar(), +#endif + grad_wrt_input->flat(), + 
grad_wrt_min->vec(), grad_wrt_max->vec()); + break; + } + case 2: { + FakeQuant2WithMinMaxVarsPerChannelGradientFunctor functor; + functor(context->eigen_device(), + input.dim_size(0), input.dim_size(1), + gradient.flat(), input.flat(), + min.vec(), max.vec(), +#ifndef FAKE_QUANT_NO_DEBUG + check_min_max->scalar(), +#endif + grad_wrt_input->flat(), + grad_wrt_min->vec(), grad_wrt_max->vec()); + break; + } + case 1: { + FakeQuant1WithMinMaxVarsPerChannelGradientFunctor functor; + functor(context->eigen_device(), + gradient.vec(), input.vec(), + min.vec(), max.vec(), +#ifndef FAKE_QUANT_NO_DEBUG + check_min_max->scalar(), +#endif + grad_wrt_input->vec(), + grad_wrt_min->vec(), grad_wrt_max->vec()); + break; + } + default: + context->SetStatus(InvalidArgument("Only inputs of dimensions 1, 2 or " + "4 supported, was: ", input.dims())); + break; + } + } + + private: +#ifndef FAKE_QUANT_NO_DEBUG + PersistentTensor check_min_max_handle_; +#endif +}; + +REGISTER_KERNEL_BUILDER(Name("FakeQuantWithMinMaxVarsPerChannel") + .Device(DEVICE_CPU), + FakeQuantWithMinMaxVarsPerChannelOp); +REGISTER_KERNEL_BUILDER(Name("FakeQuantWithMinMaxVarsPerChannelGradient") + .Device(DEVICE_CPU), + FakeQuantWithMinMaxVarsPerChannelGradientOp); + +#if GOOGLE_CUDA +template <> +void FakeQuant1WithMinMaxVarsPerChannelFunctor::operator()( + const GPUDevice& d, + typename TTypes::ConstVec inputs, + typename TTypes::ConstVec min, + typename TTypes::ConstVec max, +#ifndef FAKE_QUANT_NO_DEBUG + typename TTypes::Scalar check_min_max, +#endif + typename TTypes::Vec outputs); +extern template struct FakeQuant1WithMinMaxVarsPerChannelFunctor; + +template <> +void FakeQuant2WithMinMaxVarsPerChannelFunctor::operator()( + const GPUDevice& d, const Index batch_size, const Index depth, + typename TTypes::ConstFlat inputs, + typename TTypes::ConstFlat min, + typename TTypes::ConstFlat max, +#ifndef FAKE_QUANT_NO_DEBUG + typename TTypes::Scalar check_min_max, +#endif + typename TTypes::Flat outputs); +extern 
template struct FakeQuant2WithMinMaxVarsPerChannelFunctor; + +template <> +void FakeQuant4WithMinMaxVarsPerChannelFunctor::operator()( + const GPUDevice& d, const Index batch_size, const Index height, + const Index width, const Index depth, + typename TTypes::ConstFlat inputs, + typename TTypes::ConstFlat min, + typename TTypes::ConstFlat max, +#ifndef FAKE_QUANT_NO_DEBUG + typename TTypes::Scalar check_min_max, +#endif + typename TTypes::Flat outputs); +extern template struct FakeQuant4WithMinMaxVarsPerChannelFunctor; + +REGISTER_KERNEL_BUILDER(Name("FakeQuantWithMinMaxVarsPerChannel") + .Device(DEVICE_GPU) + .HostMemory("min") + .HostMemory("max"), + FakeQuantWithMinMaxVarsPerChannelOp); + +template <> +void FakeQuant1WithMinMaxVarsPerChannelGradientFunctor::operator()( + const GPUDevice& d, + typename TTypes::ConstVec gradients, + typename TTypes::ConstVec inputs, + typename TTypes::ConstVec min, + typename TTypes::ConstVec max, +#ifndef FAKE_QUANT_NO_DEBUG + typename TTypes::Scalar check_min_max, +#endif + typename TTypes::Vec backprops_wrt_input, + typename TTypes::Vec backprop_wrt_min, + typename TTypes::Vec backprop_wrt_max); +extern template struct + FakeQuant1WithMinMaxVarsPerChannelGradientFunctor; + +template <> +void FakeQuant2WithMinMaxVarsPerChannelGradientFunctor::operator()( + const GPUDevice& d, const Index batch_size, const Index depth, + typename TTypes::ConstFlat gradients, + typename TTypes::ConstFlat inputs, + typename TTypes::ConstVec min, + typename TTypes::ConstVec max, +#ifndef FAKE_QUANT_NO_DEBUG + typename TTypes::Scalar check_min_max, +#endif + typename TTypes::Flat backprops_wrt_input, + typename TTypes::Vec backprop_wrt_min, + typename TTypes::Vec backprop_wrt_max); +extern template struct + FakeQuant2WithMinMaxVarsPerChannelGradientFunctor; + +template <> +void FakeQuant4WithMinMaxVarsPerChannelGradientFunctor::operator()( + const GPUDevice& d, const Index batch_size, const Index height, + const Index width, const Index depth, + 
typename TTypes::ConstFlat gradients, + typename TTypes::ConstFlat inputs, + typename TTypes::ConstVec min, + typename TTypes::ConstVec max, +#ifndef FAKE_QUANT_NO_DEBUG + typename TTypes::Scalar check_min_max, +#endif + typename TTypes::Flat backprops_wrt_input, + typename TTypes::Vec backprop_wrt_min, + typename TTypes::Vec backprop_wrt_max); +extern template struct + FakeQuant4WithMinMaxVarsPerChannelGradientFunctor; + +REGISTER_KERNEL_BUILDER(Name("FakeQuantWithMinMaxVarsPerChannelGradient") + .Device(DEVICE_GPU) + .HostMemory("min") + .HostMemory("max"), + FakeQuantWithMinMaxVarsPerChannelGradientOp); +#endif // GOOGLE_CUDA + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/fake_quant_ops_functor.h b/tensorflow/core/kernels/fake_quant_ops_functor.h new file mode 100644 index 00000000000..d3f600cd824 --- /dev/null +++ b/tensorflow/core/kernels/fake_quant_ops_functor.h @@ -0,0 +1,434 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_FAKE_QUANT_FUNCTOR_H_ +#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_FAKE_QUANT_FUNCTOR_H_ + +#include + +#define EIGEN_STACK_ALLOCATION_LIMIT 0 +#define EIGEN_USE_THREADS +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +static constexpr int kSteps = 255; +static constexpr float kStepsFloat = static_cast(kSteps); + +// Gymnastics with nudged zero point is to ensure that real zero maps to +// an integer, which is required for e.g. zero-padding in convolutional layers. +// Returns (nudged_min, nudged_max, nudged_scale). +template +std::tuple Nudge(const float min, const float max) { + const float scale = (max - min) / (kStepsFloat - 0.0f); + const float zero_point_from_min = 0.0f - min / scale; + const uint8 nudged_zero_point = [zero_point_from_min] { + if (zero_point_from_min < 0.0f) { + return static_cast(0); + } else if (zero_point_from_min > kStepsFloat) { + return static_cast(kSteps); + } else { + return static_cast(std::round(zero_point_from_min)); + } + }(); + + const float nudged_min = (0.0f - nudged_zero_point) * scale; + const float nudged_max = (kStepsFloat - nudged_zero_point) * scale; + return std::make_tuple(nudged_min, nudged_max, scale); +} + +template using ConstScalar = + typename tensorflow::TTypes::ConstScalar; +template using Scalar = typename tensorflow::TTypes::Scalar; +template using ConstVec = typename tensorflow::TTypes::ConstVec; +template using Vec = typename tensorflow::TTypes::Vec; +template using ConstFlat = + typename tensorflow::TTypes::ConstFlat; +template using Flat = typename tensorflow::TTypes::Flat; + +// Functor called by FakeQuantWithMinMaxArgsOp to do the work. Compiles both +// for CPU and GPU. 
+template +struct FakeQuantWithMinMaxArgsFunctor { + void operator()(const Device& d, ConstFlat inputs, + const float min, const float max, Flat outputs) { + eigen_assert(min <= 0.0f && "min should be <= 0.0"); + eigen_assert(max >= 0.0f && "max should be >= 0.0"); + eigen_assert(min < max && "min should be < max"); + + float nudged_min, nudged_max, nudged_scale; + std::tie(nudged_min, nudged_max, nudged_scale) = Nudge(min, max); + const float inv_nudged_scale = 1.0f / nudged_scale; + + auto clamped = inputs.cwiseMin(nudged_max).cwiseMax(nudged_min); + auto clamped_shifted = clamped - nudged_min; + outputs.device(d) = (clamped_shifted * inv_nudged_scale + 0.5f).floor() * + nudged_scale + nudged_min; + } +}; + +// Functor called by FakeQuantWithMinMaxArgsGradientOp to do the work. Compiles +// both for CPU and GPU. +template +struct FakeQuantWithMinMaxArgsGradientFunctor { + void operator()(const Device& d, ConstFlat gradients, + ConstFlat inputs, const float min, const float max, + Flat backprops) { + eigen_assert(min <= 0.0f && "min should be <= 0.0"); + eigen_assert(max >= 0.0f && "max should be >= 0.0"); + eigen_assert(min < max && "min should be < max"); + + float nudged_min, nudged_max, nudged_scale; + std::tie(nudged_min, nudged_max, nudged_scale) = Nudge(min, max); + + auto between_nudged_min_max = (inputs >= nudged_min && inputs <= nudged_max) + .select(inputs.constant(1.0f), inputs.constant(0.0f)); + backprops.device(d) = gradients * between_nudged_min_max; + } +}; + +// Functor called by FakeQuantWithMinMaxVarsOp to do the work. Compiles both +// for CPU and GPU. 
+template +struct FakeQuantWithMinMaxVarsFunctor { + void operator()(const Device& d, ConstFlat inputs, + ConstScalar min, ConstScalar max, +#ifndef FAKE_QUANT_NO_DEBUG + Scalar check_min_max, +#endif + Flat outputs) { +#ifndef FAKE_QUANT_NO_DEBUG + check_min_max.device(d) = (min <= 0.0f).all(); + eigen_assert(check_min_max() && "min should be <= 0.0 coeff-wise"); + check_min_max.device(d) = (max >= 0.0f).all(); + eigen_assert(check_min_max() >= 0.0f && "max should be >= 0.0 coeff-wise"); + check_min_max.device(d) = (min < max).all(); + eigen_assert(check_min_max() && "min should be < max coeff-wise"); +#endif + + float nudged_min, nudged_max, nudged_scale; + std::tie(nudged_min, nudged_max, nudged_scale) = + Nudge(min(), max()); + const auto nudged_scale_repl = inputs.constant(nudged_scale); + + const auto clamped = inputs.cwiseMin(nudged_max).cwiseMax(nudged_min); + const auto clamped_shifted = clamped - nudged_min; + outputs.device(d) = (clamped_shifted / nudged_scale_repl + 0.5f).floor() * + nudged_scale_repl + nudged_min; + } +}; + +// Functor called by FakeQuantWithMinMaxVarsGradientOp to do the work. Compiles +// both for CPU and GPU. 
+template +struct FakeQuantWithMinMaxVarsGradientFunctor { + void operator()(const Device& d, + ConstFlat gradients, ConstFlat inputs, + ConstScalar min, ConstScalar max, +#ifndef FAKE_QUANT_NO_DEBUG + Scalar check_min_max, +#endif + Flat backprops_wrt_input, + Scalar backprop_wrt_min, + Scalar backprop_wrt_max) { +#ifndef FAKE_QUANT_NO_DEBUG + check_min_max.device(d) = (min <= 0.0f).all(); + eigen_assert(check_min_max() && "min should be <= 0.0 coeff-wise"); + check_min_max.device(d) = (max >= 0.0f).all(); + eigen_assert(check_min_max() >= 0.0f && "max should be >= 0.0 coeff-wise"); + check_min_max.device(d) = (min < max).all(); + eigen_assert(check_min_max() && "min should be < max coeff-wise"); +#endif + + float nudged_min, nudged_max, nudged_scale; + std::tie(nudged_min, nudged_max, nudged_scale) = + Nudge(min(), max()); + + const auto between_min_max = (inputs >= nudged_min && inputs <= nudged_max) + .select(inputs.constant(1.0f), inputs.constant(0.0f)); + backprops_wrt_input.device(d) = gradients * between_min_max; + + const auto below_min = (inputs < nudged_min) + .select(inputs.constant(1.0f), inputs.constant(0.0f)); + backprop_wrt_min.device(d) = (gradients * below_min).sum(); + + const auto above_max = (inputs > nudged_max) + .select(inputs.constant(1.0f), inputs.constant(0.0f)); + backprop_wrt_max.device(d) = (gradients * above_max).sum(); + } +}; + +using Index = typename tensorflow::TTypes::ConstTensor::Index; + +// Functor called by FakeQuantWithMinMaxVarsPerChannelOp to do the work. +// Compiles both for CPU and GPU. +// +// Already verified: inputs, outputs, min, max are of shape [d]. 
+template +struct FakeQuant1WithMinMaxVarsPerChannelFunctor { + void operator()(const Device& d, ConstVec inputs, + ConstVec min, ConstVec max, +#ifndef FAKE_QUANT_NO_DEBUG + Scalar check_min_max, +#endif + Vec outputs) { +#ifndef FAKE_QUANT_NO_DEBUG + check_min_max.device(d) = (min <= 0.0f).all(); + eigen_assert(check_min_max() && "min should be <= 0.0 coeff-wise"); + check_min_max.device(d) = (max >= 0.0f).all(); + eigen_assert(check_min_max() >= 0.0f && "max should be >= 0.0 coeff-wise"); + check_min_max.device(d) = (min < max).all(); + eigen_assert(check_min_max() && "min should be < max coeff-wise"); +#endif + + for (Index i = 0; i < min.size(); ++i) { + float nudged_min, nudged_max, nudged_scale; + std::tie(nudged_min, nudged_max, nudged_scale) = + Nudge(min(i), max(i)); + const float clamped = + std::max(std::min(inputs(i), nudged_max), nudged_min); + const float clamped_shifted = clamped - nudged_min; + + outputs(i) = std::round(clamped_shifted / nudged_scale) * nudged_scale + + nudged_min; + } + } +}; + +// Already verified: inputs, outputs are of shape [b, d], min, max are of shape +// [d]. 
+template +struct FakeQuant2WithMinMaxVarsPerChannelFunctor { + void operator()(const Device& d, const Index batch_size, const Index depth, + ConstFlat inputs, + ConstVec min, ConstVec max, +#ifndef FAKE_QUANT_NO_DEBUG + Scalar check_min_max, +#endif + Flat outputs) { +#ifndef FAKE_QUANT_NO_DEBUG + check_min_max.device(d) = (min <= 0.0f).all(); + eigen_assert(check_min_max() && "min should be <= 0.0 coeff-wise"); + check_min_max.device(d) = (max >= 0.0f).all(); + eigen_assert(check_min_max() >= 0.0f && "max should be >= 0.0 coeff-wise"); + check_min_max.device(d) = (min < max).all(); + eigen_assert(check_min_max() && "min should be < max coeff-wise"); +#endif + + Eigen::DSizes restored(batch_size, depth); + const auto inputs_restored = inputs.reshape(restored); + for (Index i = 0; i < min.size(); ++i) { + float nudged_min, nudged_max, nudged_scale; + std::tie(nudged_min, nudged_max, nudged_scale) = + Nudge(min(i), max(i)); + const auto clamped = inputs_restored.chip<1>(i) + .cwiseMin(nudged_max).cwiseMax(nudged_min); + const auto clamped_shifted = clamped - nudged_min; + + outputs.reshape(restored).chip<1>(i).device(d) = + (clamped_shifted / nudged_scale + 0.5f).floor() * nudged_scale + + nudged_min; + } + } +}; + +// Already verified: inputs, outputs are of shape [b, h, w, d], min, max are +// of shape [d]. 
+template +struct FakeQuant4WithMinMaxVarsPerChannelFunctor { + void operator()(const Device& d, const Index batch_size, const Index height, + const Index width, const Index depth, + ConstFlat inputs, + ConstVec min, ConstVec max, +#ifndef FAKE_QUANT_NO_DEBUG + Scalar check_min_max, +#endif + Flat outputs) { +#ifndef FAKE_QUANT_NO_DEBUG + check_min_max.device(d) = (min <= 0.0f).all(); + eigen_assert(check_min_max() && "min should be <= 0.0 coeff-wise"); + check_min_max.device(d) = (max >= 0.0f).all(); + eigen_assert(check_min_max() >= 0.0f && "max should be >= 0.0 coeff-wise"); + check_min_max.device(d) = (min < max).all(); + eigen_assert(check_min_max() && "min should be < max coeff-wise"); +#endif + + Eigen::DSizes restored(batch_size, height, width, depth); + const auto inputs_restored = inputs.reshape(restored); + for (Index i = 0; i < min.size(); ++i) { + float nudged_min, nudged_max, nudged_scale; + std::tie(nudged_min, nudged_max, nudged_scale) = + Nudge(min(i), max(i)); + const auto clamped = inputs_restored.chip<3>(i) + .cwiseMin(nudged_max).cwiseMax(nudged_min); + const auto clamped_shifted = clamped - nudged_min; + + outputs.reshape(restored).chip<3>(i).device(d) = + (clamped_shifted / nudged_scale + 0.5f).floor() * nudged_scale + + nudged_min; + } + } +}; + +// Functor called by FakeQuantWithMinMaxVarsPerChannelGradientOp to do the work. +// Compiles both for CPU and GPU. +// +// Already verified: gradients, inputs, outputs, min, max, backprops_wrt_input, +// backprop_wrt_min, backprop_wrt_max are of shape [d]. 
+template +struct FakeQuant1WithMinMaxVarsPerChannelGradientFunctor { + void operator()(const Device& d, + ConstVec gradients, ConstVec inputs, + ConstVec min, ConstVec max, +#ifndef FAKE_QUANT_NO_DEBUG + Scalar check_min_max, +#endif + Vec backprops_wrt_input, Vec backprop_wrt_min, + Vec backprop_wrt_max) { +#ifndef FAKE_QUANT_NO_DEBUG + check_min_max.device(d) = (min <= 0.0f).all(); + eigen_assert(check_min_max() && "min should be <= 0.0 coeff-wise"); + check_min_max.device(d) = (max >= 0.0f).all(); + eigen_assert(check_min_max() >= 0.0f && "max should be >= 0.0 coeff-wise"); + check_min_max.device(d) = (min < max).all(); + eigen_assert(check_min_max() && "min should be < max coeff-wise"); +#endif + + for (Index i = 0; i < min.size(); ++i) { + float nudged_min, nudged_max, nudged_scale; + std::tie(nudged_min, nudged_max, nudged_scale) = + Nudge(min(i), max(i)); + + const bool between_min_max = + inputs(i) >= nudged_min && inputs(i) <= nudged_max; + backprops_wrt_input(i) = between_min_max ? gradients(i) : 0.0f; + + const bool below_min = inputs(i) < nudged_min; + backprop_wrt_min(i) = below_min ? gradients(i) : 0.0f; + + const bool above_max = inputs(i) > nudged_max; + backprop_wrt_max(i) = above_max ? gradients(i) : 0.0f; + } + } +}; + +// Already verified: gradients, inputs, backprops_wrt_input are of shape [b, d], +// min, max, backprop_wrt_min, backprop_wrt_max are of shape [d]. 
+template +struct FakeQuant2WithMinMaxVarsPerChannelGradientFunctor { + void operator()(const Device& d, const Index batch_size, const Index depth, + ConstFlat gradients, ConstFlat inputs, + ConstVec min, ConstVec max, +#ifndef FAKE_QUANT_NO_DEBUG + Scalar check_min_max, +#endif + Flat backprops_wrt_input, + Vec backprop_wrt_min, Vec backprop_wrt_max) { +#ifndef FAKE_QUANT_NO_DEBUG + check_min_max.device(d) = (min <= 0.0f).all(); + eigen_assert(check_min_max() && "min should be <= 0.0 coeff-wise"); + check_min_max.device(d) = (max >= 0.0f).all(); + eigen_assert(check_min_max() >= 0.0f && "max should be >= 0.0 coeff-wise"); + check_min_max.device(d) = (min < max).all(); + eigen_assert(check_min_max() && "min should be < max coeff-wise"); +#endif + + Eigen::DSizes restored(batch_size, depth); + const auto gradients_restored = gradients.reshape(restored); + const auto inputs_restored = inputs.reshape(restored); + for (Index i = 0; i < min.size(); ++i) { + float nudged_min, nudged_max, nudged_scale; + std::tie(nudged_min, nudged_max, nudged_scale) = + Nudge(min(i), max(i)); + const auto gradients_chip = gradients_restored.chip<1>(i); + const auto inputs_chip = inputs_restored.chip<1>(i); + + const auto between_min_max = + (inputs_chip >= nudged_min && inputs_chip <= nudged_max) + .select(inputs_chip.constant(1.0f), inputs_chip.constant(0.0f)); + backprops_wrt_input.reshape(restored).chip<1>(i).device(d) = + gradients_chip * between_min_max; + + const auto below_min = (inputs_chip < nudged_min) + .select(inputs_chip.constant(1.0f), inputs_chip.constant(0.0f)); + Eigen::DSizes reduce(0); + backprop_wrt_min.chip<0>(i).device(d) = + (gradients_chip * below_min).sum(reduce); + + const auto above_max = (inputs_chip > nudged_max) + .select(inputs_chip.constant(1.0f), inputs_chip.constant(0.0f)); + backprop_wrt_max.chip<0>(i).device(d) = + (gradients_chip * above_max).sum(reduce); + } + } +}; + +// Already verified: gradients, inputs, backprops_wrt_input are of shape +// [b, 
h, w, d], min, max, backprop_wrt_min, backprop_wrt_max are of shape [d]. +template +struct FakeQuant4WithMinMaxVarsPerChannelGradientFunctor { + void operator()(const Device& d, const Index batch_size, const Index height, + const Index width, const Index depth, + ConstFlat gradients, ConstFlat inputs, + ConstVec min, ConstVec max, +#ifndef FAKE_QUANT_NO_DEBUG + Scalar check_min_max, +#endif + Flat backprops_wrt_input, + Vec backprop_wrt_min, Vec backprop_wrt_max) { +#ifndef FAKE_QUANT_NO_DEBUG + check_min_max.device(d) = (min <= 0.0f).all(); + eigen_assert(check_min_max() && "min should be <= 0.0 coeff-wise"); + check_min_max.device(d) = (max >= 0.0f).all(); + eigen_assert(check_min_max() >= 0.0f && "max should be >= 0.0 coeff-wise"); + check_min_max.device(d) = (min < max).all(); + eigen_assert(check_min_max() && "min should be < max coeff-wise"); +#endif + + Eigen::DSizes restored(batch_size, height, width, depth); + const auto gradients_restored = gradients.reshape(restored); + const auto inputs_restored = inputs.reshape(restored); + for (Index i = 0; i < min.size(); ++i) { + float nudged_min, nudged_max, nudged_scale; + std::tie(nudged_min, nudged_max, nudged_scale) = + Nudge(min(i), max(i)); + const auto gradients_chip = gradients_restored.chip<3>(i); + const auto inputs_chip = inputs_restored.chip<3>(i); + + const auto between_min_max = + (inputs_chip >= nudged_min && inputs_chip <= nudged_max) + .select(inputs_chip.constant(1.0f), inputs_chip.constant(0.0f)); + backprops_wrt_input.reshape(restored).chip<3>(i).device(d) = + gradients_chip * between_min_max; + + const auto below_min = (inputs_chip < nudged_min) + .select(inputs_chip.constant(1.0f), inputs_chip.constant(0.0f)); + Eigen::DSizes reduce(0, 1, 2); + backprop_wrt_min.chip<0>(i).device(d) = + (gradients_chip * below_min).sum(reduce); + + const auto above_max = (inputs_chip > nudged_max) + .select(inputs_chip.constant(1.0f), inputs_chip.constant(0.0f)); + backprop_wrt_max.chip<0>(i).device(d) = + 
(gradients_chip * above_max).sum(reduce); + } + } +}; + +} // namespace tensorflow + +#endif // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_FAKE_QUANT_FUNCTOR_H_ diff --git a/tensorflow/core/kernels/fake_quant_ops_gpu.cu.cc b/tensorflow/core/kernels/fake_quant_ops_gpu.cu.cc new file mode 100644 index 00000000000..ad327937877 --- /dev/null +++ b/tensorflow/core/kernels/fake_quant_ops_gpu.cu.cc @@ -0,0 +1,41 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#if GOOGLE_CUDA + +#define FAKE_QUANT_NO_DEBUG + +#define EIGEN_USE_GPU +#include "tensorflow/core/kernels/fake_quant_ops_functor.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +// Just instantiate GPU functor implementations. 
+template struct FakeQuantWithMinMaxArgsFunctor; +template struct FakeQuantWithMinMaxArgsGradientFunctor; +template struct FakeQuantWithMinMaxVarsFunctor; +template struct FakeQuantWithMinMaxVarsGradientFunctor; +template struct FakeQuant1WithMinMaxVarsPerChannelFunctor; +template struct FakeQuant2WithMinMaxVarsPerChannelFunctor; +template struct FakeQuant4WithMinMaxVarsPerChannelFunctor; +template struct FakeQuant1WithMinMaxVarsPerChannelGradientFunctor; +template struct FakeQuant2WithMinMaxVarsPerChannelGradientFunctor; +template struct FakeQuant4WithMinMaxVarsPerChannelGradientFunctor; + +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/fake_quant_ops_test.cc b/tensorflow/core/kernels/fake_quant_ops_test.cc new file mode 100644 index 00000000000..38ad345f0d3 --- /dev/null +++ b/tensorflow/core/kernels/fake_quant_ops_test.cc @@ -0,0 +1,821 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/kernels/ops_testutil.h" + +namespace tensorflow { + +using tensorflow::AllocatorAttributes; +using tensorflow::DT_FLOAT; +using tensorflow::NodeDefBuilder; +using tensorflow::OpsTestBase; +using tensorflow::Tensor; +using tensorflow::TensorShape; +using tensorflow::test::ExpectClose; +using tensorflow::test::FillValues; + +class QuantOpsTest : public OpsTestBase { + protected: + void AddRandomInput(const TensorShape& shape) { + CHECK_GT(input_types_.size(), inputs_.size()) + << "Adding more inputs than types; perhaps you need to call MakeOp"; + Tensor* input = new Tensor(device_->GetAllocator(AllocatorAttributes()), + DT_FLOAT, shape); + input->flat().setRandom(); + tensors_.push_back(input); + bool is_ref = IsRefType(input_types_[inputs_.size()]); + if (is_ref) { + CHECK_EQ(RemoveRefType(input_types_[inputs_.size()]), DT_FLOAT); + inputs_.push_back({&lock_for_refs_, input}); + } else { + CHECK_EQ(input_types_[inputs_.size()], DT_FLOAT); + inputs_.push_back({nullptr, input}); + } + } +}; + +TEST_F(QuantOpsTest, WithArgsNoNudging) { + // Original quantization range: [-10 + 0 / 4, -10 + 255 / 4], scale: 1/4. + // Original zero point: 40, no nudging necessary. + // Expected quantized values: -10.0, -10.25, ..., 53.75. + TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxArgs") + .Input(FakeInput(DT_FLOAT)) // inputs + .Attr("min", -10.0f) + .Attr("max", 53.75f) + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + // Downstream inputs. + AddInputFromArray(TensorShape({2, 3}), + {-10.1f, -10.0f, -9.9f, -9.75f, 53.75f, 53.8f}); + + // Tested code. 
+ TF_ASSERT_OK(RunOpKernel()); + + Tensor* output = GetOutput(0); + Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3})); + FillValues(&expected, + {-10.0f, -10.0f, -10.0f, -9.75f, 53.75f, 53.75f}); + ExpectClose(expected, *output); +} + +TEST_F(QuantOpsTest, WithArgsNudgedZeroIs0) { + // Original quantization range: [-0.4 / 4 + 0 / 4, -0.4 / 4 + 255 / 4]. + // Scale: 1/4, original zero point: 0.4, nudged to 0. + // Nudged range: [0.0; 63.75]. + // Expected quantized values: 0.0, 0.25, 0.5, ..., 63.75. + TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxArgs") + .Input(FakeInput(DT_FLOAT)) // inputs + .Attr("min", -0.1f) + .Attr("max", 63.65f) + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + // Downstream inputs. + AddInputFromArray(TensorShape({2, 3}), + {-0.1f, 0.0f, 0.1f, 0.25f, 63.75f, 63.8f}); + + // Tested code. + TF_ASSERT_OK(RunOpKernel()); + + Tensor* output = GetOutput(0); + Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3})); + FillValues(&expected, {0.0f, 0.0f, 0.0f, 0.25f, 63.75f, 63.75f}); + ExpectClose(expected, *output); +} + +TEST_F(QuantOpsTest, WithArgsNudgedZeroIs1) { + // Original quantization range: [-0.5 / 4 + 0 / 4, -0.5 / 4 + 255 / 4]. + // Scale: 1/4, original zero point: 0.5, nudged to 1. + // Nudged range: [-0.25; 63.5]. + // Expected quantized values: -0.25, 0.0, 0.25, ..., 63.5. + TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxArgs") + .Input(FakeInput(DT_FLOAT)) // inputs + .Attr("min", -0.125f) + .Attr("max", 63.625f) + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + // Downstream inputs. + AddInputFromArray(TensorShape({2, 3}), + {-0.26f, -0.25f, -0.24f, 0.0f, 63.5f, 63.6f}); + + // Tested code. 
+ TF_ASSERT_OK(RunOpKernel()); + + Tensor* output = GetOutput(0); + Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3})); + FillValues(&expected, {-0.25f, -0.25f, -0.25f, 0.0f, 63.5f, 63.5f}); + ExpectClose(expected, *output); +} + +TEST_F(QuantOpsTest, WithArgsNudgedZeroIs255) { + // Original quantization range: [0.4 / 4 - 255 / 4, 0.4 / 4 + 0 / 4]. + // Scale: 1/4, original zero point: 254.6, nudged to 255. + // Nudged range: [-63.75; 0.0]. + // Expected quantized values: -63.75, -63.5, -63.25, ..., 0.0. + TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxArgs") + .Input(FakeInput(DT_FLOAT)) // inputs + .Attr("min", -63.65f) + .Attr("max", 0.1f) + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + // Downstream inputs. + AddInputFromArray(TensorShape({2, 3}), + {-63.8f, -63.75f, -63.7f, -63.5f, 0.0f, 0.1f}); + + // Tested code. + TF_ASSERT_OK(RunOpKernel()); + + Tensor* output = GetOutput(0); + Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3})); + FillValues(&expected, {-63.75f, -63.75f, -63.75f, -63.5f, 0.0f, 0.0f}); + ExpectClose(expected, *output); +} + +TEST_F(QuantOpsTest, WithArgsGradient) { + // Original quantization range: [-0.5 / 4 + 0 / 4, -0.5 / 4 + 255 / 4]. + // Scale: 1/4, original zero point: 0.5, nudged to 1. + // Nudged range: [-0.25; 63.5]. + // Expected quantized values: -0.25, 0.0, 0.25, ..., 63.5. + TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxArgsGradient") + .Input(FakeInput(DT_FLOAT)) // gradient + .Input(FakeInput(DT_FLOAT)) // inputs + .Attr("min", -0.125f) + .Attr("max", 63.625f) + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + // Upstream gradients. + AddRandomInput(TensorShape({2, 3})); + // Downstream inputs. + AddInputFromArray(TensorShape({2, 3}), + {-0.26f, -0.25f, -0.24f, 0.0f, 63.5f, 63.6f}); + + // Tested code. 
+ TF_ASSERT_OK(RunOpKernel()); + + Tensor* output = GetOutput(0); + auto input_flat = GetInput(0).flat(); + Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3})); + FillValues(&expected, + {0.0f, input_flat(1), input_flat(2), + input_flat(3), input_flat(4), 0.0f}); + ExpectClose(expected, *output); +} + +TEST_F(QuantOpsTest, WithVarsNoNudging) { + // Original quantization range: [-10 + 0 / 4, -10 + 255 / 4], scale: 1/4. + // Original zero point: 40, no nudging necessary. + // Expected quantized values: -10.0, -10.25, ..., 53.75. + TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVars") + .Input(FakeInput(DT_FLOAT)) // inputs + .Input(FakeInput(DT_FLOAT)) // min + .Input(FakeInput(DT_FLOAT)) // max + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + // Downstream inputs. + AddInputFromArray(TensorShape({2, 3}), + {-10.1f, -10.0f, -9.9f, -9.75f, 53.75f, 53.8f}); + // Min. + AddInputFromArray(TensorShape({}), {-10.0f}); + // Max. + AddInputFromArray(TensorShape({}), {53.75f}); + + // Tested code. + TF_ASSERT_OK(RunOpKernel()); + + Tensor* output = GetOutput(0); + Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3})); + FillValues(&expected, + {-10.0f, -10.0f, -10.0f, -9.75f, 53.75f, 53.75f}); + ExpectClose(expected, *output); +} + +TEST_F(QuantOpsTest, WithVarsNudgedZeroIs0) { + // Original quantization range: [-0.4 / 4 + 0 / 4, -0.4 / 4 + 255 / 4]. + // Scale: 1/4, original zero point: 0.4, nudged to 0. + // Nudged range: [0.0; 63.75]. + // Expected quantized values: 0.0, 0.25, 0.5, ..., 63.75. + TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVars") + .Input(FakeInput(DT_FLOAT)) // inputs + .Input(FakeInput(DT_FLOAT)) // min + .Input(FakeInput(DT_FLOAT)) // max + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + // Downstream inputs. + AddInputFromArray(TensorShape({2, 3}), + {-0.1f, 0.0f, 0.1f, 0.25f, 63.75f, 63.8f}); + // Min. + AddInputFromArray(TensorShape({}), {-0.1f}); + // Max. 
+ AddInputFromArray(TensorShape({}), {63.65f}); + + // Tested code. + TF_ASSERT_OK(RunOpKernel()); + + Tensor* output = GetOutput(0); + Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3})); + FillValues(&expected, + {0.0f, 0.0f, 0.0f, 0.25f, 63.75f, 63.75f}); + ExpectClose(expected, *output); +} + +TEST_F(QuantOpsTest, WithVarsNudgedZeroIs1) { + // Original quantization range: [-0.5 / 4 + 0 / 4, -0.5 / 4 + 255 / 4]. + // Scale: 1/4, original zero point: 0.5, nudged to 1. + // Nudged range: [-0.25; 63.5]. + // Expected quantized values: -0.25, 0.0, 0.25, ..., 63.5. + TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVars") + .Input(FakeInput(DT_FLOAT)) // inputs + .Input(FakeInput(DT_FLOAT)) // min + .Input(FakeInput(DT_FLOAT)) // max + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + // Downstream inputs. + AddInputFromArray(TensorShape({2, 3}), + {-0.26f, -0.25f, -0.24f, 0.0f, 63.5f, 63.6f}); + // Min. + AddInputFromArray(TensorShape({}), {-0.125f}); + // Max. + AddInputFromArray(TensorShape({}), {63.625f}); + + // Tested code. + TF_ASSERT_OK(RunOpKernel()); + + Tensor* output = GetOutput(0); + Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3})); + FillValues(&expected, + {-0.25f, -0.25f, -0.25f, 0.0f, 63.5f, 63.5f}); + ExpectClose(expected, *output); +} + +TEST_F(QuantOpsTest, WithVarsGradient) { + // Original quantization range: [-0.5 / 4 + 0 / 4, -0.5 / 4 + 255 / 4]. + // Scale: 1/4, original zero point: 0.5, nudged to 1. + // Nudged range: [-0.25; 63.5]. + // Expected quantized values: -0.25, 0.0, 0.25, ..., 63.5. + TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsGradient") + .Input(FakeInput(DT_FLOAT)) // gradients + .Input(FakeInput(DT_FLOAT)) // inputs + .Input(FakeInput(DT_FLOAT)) // min + .Input(FakeInput(DT_FLOAT)) // max + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + // Upstream gradients. + AddRandomInput(TensorShape({2, 3})); + // Downstream inputs. 
+ AddInputFromArray(TensorShape({2, 3}), + {-0.26f, -0.25f, -0.24f, 0.0f, 63.5f, 63.6f}); + // Min. + AddInputFromArray(TensorShape({}), {-0.125f}); + // Max. + AddInputFromArray(TensorShape({}), {63.625f}); + + // Tested code. + TF_ASSERT_OK(RunOpKernel()); + + Tensor* output_bprop_wrt_input = GetOutput(0); + Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, TensorShape({2, 3})); + auto in_flat = GetInput(0).flat(); + FillValues(&expected_bprop_wrt_input, + {0.0f, in_flat(1), + in_flat(2), in_flat(3), + in_flat(4), 0.0f}); + ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input); + + Tensor* output_bprop_wrt_min = GetOutput(1); + Tensor expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({})); + expected_bprop_wrt_min.flat()(0) = in_flat(0); + ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min); + + Tensor* output_bprop_wrt_max = GetOutput(2); + Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({})); + expected_bprop_wrt_max.flat()(0) = in_flat(5); + ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max); +} + +TEST_F(QuantOpsTest, WithVarsPerChannelDim1NudgedZeroIs0) { + // Original quantization ranges: [-0.4 / 4 + 0 / 4, -0.4 / 4 + 255 / 4]. + // Scale: 1/4, original zero point: 0.4, nudged to 0. + // Nudged ranges: [0.0; 63.75]. + // Expected quantized values: 0.0, 0.25, 0.5, ..., 63.75. + TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannel") + .Input(FakeInput(DT_FLOAT)) // inputs + .Input(FakeInput(DT_FLOAT)) // min + .Input(FakeInput(DT_FLOAT)) // max + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + // Downstream inputs. + AddInputFromArray(TensorShape({4}), {-0.1f, 0.0f, 63.75f, 63.8f}); + // Min. + AddInputFromArray(TensorShape({4}), {-0.1f, -0.1f, -0.1f, -0.1f}); + // Max. + AddInputFromArray(TensorShape({4}), {63.65f, 63.65f, 63.65f, 63.65f}); + + // Tested code. 
+ TF_ASSERT_OK(RunOpKernel()); + + Tensor* output = GetOutput(0); + Tensor expected(allocator(), DT_FLOAT, TensorShape({4})); + FillValues(&expected, {0.0f, 0.0f, 63.75f, 63.75f}); + ExpectClose(expected, *output); +} + +TEST_F(QuantOpsTest, WithVarsPerChannelDim1NudgedZeroIs1) { + // Original quantization ranges: [-0.5 / 4 + 0 / 4, -0.5 / 4 + 255 / 4]. + // Scale: 1/4, original zero point: 0.5, nudged to 1. + // Nudged ranges: [-0.25; 63.5]. + // Expected quantized values: -0.25, 0.0, 0.25, ..., 63.5. + TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannel") + .Input(FakeInput(DT_FLOAT)) // inputs + .Input(FakeInput(DT_FLOAT)) // min + .Input(FakeInput(DT_FLOAT)) // max + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + // Downstream inputs. + AddInputFromArray(TensorShape({4}), {-0.26f, -0.25f, -0.24f, 63.6f}); + // Min. + AddInputFromArray(TensorShape({4}), + {-0.125f, -0.125f, -0.125f, -0.125f}); + // Max. + AddInputFromArray(TensorShape({4}), + {63.625f, 63.625f, 63.625f, 63.625f}); + + // Tested code. + TF_ASSERT_OK(RunOpKernel()); + + Tensor* output = GetOutput(0); + Tensor expected(allocator(), DT_FLOAT, TensorShape({4})); + FillValues(&expected, {-0.25f, -0.25f, -0.25f, 63.5f}); + ExpectClose(expected, *output); +} + +TEST_F(QuantOpsTest, WithVarsPerChannelDim2NudgedZeroIs0) { + // Original quantization ranges: [-0.4 / 4 + 0 / 4, -0.4 / 4 + 255 / 4]. + // Scale: 1/4, original zero point: 0.4, nudged to 0. + // Nudged ranges: [0.0; 63.75]. + // Expected quantized values: 0.0, 0.25, 0.5, ..., 63.75. + TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannel") + .Input(FakeInput(DT_FLOAT)) // inputs + .Input(FakeInput(DT_FLOAT)) // min + .Input(FakeInput(DT_FLOAT)) // max + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + // Downstream inputs. + AddInputFromArray(TensorShape({2, 3}), + {-0.1f, 0.0f, 0.1f, + 0.25f, 63.75f, 63.8f}); + // Min. + AddInputFromArray(TensorShape({3}), {-0.1f, -0.1f, -0.1f}); + // Max. 
+ AddInputFromArray(TensorShape({3}), {63.65f, 63.65f, 63.65f}); + + // Tested code. + TF_ASSERT_OK(RunOpKernel()); + + Tensor* output = GetOutput(0); + Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3})); + FillValues(&expected, {0.0f, 0.0f, 0.0f, + 0.25f, 63.75f, 63.75f}); + ExpectClose(expected, *output); +} + +TEST_F(QuantOpsTest, WithVarsPerChannelDim2NudgedZeroIs1) { + // Original quantization ranges: [-0.5 / 4 + 0 / 4, -0.5 / 4 + 255 / 4]. + // Scale: 1/4, original zero point: 0.5, nudged to 1. + // Nudged ranges: [-0.25; 63.5]. + // Expected quantized values: -0.25, 0.0, 0.25, ..., 63.5. + TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannel") + .Input(FakeInput(DT_FLOAT)) // inputs + .Input(FakeInput(DT_FLOAT)) // min + .Input(FakeInput(DT_FLOAT)) // max + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + // Downstream inputs. + AddInputFromArray(TensorShape({2, 3}), + {-0.26f, -0.25f, -0.24f, + 0.0f, 63.5f, 63.6f}); + // Min. + AddInputFromArray(TensorShape({3}), {-0.125f, -0.125f, -0.125f}); + // Max. + AddInputFromArray(TensorShape({3}), {63.625f, 63.625f, 63.625f}); + + // Tested code. + TF_ASSERT_OK(RunOpKernel()); + + Tensor* output = GetOutput(0); + Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3})); + FillValues(&expected, {-0.25f, -0.25f, -0.25f, + 0.0f, 63.5f, 63.5f}); + ExpectClose(expected, *output); +} + +TEST_F(QuantOpsTest, WithVarsPerChannelDim4NudgedZeroIs0) { + // Original quantization ranges: [-0.4 / 4 + 0 / 4, -0.4 / 4 + 255 / 4]. + // Scale: 1/4, original zero point: 0.4, nudged to 0. + // Nudged ranges: [0.0; 63.75]. + // Expected quantized values: 0.0, 0.25, 0.5, ..., 63.75. + TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannel") + .Input(FakeInput(DT_FLOAT)) // inputs + .Input(FakeInput(DT_FLOAT)) // min + .Input(FakeInput(DT_FLOAT)) // max + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + // Downstream inputs. 
+ AddInputFromArray(TensorShape({1, 2, 3, 4}), + {-0.1f, 0.0f, 0.1f, 0.25f, + 0.5f, 0.75f, 1.0f, 1.25f, + 1.5f, 1.75f, 2.0f, 2.25f, + + 63.0f, 63.25f, 63.5f, 63.7f, + 63.75f, 63.8f, 63.9f, 100.0f, + 100.0f, 100.0f, 100.0f, 1000.0f}); + // Min. + AddInputFromArray(TensorShape({4}), {-0.1f, -0.1f, -0.1f, -0.1f}); + // Max. + AddInputFromArray(TensorShape({4}), {63.65f, 63.65f, 63.65f, 63.65f}); + + // Tested code. + TF_ASSERT_OK(RunOpKernel()); + + Tensor* output = GetOutput(0); + Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 2, 3, 4})); + FillValues(&expected, + {0.0f, 0.0f, 0.0f, 0.25f, + 0.5f, 0.75f, 1.0f, 1.25f, + 1.5f, 1.75f, 2.0f, 2.25f, + + 63.0f, 63.25f, 63.5f, 63.75f, + 63.75f, 63.75f, 63.75f, 63.75f, + 63.75f, 63.75f, 63.75f, 63.75f}); + ExpectClose(expected, *output); +} + +TEST_F(QuantOpsTest, WithVarsPerChannelDim4NudgedZeroIs1) { + // Original quantization ranges: [-0.5 / 4 + 0 / 4, -0.5 / 4 + 255 / 4]. + // Scale: 1/4, original zero point: 0.5, nudged to 1. + // Nudged ranges: [-0.25; 63.5]. + // Expected quantized values: -0.25, 0.0, 0.25, ..., 63.5. + TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannel") + .Input(FakeInput(DT_FLOAT)) // inputs + .Input(FakeInput(DT_FLOAT)) // min + .Input(FakeInput(DT_FLOAT)) // max + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + // Downstream inputs. + AddInputFromArray(TensorShape({1, 2, 3, 4}), + {-0.3f, -0.25f, -0.2f, 0.0f, + 0.25f, 0.5f, 0.75f, 1.0f, + 1.25f, 1.5f, 1.75f, 2.0f, + + 63.0f, 63.25f, 63.4f, 63.5f, + 63.6f, 63.7f, 100.0f, 100.0f, + 100.0f, 100.0f, 100.0f, 1000.0f}); + // Min. + AddInputFromArray(TensorShape({4}), + {-0.125f, -0.125f, -0.125f, -0.125f}); + // Max. + AddInputFromArray(TensorShape({4}), + {63.625f, 63.625f, 63.625f, 63.625f}); + + // Tested code. 
+ TF_ASSERT_OK(RunOpKernel()); + + Tensor* output = GetOutput(0); + Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 2, 3, 4})); + FillValues(&expected, + {-0.25f, -0.25f, -0.25f, 0.0f, + 0.25f, 0.5f, 0.75f, 1.0f, + 1.25f, 1.5f, 1.75f, 2.0f, + + 63.0f, 63.25f, 63.5f, 63.5f, + 63.5f, 63.5f, 63.5f, 63.5f, + 63.5f, 63.5f, 63.5f, 63.5f}); + ExpectClose(expected, *output); +} + +TEST_F(QuantOpsTest, WithVarsPerChannelDim1GradientNudgedZeroIs0) { + // Original quantization ranges: [-0.4 / 4 + 0 / 4, -0.4 / 4 + 255 / 4]. + // Scale: 1/4, original zero point: 0.4, nudged to 0. + // Nudged ranges: [0.0; 63.75]. + // Expected quantized values: 0.0, 0.25, 0.5, ..., 63.75. + TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannelGradient") + .Input(FakeInput(DT_FLOAT)) // gradients + .Input(FakeInput(DT_FLOAT)) // inputs + .Input(FakeInput(DT_FLOAT)) // min + .Input(FakeInput(DT_FLOAT)) // max + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + // Upstream gradients. + AddRandomInput(TensorShape({4})); + // Downstream inputs. + AddInputFromArray(TensorShape({4}), {-0.1f, 0.0f, 63.75f, 63.8f}); + // Min. + AddInputFromArray(TensorShape({4}), {-0.1f, -0.1f, -0.1f, -0.1f}); + // Max. + AddInputFromArray(TensorShape({4}), {63.65f, 63.65f, 63.65f, 63.65f}); + + // Tested code. 
+ TF_ASSERT_OK(RunOpKernel()); + + Tensor* output_bprop_wrt_input = GetOutput(0); + Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, TensorShape({4})); + auto grad_flat = GetInput(0).flat(); + FillValues(&expected_bprop_wrt_input, + {0.0f, grad_flat(1), grad_flat(2), 0.0f}); + ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input); + + Tensor* output_bprop_wrt_min = GetOutput(1); + Tensor expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({4})); + FillValues(&expected_bprop_wrt_min, + {grad_flat(0), 0.0f, 0.0f, 0.0f}); + ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min); + + Tensor* output_bprop_wrt_max = GetOutput(2); + Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({4})); + FillValues(&expected_bprop_wrt_max, + {0.0f, 0.0f, 0.0f, grad_flat(3)}); + ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max); +} + +TEST_F(QuantOpsTest, WithVarsPerChannelDim1GradientNudgedZeroIs1) { + // Original quantization ranges: [-0.5 / 4 + 0 / 4, -0.5 / 4 + 255 / 4]. + // Scale: 1/4, original zero point: 0.5, nudged to 1. + // Nudged ranges: [-0.25; 63.5]. + // Expected quantized values: -0.25, 0.0, 0.25, ..., 63.5. + TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannelGradient") + .Input(FakeInput(DT_FLOAT)) // gradients + .Input(FakeInput(DT_FLOAT)) // inputs + .Input(FakeInput(DT_FLOAT)) // min + .Input(FakeInput(DT_FLOAT)) // max + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + // Upstream gradients. + AddRandomInput(TensorShape({4})); + // Downstream inputs. + AddInputFromArray(TensorShape({4}), {-0.3f, -0.25f, 63.5f, 63.6f}); + // Min. + AddInputFromArray(TensorShape({4}), + {-0.125f, -0.125f, -0.125f, -0.125f}); + // Max. + AddInputFromArray(TensorShape({4}), + {63.625f, 63.625f, 63.625f, 63.625f}); + + // Tested code. 
+ TF_ASSERT_OK(RunOpKernel()); + + Tensor* output_bprop_wrt_input = GetOutput(0); + Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, TensorShape({4})); + auto grad_flat = GetInput(0).flat(); + FillValues(&expected_bprop_wrt_input, + {0.0f, grad_flat(1), grad_flat(2), 0.0f}); + ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input); + + Tensor* output_bprop_wrt_min = GetOutput(1); + Tensor expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({4})); + FillValues(&expected_bprop_wrt_min, + {grad_flat(0), 0.0f, 0.0f, 0.0f}); + ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min); + + Tensor* output_bprop_wrt_max = GetOutput(2); + Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({4})); + FillValues(&expected_bprop_wrt_max, + {0.0f, 0.0f, 0.0f, grad_flat(3)}); + ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max); +} + +TEST_F(QuantOpsTest, WithVarsPerChannelDim2GradientNudgedZeroIs0) { + // Original quantization ranges: [-0.4 / 4 + 0 / 4, -0.4 / 4 + 255 / 4]. + // Scale: 1/4, original zero point: 0.4, nudged to 0. + // Nudged ranges: [0.0; 63.75]. + // Expected quantized values: 0.0, 0.25, 0.5, ..., 63.75. + TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannelGradient") + .Input(FakeInput(DT_FLOAT)) // gradients + .Input(FakeInput(DT_FLOAT)) // inputs + .Input(FakeInput(DT_FLOAT)) // min + .Input(FakeInput(DT_FLOAT)) // max + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + // Upstream gradients. + AddRandomInput(TensorShape({2, 3})); + // Downstream inputs. + AddInputFromArray(TensorShape({2, 3}), + {-0.1f, 0.0f, 0.1f, + 0.25f, 63.75f, 63.8f}); + // Min. + AddInputFromArray(TensorShape({3}), {-0.1f, -0.1f, -0.1f}); + // Max. + AddInputFromArray(TensorShape({3}), {63.65f, 63.65f, 63.65f}); + + // Tested code. 
+ TF_ASSERT_OK(RunOpKernel()); + + Tensor* output_bprop_wrt_input = GetOutput(0); + Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, TensorShape({2, 3})); + auto grad_flat = GetInput(0).flat(); + FillValues(&expected_bprop_wrt_input, + {0.0f, grad_flat(1), grad_flat(2), + grad_flat(3), grad_flat(4), 0.0f}); + ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input); + + Tensor* output_bprop_wrt_min = GetOutput(1); + Tensor expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({3})); + FillValues(&expected_bprop_wrt_min, + {grad_flat(0), 0.0f, 0.0f}); + ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min); + + Tensor* output_bprop_wrt_max = GetOutput(2); + Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({3})); + FillValues(&expected_bprop_wrt_max, + {0.0f, 0.0f, grad_flat(5)}); + ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max); +} + +TEST_F(QuantOpsTest, WithVarsPerChannelDim2GradientNudgedZeroIs1) { + // Original quantization ranges: [-0.5 / 4 + 0 / 4, -0.5 / 4 + 255 / 4]. + // Scale: 1/4, original zero point: 0.5, nudged to 1. + // Nudged ranges: [-0.25; 63.5]. + // Expected quantized values: -0.25, 0.0, 0.25, ..., 63.5. + TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannelGradient") + .Input(FakeInput(DT_FLOAT)) // gradients + .Input(FakeInput(DT_FLOAT)) // inputs + .Input(FakeInput(DT_FLOAT)) // min + .Input(FakeInput(DT_FLOAT)) // max + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + // Upstream gradients. + AddRandomInput(TensorShape({2, 3})); + // Downstream inputs. + AddInputFromArray(TensorShape({2, 3}), + {-0.3f, -0.25f, -0.2f, + 0.0f, 63.5f, 63.6f}); + // Min. + AddInputFromArray(TensorShape({3}), {-0.125f, -0.125f, -0.125f}); + // Max. + AddInputFromArray(TensorShape({3}), {63.625f, 63.625f, 63.625f}); + + // Tested code. 
+ TF_ASSERT_OK(RunOpKernel()); + + Tensor* output_bprop_wrt_input = GetOutput(0); + Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, TensorShape({2, 3})); + auto grad_flat = GetInput(0).flat(); + FillValues(&expected_bprop_wrt_input, + {0.0f, grad_flat(1), grad_flat(2), + grad_flat(3), grad_flat(4), 0.0f}); + ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input); + + Tensor* output_bprop_wrt_min = GetOutput(1); + Tensor expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({3})); + FillValues(&expected_bprop_wrt_min, + {grad_flat(0), 0.0f, 0.0f}); + ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min); + + Tensor* output_bprop_wrt_max = GetOutput(2); + Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({3})); + FillValues(&expected_bprop_wrt_max, + {0.0f, 0.0f, grad_flat(5)}); + ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max); +} + +TEST_F(QuantOpsTest, WithVarsPerChannelDim4GradientNudgedZeroIs0) { + // Original quantization ranges: [-0.4 / 4 + 0 / 4, -0.4 / 4 + 255 / 4]. + // Scale: 1/4, original zero point: 0.4, nudged to 0. + // Nudged ranges: [0.0; 63.75]. + // Expected quantized values: 0.0, 0.25, 0.5, ..., 63.75. + TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannelGradient") + .Input(FakeInput(DT_FLOAT)) // gradients + .Input(FakeInput(DT_FLOAT)) // inputs + .Input(FakeInput(DT_FLOAT)) // min + .Input(FakeInput(DT_FLOAT)) // max + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + // Upstream gradients. + AddRandomInput(TensorShape({1, 2, 3, 4})); + // Downstream inputs. + AddInputFromArray(TensorShape({1, 2, 3, 4}), + {-0.1f, 0.0f, 63.75f, 63.8f, + -0.1f, 0.0f, 63.75f, 63.8f, + -0.1f, 0.0f, 63.75f, 63.8f, + + -0.1f, 0.0f, 63.75f, 63.8f, + -0.1f, 0.0f, 63.75f, 63.8f, + -0.1f, 0.0f, 63.75f, 63.8f}); + // Min. + AddInputFromArray(TensorShape({4}), {-0.1f, -0.1f, -0.1f, -0.1f}); + // Max. + AddInputFromArray(TensorShape({4}), {63.65f, 63.65f, 63.65f, 63.65f}); + + // Tested code. 
+ TF_ASSERT_OK(RunOpKernel()); + + Tensor* output_bprop_wrt_input = GetOutput(0); + Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, + TensorShape({1, 2, 3, 4})); + auto grad_flat = GetInput(0).flat(); + FillValues( + &expected_bprop_wrt_input, + {0.0f, grad_flat(1), grad_flat(2), 0.0f, + 0.0f, grad_flat(5), grad_flat(6), 0.0f, + 0.0f, grad_flat(9), grad_flat(10), 0.0f, + + 0.0f, grad_flat(13), grad_flat(14), 0.0f, + 0.0f, grad_flat(17), grad_flat(18), 0.0f, + 0.0f, grad_flat(21), grad_flat(22), 0.0f}); + ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input); + + Tensor* output_bprop_wrt_min = GetOutput(1); + Tensor expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({4})); + FillValues(&expected_bprop_wrt_min, + {grad_flat(0) + grad_flat(4) + grad_flat(8) + + grad_flat(12) + grad_flat(16) + grad_flat(20), + 0.0f, 0.0f, 0.0f}); + ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min); + + Tensor* output_bprop_wrt_max = GetOutput(2); + Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({4})); + FillValues(&expected_bprop_wrt_max, + {0.0f, 0.0f, 0.0f, + grad_flat(3) + grad_flat(7) + grad_flat(11) + + grad_flat(15) + grad_flat(19) + grad_flat(23)}); + ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max); +} + +TEST_F(QuantOpsTest, WithVarsPerChannelDim4GradientNudgedZeroIs1) { + // Original quantization ranges: [-0.5 / 4 + 0 / 4, -0.5 / 4 + 255 / 4]. + // Scale: 1/4, original zero point: 0.5, nudged to 1. + // Nudged ranges: [-0.25; 63.5]. + // Expected quantized values: -0.25, 0.0, 0.25, ..., 63.5. + TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannelGradient") + .Input(FakeInput(DT_FLOAT)) // gradients + .Input(FakeInput(DT_FLOAT)) // inputs + .Input(FakeInput(DT_FLOAT)) // min + .Input(FakeInput(DT_FLOAT)) // max + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + // Upstream gradients. + AddRandomInput(TensorShape({1, 2, 3, 4})); + // Downstream inputs. 
+ AddInputFromArray(TensorShape({1, 2, 3, 4}), + {-0.3f, -0.25f, 63.5f, 63.6f, + -0.3f, -0.25f, 63.5f, 63.6f, + -0.3f, -0.25f, 63.5f, 63.6f, + + -0.3f, -0.25f, 63.5f, 63.6f, + -0.3f, -0.25f, 63.5f, 63.6f, + -0.3f, -0.25f, 63.5f, 63.6f}); + // Min. + AddInputFromArray(TensorShape({4}), + {-0.125f, -0.125f, -0.125f, -0.125f}); + // Max. + AddInputFromArray(TensorShape({4}), + {63.625f, 63.625f, 63.625f, 63.625f}); + + // Tested code. + TF_ASSERT_OK(RunOpKernel()); + + Tensor* output_bprop_wrt_input = GetOutput(0); + Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, + TensorShape({1, 2, 3, 4})); + auto grad_flat = GetInput(0).flat(); + FillValues(&expected_bprop_wrt_input, + {0.0f, grad_flat(1), grad_flat(2), 0.0f, + 0.0f, grad_flat(5), grad_flat(6), 0.0f, + 0.0f, grad_flat(9), grad_flat(10), 0.0f, + + 0.0f, grad_flat(13), grad_flat(14), 0.0f, + 0.0f, grad_flat(17), grad_flat(18), 0.0f, + 0.0f, grad_flat(21), grad_flat(22), 0.0f}); + ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input); + + Tensor* output_bprop_wrt_min = GetOutput(1); + Tensor expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({4})); + FillValues(&expected_bprop_wrt_min, + {grad_flat(0) + grad_flat(4) + grad_flat(8) + + grad_flat(12) + grad_flat(16) + grad_flat(20), + 0.0f, 0.0f, 0.0f}); + ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min); + + Tensor* output_bprop_wrt_max = GetOutput(2); + Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({4})); + FillValues(&expected_bprop_wrt_max, + {0.0f, 0.0f, 0.0f, + grad_flat(3) + grad_flat(7) + grad_flat(11) + + grad_flat(15) + grad_flat(19) + grad_flat(23)}); + ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max); +} + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/function_ops.cc b/tensorflow/core/kernels/function_ops.cc index 56253eb64a7..4a08f98b33b 100644 --- a/tensorflow/core/kernels/function_ops.cc +++ b/tensorflow/core/kernels/function_ops.cc @@ -23,6 +23,7 @@ limitations under the 
License. #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/gradients.h" #include "tensorflow/core/graph/graph_constructor.h" @@ -86,26 +87,27 @@ class RetvalOp : public OpKernel { REGISTER_KERNEL_BUILDER(Name("_Arg").Device(DEVICE_CPU), ArgOp); REGISTER_KERNEL_BUILDER(Name("_Retval").Device(DEVICE_CPU), RetvalOp); -#define REGISTER_GPU_KERNELS(type) \ - REGISTER_KERNEL_BUILDER( \ - Name("_Arg").Device(DEVICE_GPU).TypeConstraint("T"), ArgOp); \ - REGISTER_KERNEL_BUILDER( \ - Name("_Retval").Device(DEVICE_GPU).TypeConstraint("T"), RetvalOp); -REGISTER_GPU_KERNELS(Eigen::half); -REGISTER_GPU_KERNELS(float); -REGISTER_GPU_KERNELS(double); -#undef REGISTER_GPU_KERNELS +#define REGISTER(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("_Arg").Device(DEVICE_GPU).TypeConstraint("T"), ArgOp); +TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER) +TF_CALL_bool(REGISTER) REGISTER_KERNEL_BUILDER(Name("_Arg") + .Device(DEVICE_GPU) + .HostMemory("output") + .TypeConstraint("T"), + ArgOp); +#undef REGISTER -REGISTER_KERNEL_BUILDER(Name("_Arg") - .Device(DEVICE_GPU) - .HostMemory("output") - .TypeConstraint("T"), - ArgOp); -REGISTER_KERNEL_BUILDER(Name("_Retval") - .Device(DEVICE_GPU) - .HostMemory("input") - .TypeConstraint("T"), - RetvalOp); +#define REGISTER(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("_Retval").Device(DEVICE_GPU).TypeConstraint("T"), RetvalOp); +TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER) +TF_CALL_bool(REGISTER) REGISTER_KERNEL_BUILDER(Name("_Retval") + .Device(DEVICE_GPU) + .HostMemory("input") + .TypeConstraint("T"), + RetvalOp); +#undef REGISTER class PassOn : public OpKernel { public: diff --git a/tensorflow/core/kernels/hexagon/BUILD b/tensorflow/core/kernels/hexagon/BUILD index 0454289b991..72b603463b3 100644 --- a/tensorflow/core/kernels/hexagon/BUILD +++ 
b/tensorflow/core/kernels/hexagon/BUILD @@ -30,6 +30,7 @@ tf_cc_test( name = "quantized_matmul_op_for_hexagon_test", size = "small", srcs = ["quantized_matmul_op_for_hexagon_test.cc"], + tags = ["nomsan"], # http://b/32242946 deps = [ "//tensorflow/core:framework", "//tensorflow/core:protos_all_cc", diff --git a/tensorflow/core/kernels/meta_support.cc b/tensorflow/core/kernels/meta_support.cc new file mode 100644 index 00000000000..4ef56d1987b --- /dev/null +++ b/tensorflow/core/kernels/meta_support.cc @@ -0,0 +1,373 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#define EIGEN_USE_THREADS + +#include "tensorflow/core/kernels/meta_support.h" + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/kernels/quantization_utils.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/mutex.h" + +#if (defined(GEMMLOWP_NEON_32) || defined(GEMMLOWP_NEON_64)) && \ + !defined(TENSORFLOW_DISABLE_META) && !defined(__APPLE__) +#define TENSORFLOW_USE_META (1) +#endif + +namespace tensorflow { +namespace meta { + +namespace { + +int g_num_threads = 0; +bool g_enabled = true; +bool g_use_local_context = false; + +#ifdef TENSORFLOW_USE_META + +uint8_t* GetScratch() { + static uint8_t* scratch = new uint8_t[2048 * 1024]; + return scratch; +} + +gemmlowp::WorkersPool* GetWorkersPool() { + static gemmlowp::WorkersPool* pool = new gemmlowp::WorkersPool(); + return pool; +} + +mutex& GetMutex() { + static mutex mu; + return mu; +} + +int GetWorkersCount(OpKernelContext* tf_context) { + if (g_num_threads == 0) { + return tf_context->device()->tensorflow_cpu_worker_threads()->num_threads; + } + return g_num_threads; +} + +typedef gemmlowp::meta::SimpleContext LocalContext; + +template +void MultiThreadGemm(Context* context, const Params& params) { + if (params.m <= 4) { + gemmlowp::meta::Gemm, + Params, 1, 8, 8>(params); + } else { + if (params.m >= params.n) { + gemmlowp::meta::MultiThreadGemm< + Context, gemmlowp::meta::GemmExecutorPackRHSCacheFriendly<>, Params, + 2, 4, 8>(context, params); + } else { + gemmlowp::meta::MultiThreadGemm< + Context, gemmlowp::meta::GemmExecutorPackLHSCacheFriendly<>, Params, + 2, 4, 8>(context, params); + } + } +} + +template +void QuantizedGemmImpl(OpKernelContext* tf_context, const quint8* a_data, + const quint8* b_data, qint32* c_data, int m, int n, + int k, int offset_a, int offset_b, int lda, int ldb, + int ldc) { + typedef gemmlowp::meta::GemmParams< + uint8_t, int32_t, 
LeftStream, RightStream, + gemmlowp::meta::QuantizedStaticPreprocessedAsInt32, + gemmlowp::meta::RowMajor> + Params; + Params params; + + params.m = m; + params.n = n; + params.k = k; + + params.lhs = reinterpret_cast(&(a_data->value)); + params.rhs = reinterpret_cast(&(b_data->value)); + params.result = reinterpret_cast(&(c_data->value)); + params.scratch = GetScratch(); + + params.left_stream.count = k; + params.left_stream.stride = lda; + params.left_stream.multiplicative_sum_offset = offset_b; + params.left_stream.additive_sum_offset = k * offset_a * offset_b; + + params.right_stream.count = k; + params.right_stream.stride = ldb; + params.right_stream.multiplicative_sum_offset = offset_a; + params.right_stream.additive_sum_offset = 0; + + params.fused_kernel.kernel.count = k; + params.fused_kernel.output_stream.stride = ldc * sizeof(int32_t); + + if (g_use_local_context) { + LocalContext local_context(GetWorkersCount(tf_context), GetWorkersPool()); + MultiThreadGemm(&local_context, params); + } else { + auto& workers = *(tf_context->device()->tensorflow_cpu_worker_threads()); + TensorflowGemmContext context(workers.num_threads, workers.workers); + MultiThreadGemm(&context, params); + } +} + +template +void MultiThreadTransform1D(OpKernelContext* tf_context, const Params& params) { + if (g_use_local_context) { + LocalContext local_context(GetWorkersCount(tf_context), GetWorkersPool()); + gemmlowp::meta::MultiThreadTransform1D( + &local_context, params); + } else { + auto& workers = *(tf_context->device()->tensorflow_cpu_worker_threads()); + TensorflowGemmContext context(workers.num_threads, workers.workers); + gemmlowp::meta::MultiThreadTransform1D(&context, params); + } +} + +template +double CalculateRangeScale(float min, float max) { + const int bits = sizeof(QuantizedType) * 8; + return static_cast(max - min) / + ((static_cast(1) << bits) - 1); +} + +template +double CalculateOneOverRangeScale(float min, float max) { + if (min == max) { + return 0.0; + } + 
const int bits = sizeof(QuantizedType) * 8; + return static_cast((static_cast(1) << bits) - 1) / + (max - min); +} + +#endif // TENSORFLOW_USE_META + +} // namespace + +void SetNumThreads(int num_threads) { g_num_threads = num_threads; } + +int GetNumThreads() { return g_num_threads; } + +void SetUseLocalContext(bool use_local_context) { + g_use_local_context = use_local_context; +} + +bool GetUseLocalContext() { return g_use_local_context; } + +bool IsSupported() { +#if defined(TENSORFLOW_USE_META) + return true; +#else + return false; +#endif +} + +bool IsEnabled() { return g_enabled; } + +void SetEnabled(bool enabled) { g_enabled = enabled; } + +bool IsSupportedAndEnabled() { return IsSupported() && IsEnabled(); } + +void QuantizedGemm(OpKernelContext* tf_context, bool transpose_a, + bool transpose_b, const quint8* a_data, const quint8* b_data, + qint32* c_data, int m, int n, int k, int offset_a, + int offset_b, int lda, int ldb, int ldc) { +#ifdef TENSORFLOW_USE_META + mutex_lock library_lock(GetMutex()); + if (transpose_a) { + if (transpose_b) { + QuantizedGemmImpl( + tf_context, a_data, b_data, c_data, m, n, k, offset_a, offset_b, lda, + ldb, ldc); + } else { + QuantizedGemmImpl( + tf_context, a_data, b_data, c_data, m, n, k, offset_a, offset_b, lda, + ldb, ldc); + } + } else { + if (transpose_b) { + QuantizedGemmImpl( + tf_context, a_data, b_data, c_data, m, n, k, offset_a, offset_b, lda, + ldb, ldc); + } else { + QuantizedGemmImpl( + tf_context, a_data, b_data, c_data, m, n, k, offset_a, offset_b, lda, + ldb, ldc); + } + } +#else + LOG(FATAL) << "QuantizedGemm: Meta fastpath not supported."; +#endif +} + +void Requantize(OpKernelContext* tf_context, const qint32* input, int count, + float input_min, float input_max, float output_min, + float output_max, quint8* output) { +#ifdef TENSORFLOW_USE_META + mutex_lock library_lock(GetMutex()); + typedef gemmlowp::meta::Transform1DParams + Params; + + Params params; + params.input = reinterpret_cast(input); + 
params.output = reinterpret_cast(output); + params.kernel.count = count; + params.kernel.input_range_min = input_min; + params.kernel.output_range_min = output_min; + params.kernel.input_range_scale = + CalculateRangeScale(input_min, input_max); + params.kernel.one_over_output_range_scale = + CalculateOneOverRangeScale(output_min, output_max); + params.kernel.input_range_offset = + static_cast(std::numeric_limits::lowest()); + + // After adding the output_range_offset the value is cast from float to uint. + // The float to int/uint cast in NEON uses round toward 0. To keep the + // rounding consistent with Eigen, which uses round toward closest, we can + // add 0.5f and exploit the fact that we only operate on non negative values. + // TODO(maciekc): fix the actual kernel in gemmlowp/meta + params.kernel.output_range_offset = + static_cast(std::numeric_limits::lowest()) + 0.5f; + + MultiThreadTransform1D(tf_context, params); +#else + LOG(FATAL) << "Requantize: Meta fastpath not supported."; +#endif +} + +void Dequantize(OpKernelContext* tf_context, const quint8* input, int count, + float range_min, float range_max, float* output) { +#ifdef TENSORFLOW_USE_META + mutex_lock library_lock(GetMutex()); + typedef gemmlowp::meta::Transform1DParams + Params; + + Params params; + params.input = reinterpret_cast(input); + params.output = reinterpret_cast(output); + params.kernel.count = count; + params.kernel.range_min = range_min; + params.kernel.range_scale = + CalculateRangeScale(range_min, range_max); + params.kernel.range_offset = + static_cast(std::numeric_limits::lowest()); + + MultiThreadTransform1D(tf_context, params); +#else + LOG(FATAL) << "Dequantize: Meta fastpath not supported."; +#endif +} + +void Quantize(OpKernelContext* tf_context, const float* input, int count, + float range_min, float range_max, quint8* output) { +#ifdef TENSORFLOW_USE_META + mutex_lock library_lock(GetMutex()); + typedef gemmlowp::meta::Transform1DParams + Params; + + Params params; + 
+ params.input = reinterpret_cast(input); + params.output = reinterpret_cast(output); + params.kernel.count = count; + params.kernel.range_min = range_min; + params.kernel.range_scale = + CalculateOneOverRangeScale(range_min, range_max); + + // After adding the range_offset the value is cast from float to uint. + // The float to int/uint cast in NEON uses round toward 0. To keep the + // rounding consistent with Eigen, which uses round toward closest, we can + // add 0.5f and exploit the fact that we only operate on non negative values. + // TODO(maciekc): fix the actual kernel in gemmlowp/meta + params.kernel.range_offset = + static_cast(std::numeric_limits::lowest()) + 0.5f; + + MultiThreadTransform1D(tf_context, params); +#else + LOG(FATAL) << "Quantize: Meta fastpath not supported."; +#endif +} + +void QuantizedBiasAdd(OpKernelContext* tf_context, const quint8* input, + int input_count, const quint8* bias, int bias_count, + float input_min, float input_max, float bias_min, + float bias_max, float output_min, float output_max, + qint32* output) { +#ifdef TENSORFLOW_USE_META + mutex_lock library_lock(GetMutex()); + typedef gemmlowp::meta::Transform1DParams> + Params; + + Params params; + params.input = reinterpret_cast(input); + params.output = reinterpret_cast(output); + params.kernel.bias = reinterpret_cast(bias); + params.kernel.count = bias_count; + params.kernel.rows = input_count / bias_count; + params.kernel.input_range_min = input_min; + params.kernel.bias_range_min = bias_min; + params.kernel.input_range_scale = + CalculateRangeScale(input_min, input_max); + params.kernel.bias_range_scale = + CalculateRangeScale(bias_min, bias_max); + params.kernel.input_range_offset = 0; + params.kernel.bias_range_offset = 0; + params.kernel.output_range_min = output_min; + params.kernel.one_over_output_range_scale = + CalculateOneOverRangeScale(output_min, output_max); + params.kernel.output_range_offset = + static_cast(std::numeric_limits::lowest()); + + // 
TODO(maciekc): add multithreading to bias add. + // Right now this kernel does not support multi threaded execution. + gemmlowp::meta::Transform1D(params); +#else + LOG(FATAL) << "QuantizedBiasAdd: Meta fastpath not supported."; +#endif +} + +void Clamp(OpKernelContext* tf_context, const quint8* input, int count, + quint8 clamp_min, quint8 clamp_max, quint8* output) { +#ifdef TENSORFLOW_USE_META + mutex_lock library_lock(GetMutex()); + typedef gemmlowp::meta::Transform1DParams> + Params; + + Params params; + params.input = reinterpret_cast(input); + params.output = reinterpret_cast(output); + params.kernel.count = count; + params.kernel.min = clamp_min; + params.kernel.max = clamp_max; + + MultiThreadTransform1D(tf_context, params); +#else + LOG(FATAL) << "Clamp: Meta fastpath not supported."; +#endif +} + +} // namespace meta +} // namespace tensorflow diff --git a/tensorflow/core/kernels/meta_support.h b/tensorflow/core/kernels/meta_support.h new file mode 100644 index 00000000000..0d87baf0344 --- /dev/null +++ b/tensorflow/core/kernels/meta_support.h @@ -0,0 +1,112 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_QUANTIZATION_KERNELS_META_SUPPORT_H_ +#define THIRD_PARTY_TENSORFLOW_CONTRIB_QUANTIZATION_KERNELS_META_SUPPORT_H_ + +#include "meta/multi_thread_gemm.h" +#include "meta/multi_thread_transform.h" +#include "meta/quantized_mul_kernels.h" +#include "meta/streams.h" +#include "meta/transform_kernels.h" + +#include "tensorflow/core/framework/numeric_types.h" + +namespace tensorflow { + +class OpKernelContext; + +namespace meta { + +// Gemmlowp/meta is a small library of optimized Arm32/64 kernels for quantized +// matrix multiplication and other quantized computations. + +// Set the maximum number of threads of computation that the internal workers +// pool can use. If num_threads is 0, then use intra_op_parallelism_threads. +void SetNumThreads(int num_threads); + +int GetNumThreads(); + +// Toggle the internal workers pool. If set to false, the computations will +// use the worker pool passed each time in the OpKernelContext. If set to true +// then the OpKernelContext will be ignored, and the internal optimized workers +// pool will be used. +// +// The internal workers pool is disabled by default (false). +void SetUseLocalContext(bool use_local_context); + +bool GetUseLocalContext(); + +// Toggles the codepath. Enabled by default (true) on supported platforms. +void SetEnabled(bool enabled); + +// Returns true if the codepath is supported and is enabled. Use this call +// before calling the compute functions. If the codepath is not supported, and +// any of the compute function is called, the library will log a FATAL error. +bool IsSupportedAndEnabled(); + +// Calculate the quantized matrix multiplication: +// +// for (i, j) in [0, m) x [0, n) do +// c_data[i, j] := +// sum((a_data[i, l] + offset_a) * (b_data[l, j] + offset_b)) : l in [0, k) +// +// If transpose_a is false the lhs operand has row major layout, otherwise +// column major. 
Similarly transpose_b describes the layout of the rhs operand. +// lda, ldb, and ldc are the strides of the lhs operand, rhs operand and the +// result arrays. +void QuantizedGemm(OpKernelContext* context, bool transpose_a, bool transpose_b, + const quint8* a_data, const quint8* b_data, qint32* c_data, + int m, int n, int k, int offset_a, int offset_b, int lda, + int ldb, int ldc); + +// Take an array of numbers from the range [input_min, input_max] quantized +// uniformly to int32 values, recover their float values, and then quantize +// them back uniformly to the range [output_min, output_max] as uint8. +// Saturate the uint8 values. +void Requantize(OpKernelContext* context, const qint32* input, int count, + float input_min, float input_max, float output_min, + float output_max, quint8* output); + +// Take an array of numbers from the range [range_min, range_max] quantized +// uniformly to uint8 values and recover their float values. +void Dequantize(OpKernelContext* context, const quint8* input, int count, + float range_min, float range_max, float* output); + +// Take an array of float values and quantize them uniformly to the range +// [range_min, range_max] expressed as uint8. Saturate the uint8 values. +void Quantize(OpKernelContext*, const float* input, int count, float range_min, + float range_max, quint8* output); + +// Take two arrays: the inputs and the bias quantized uniformly in the ranges +// [input_min, input_max], and [bias_min, bias_max] accordingly, as uint8 +// values. Recover their float values. Add the values. Quantize them back +// uniformly to the range [output_min, output_max] as int32. Saturate the +// int32 values. 
+void QuantizedBiasAdd(OpKernelContext* context, const quint8* input, + int input_count, const quint8* bias, int bias_count, + float input_min, float input_max, float bias_min, + float bias_max, float output_min, float output_max, + qint32* output); + +// Take an array of uint8 values and clamp them to the range [clamp_min, +// clamp_max]. +void Clamp(OpKernelContext* context, const quint8* input, int input_count, + quint8 clamp_min, quint8 clamp_max, quint8* output); + +} // namespace meta +} // namespace tensorflow + +#endif // THIRD_PARTY_TENSORFLOW_CONTRIB_QUANTIZATION_KERNELS_META_SUPPORT_H_ diff --git a/tensorflow/core/kernels/quantize_down_and_shrink_range.cc b/tensorflow/core/kernels/quantize_down_and_shrink_range.cc index aef5f0b6a35..9893a855877 100644 --- a/tensorflow/core/kernels/quantize_down_and_shrink_range.cc +++ b/tensorflow/core/kernels/quantize_down_and_shrink_range.cc @@ -20,11 +20,12 @@ limitations under the License. #include #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" -#include "tensorflow/core/kernels/quantization_utils.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/type_traits.h" #include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/meta_support.h" +#include "tensorflow/core/kernels/quantization_utils.h" #include "tensorflow/core/lib/core/errors.h" namespace tensorflow { @@ -48,6 +49,7 @@ class QuantizeDownAndShrinkRangeOp : public OpKernel { Tensor* output_max = nullptr; OP_REQUIRES_OK(ctx, ctx->allocate_output(2, TensorShape({}), &output_max)); + // See QuantizationRangeOp as well, which has a copy of this logic. 
auto input_array = input.flat(); const int32 input_lowest_quantized = static_cast(Eigen::NumTraits::lowest()); @@ -78,9 +80,17 @@ class QuantizeDownAndShrinkRangeOp : public OpKernel { #endif if (input_array.size() > 0) { - RequantizeManyInNewRangeUsingEigen( - ctx->eigen_device(), input, input_min_float, - input_max_float, actual_min_float, actual_max_float, output); + if (meta::IsSupportedAndEnabled() && std::is_same() && + std::is_same()) { + auto input_i32_array = input.flat(); + meta::Requantize(ctx, input_i32_array.data(), input_i32_array.size(), + input_min_float, input_max_float, actual_min_float, + actual_max_float, output->flat().data()); + } else { + RequantizeManyInNewRangeUsingEigen( + ctx->eigen_device(), input, input_min_float, + input_max_float, actual_min_float, actual_max_float, output); + } } output_min->flat().setConstant(actual_min_float); diff --git a/tensorflow/core/kernels/quantize_op.cc b/tensorflow/core/kernels/quantize_op.cc index 003654c1b0f..b8f0dd86425 100644 --- a/tensorflow/core/kernels/quantize_op.cc +++ b/tensorflow/core/kernels/quantize_op.cc @@ -17,11 +17,12 @@ limitations under the License. 
#define EIGEN_USE_THREADS -#include "tensorflow/core/kernels/quantization_utils.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/type_traits.h" #include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/meta_support.h" +#include "tensorflow/core/kernels/quantization_utils.h" #include "tensorflow/core/lib/core/errors.h" namespace { @@ -124,9 +125,15 @@ class QuantizeV2Op : public OpKernel { .template cast(); } } else if (mode_ == QUANTIZE_MODE_MIN_FIRST) { - FloatTensorToQuantizedInPlaceUsingEigen( - ctx->template eigen_device(), input, min_range, max_range, - output); + if (meta::IsSupportedAndEnabled() && std::is_same()) { + auto input_array = input.flat(); + meta::Quantize(ctx, input_array.data(), input_array.size(), min_range, + max_range, output->flat().data()); + } else { + FloatTensorToQuantizedInPlaceUsingEigen( + ctx->template eigen_device(), input, min_range, max_range, + output); + } } Tensor* output_min_tensor = nullptr; diff --git a/tensorflow/core/kernels/quantized_activation_ops.cc b/tensorflow/core/kernels/quantized_activation_ops.cc index ea1cf15f7bb..2896c3d45a7 100644 --- a/tensorflow/core/kernels/quantized_activation_ops.cc +++ b/tensorflow/core/kernels/quantized_activation_ops.cc @@ -16,10 +16,11 @@ limitations under the License. // Implements a quantized version of the Relu6 operation. 
#define EIGEN_USE_THREADS -#include "tensorflow/core/kernels/quantization_utils.h" #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/kernels/meta_support.h" +#include "tensorflow/core/kernels/quantization_utils.h" #include "tensorflow/core/lib/core/errors.h" namespace tensorflow { @@ -37,8 +38,16 @@ class QuantizedReluOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(0, input.shape(), &output)); const T min_as_quantized = FloatToQuantized(0.0f, min_input, max_input); - output->flat().device(context->eigen_cpu_device()) = - input.flat().cwiseMax(min_as_quantized).template cast(); + + if (meta::IsSupportedAndEnabled() && std::is_same()) { + auto input_ui8_array = input.flat(); + meta::Clamp(context, input_ui8_array.data(), input_ui8_array.size(), + min_as_quantized, 255, output->flat().data()); + } else { + output->flat().device(context->eigen_cpu_device()) = + input.flat().cwiseMax(min_as_quantized).template cast(); + } + Tensor* output_min = nullptr; OP_REQUIRES_OK(context, context->allocate_output(1, {}, &output_min)); output_min->flat()(0) = min_input; @@ -63,11 +72,20 @@ class QuantizedRelu6Op : public OpKernel { context->allocate_output(0, input.shape(), &output)); const T min_as_quantized = FloatToQuantized(0.0f, min_input, max_input); const T max_as_quantized = FloatToQuantized(6.0f, min_input, max_input); - output->flat().device(context->eigen_cpu_device()) = - input.flat() - .cwiseMax(min_as_quantized) - .cwiseMin(max_as_quantized) - .template cast(); + + if (meta::IsSupportedAndEnabled() && std::is_same()) { + auto input_ui8_array = input.flat(); + meta::Clamp(context, input_ui8_array.data(), input_ui8_array.size(), + min_as_quantized, max_as_quantized, + output->flat().data()); + } else { + output->flat().device(context->eigen_cpu_device()) = + input.flat() + .cwiseMax(min_as_quantized) + 
.cwiseMin(max_as_quantized) + .template cast(); + } + Tensor* output_min = nullptr; OP_REQUIRES_OK(context, context->allocate_output(1, {}, &output_min)); output_min->flat()(0) = min_input; diff --git a/tensorflow/core/kernels/quantized_bias_add_op.cc b/tensorflow/core/kernels/quantized_bias_add_op.cc index 0b34bfcad83..5457d290c25 100644 --- a/tensorflow/core/kernels/quantized_bias_add_op.cc +++ b/tensorflow/core/kernels/quantized_bias_add_op.cc @@ -15,11 +15,14 @@ limitations under the License. // Implements a quantized eight-bit version of the bias addition operation. -#include "tensorflow/core/kernels/quantization_utils.h" +#define EIGEN_USE_THREADS + #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/kernels/meta_support.h" #include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/kernels/quantization_utils.h" #include "tensorflow/core/lib/core/errors.h" namespace tensorflow { @@ -60,9 +63,23 @@ class QuantizedBiasAddOp : public OpKernel { float total_min; float total_max; - QuantizedAddUsingEigen( - context->template eigen_device(), input, input_min, - input_max, bias, bias_min, bias_max, output, &total_min, &total_max); + + if (meta::IsSupportedAndEnabled() && std::is_same() && + std::is_same() && std::is_same()) { + auto input_ui8_array = input.flat(); + auto bias_ui8_array = bias.flat(); + GetOutputMinAndMaxForQuantizedAdd(input_min, input_max, bias_min, + bias_max, &total_min, &total_max); + meta::QuantizedBiasAdd(context, input_ui8_array.data(), + input_ui8_array.size(), bias_ui8_array.data(), + bias_ui8_array.size(), input_min, input_max, + bias_min, bias_max, total_min, total_max, + output->flat().data()); + } else { + QuantizedAddUsingEigen( + context->template eigen_device(), input, input_min, + input_max, bias, bias_min, bias_max, output, &total_min, &total_max); + } Tensor* output_min = nullptr; 
OP_REQUIRES_OK(context, context->allocate_output(1, {}, &output_min)); diff --git a/tensorflow/core/kernels/quantized_conv_ops.cc b/tensorflow/core/kernels/quantized_conv_ops.cc index fb69d770c0b..2405c55c5b1 100644 --- a/tensorflow/core/kernels/quantized_conv_ops.cc +++ b/tensorflow/core/kernels/quantized_conv_ops.cc @@ -18,12 +18,15 @@ limitations under the License. #include #include +#define EIGEN_USE_THREADS + #include "public/gemmlowp.h" -#include "tensorflow/core/kernels/quantization_utils.h" -#include "tensorflow/core/kernels/reference_gemm.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/kernels/meta_support.h" #include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/kernels/quantization_utils.h" +#include "tensorflow/core/kernels/reference_gemm.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/util/padding.h" @@ -338,12 +341,20 @@ class Im2ColConvFunctor { const int lda = filter_value_count; const int ldb = filter_count; const int ldc = filter_count; - // The gemmlowp optimized library only works for a particular set of data - // types, so check if we meet those requirements and - // fall back to a slower reference implementation if not. 
- if (std::is_same() && std::is_same() && - std::is_same() && (output_offset == 0) && - (output_mult == 1) && (output_shift == 0)) { + + if (meta::IsSupportedAndEnabled() && std::is_same() && + std::is_same() && std::is_same() && + (output_offset == 0) && (output_mult == 1) && (output_shift == 0) && + (transpose_c == false)) { + meta::QuantizedGemm(op_context, transpose_a, transpose_b, + im2col_buffer.get(), filter_data, output_data, m, n, + k, -input_offset, -filter_offset, lda, ldb, ldc); + } else if (std::is_same() && std::is_same() && + std::is_same() && (output_offset == 0) && + (output_mult == 1) && (output_shift == 0)) { + // The gemmlowp optimized library only works for a particular set of data + // types, so check if we meet those requirements and + // fall back to a slower reference implementation if not. const uint8* im2col_data_as_uint8 = &(im2col_buffer.get()->value); const uint8* filter_data_as_uint8 = &(filter_data->value); int32* output_data_as_int32 = &(output_data->value); diff --git a/tensorflow/core/kernels/quantized_matmul_op.cc b/tensorflow/core/kernels/quantized_matmul_op.cc index 0ce9e376423..4abcae0d357 100644 --- a/tensorflow/core/kernels/quantized_matmul_op.cc +++ b/tensorflow/core/kernels/quantized_matmul_op.cc @@ -15,11 +15,14 @@ limitations under the License. // Implements a quantized eight-bit version of the matmul operation. 
+#define EIGEN_USE_THREADS + #include "public/gemmlowp.h" -#include "tensorflow/core/kernels/quantization_utils.h" -#include "tensorflow/core/kernels/reference_gemm.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/kernels/meta_support.h" +#include "tensorflow/core/kernels/quantization_utils.h" +#include "tensorflow/core/kernels/reference_gemm.h" #include "tensorflow/core/lib/core/errors.h" namespace tensorflow { @@ -125,12 +128,20 @@ class QuantizedMatMulOp : public OpKernel { const size_t ldb = b.dim_size(1); const size_t ldc = n; - // The gemmlowp optimized library only works for a particular set of data - // types, so check if we meet those requirements and - // fall back to a slower reference implementation if not. - if (std::is_same() && std::is_same() && - std::is_same() && (offset_c == 0) && (mult_c == 1) && - (shift_c == 0) && (transpose_c == false)) { + if (meta::IsSupportedAndEnabled() && std::is_same() && + std::is_same() && std::is_same() && + (offset_c == 0) && (mult_c == 1) && (shift_c == 0) && + (transpose_c == false)) { + // Gemmlowp/meta code path works on 32 & 64 bit Arm with NEON Simd and + // allows optimized quantized 8bit to 32bit gemm. + meta::QuantizedGemm(context, transpose_a_, transpose_b_, a_data, b_data, + c_data, m, n, k, offset_a, offset_b, lda, ldb, ldc); + } else if (std::is_same() && std::is_same() && + std::is_same() && (offset_c == 0) && + (mult_c == 1) && (shift_c == 0) && (transpose_c == false)) { + // The gemmlowp optimized library only works for a particular set of data + // types, so check if we meet those requirements and fall back to a slower + // reference implementation if not. 
if (transpose_a_) { if (transpose_b_) { GemmlowpMultiply(context, a_data, b_data, c_data, diff --git a/tensorflow/core/kernels/requantization_range_op.cc b/tensorflow/core/kernels/requantization_range_op.cc new file mode 100644 index 00000000000..1aad48763bb --- /dev/null +++ b/tensorflow/core/kernels/requantization_range_op.cc @@ -0,0 +1,80 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// See docs in ../ops/array_ops.cc. 
+ +#define EIGEN_USE_THREADS + +#include + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/kernels/quantization_utils.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/type_traits.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/errors.h" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; + +template +class RequantizationRangeOp : public OpKernel { + public: + explicit RequantizationRangeOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + const Tensor& input = ctx->input(0); + const float input_min_float = ctx->input(1).flat()(0); + const float input_max_float = ctx->input(2).flat()(0); + Tensor* output_min = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &output_min)); + Tensor* output_max = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(1, TensorShape({}), &output_max)); + + // See the deprecated QuantizeDownAndShrinkRangeOp as well, which has a copy + // of this logic. + auto input_array = input.flat(); + const int32 input_lowest_quantized = + static_cast(Eigen::NumTraits::lowest()); + const int32 input_highest_quantized = + static_cast(Eigen::NumTraits::highest()); + T1 actual_min_quantized = input_highest_quantized; + T1 actual_max_quantized = input_lowest_quantized; + for (int i = 0; i < input_array.size(); ++i) { + const T1 value = input_array(i); + actual_min_quantized = std::min(actual_min_quantized, value); + actual_max_quantized = std::max(actual_max_quantized, value); + } + // We want to make sure that the minimum is no larger than zero, so that the + // convolution operation can run efficiently. 
+ const float actual_min_float = + std::min(0.0f, QuantizedToFloat(actual_min_quantized, input_min_float, + input_max_float)); + const float actual_max_float = QuantizedToFloat( + actual_max_quantized, input_min_float, input_max_float); + + output_min->flat().setConstant(actual_min_float); + output_max->flat().setConstant(actual_max_float); + } +}; + +REGISTER_KERNEL_BUILDER(Name("RequantizationRange") + .Device(DEVICE_CPU) + .TypeConstraint("Tinput"), + RequantizationRangeOp); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/requantization_range_op_test.cc b/tensorflow/core/kernels/requantization_range_op_test.cc new file mode 100644 index 00000000000..38dc3af7cca --- /dev/null +++ b/tensorflow/core/kernels/requantization_range_op_test.cc @@ -0,0 +1,66 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { + +class RequantizationRangeTest : public OpsTestBase { + protected: +}; + +// Runs a manually generated array through the operator, and makes sure that the +// results match the expected hand-calculated values. +TEST_F(RequantizationRangeTest, HandCrafted) { + TF_ASSERT_OK(NodeDefBuilder("requantization_range", "RequantizationRange") + .Input(FakeInput(DT_QINT32)) + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(DT_FLOAT)) + .Attr("Tinput", DataTypeToEnum::v()) + .Finalize(node_def())); + TF_ASSERT_OK(InitOp()); + + // For this test we have an input that has the theoretical range of -256.0f to + // +256.0f, but the actual values present only span -1.0f to 1.0f. We expect + // the operator to take advantage of this, and rescale the output to fill up + // the available range in the lower bit depth, and update to the true min and + // max ranges. 
+ const int value_count = 3; + AddInputFromArray(TensorShape({value_count}), + {-(1 << 23), 0, (1 << 23)}); + AddInputFromArray(TensorShape({1}), {-256.0f}); + AddInputFromArray(TensorShape({1}), {256.0f}); + TF_ASSERT_OK(RunOpKernel()); + Tensor expected_min(allocator(), DT_FLOAT, TensorShape({})); + test::FillValues(&expected_min, {-1.0f}); + test::ExpectTensorEqual(expected_min, *GetOutput(0)); + Tensor expected_max(allocator(), DT_FLOAT, TensorShape({})); + test::FillValues(&expected_max, {1.0f}); + test::ExpectTensorEqual(expected_max, *GetOutput(1)); +} + +} // end namespace tensorflow diff --git a/tensorflow/core/kernels/requantize.cc b/tensorflow/core/kernels/requantize.cc index 865970a99e0..fc8af1799d5 100644 --- a/tensorflow/core/kernels/requantize.cc +++ b/tensorflow/core/kernels/requantize.cc @@ -55,9 +55,10 @@ class RequantizeOp : public OpKernel { errors::InvalidArgument("requested_output_min must be <= 0, but got ", requested_output_min_float)); OP_REQUIRES( - ctx, requested_output_max_float >= 0.0f, - errors::InvalidArgument("requested_output_max must be <= 0, but got ", - requested_output_max_float)); + ctx, requested_output_max_float >= requested_output_min_float, + errors::InvalidArgument( + "requested_output_max must be >= requested_output_min, but got ", + requested_output_max_float, " and ", requested_output_min_float)); auto input_array = input.flat(); diff --git a/tensorflow/core/kernels/requantize_op_test.cc b/tensorflow/core/kernels/requantize_op_test.cc index e7674eb2946..44cacf890b6 100644 --- a/tensorflow/core/kernels/requantize_op_test.cc +++ b/tensorflow/core/kernels/requantize_op_test.cc @@ -88,10 +88,12 @@ TEST_F(RequantizeTest, InvalidOutputMax) { {-(1 << 23), 0, (1 << 23)}); AddInputFromArray(TensorShape({1}), {-256.0f}); AddInputFromArray(TensorShape({1}), {256.0f}); - AddInputFromArray(TensorShape({1}), {-1.0f}); - AddInputFromArray(TensorShape({1}), {-0.001f}); - EXPECT_EQ("requested_output_max must be <= 0, but got -0.001", - 
RunOpKernel().error_message()); + AddInputFromArray(TensorShape({1}), {-10.0f}); + AddInputFromArray(TensorShape({1}), {-11.0f}); + EXPECT_EQ( + "requested_output_max must be >= requested_output_min, but got -11 and " + "-10", + RunOpKernel().error_message()); } } // end namespace tensorflow diff --git a/tensorflow/core/kernels/sdca_ops.cc b/tensorflow/core/kernels/sdca_ops.cc index 63e705df438..d30e7486f51 100644 --- a/tensorflow/core/kernels/sdca_ops.cc +++ b/tensorflow/core/kernels/sdca_ops.cc @@ -167,7 +167,7 @@ class Example { // A dense vector which is a row-slice of the underlying matrix. struct DenseVector { // Returns a row slice from the matrix. - Eigen::TensorMap> row() + Eigen::TensorMap> Row() const { return Eigen::TensorMap>( data_matrix.data() + row_index * data_matrix.dimension(1), @@ -176,7 +176,7 @@ class Example { // Returns a row slice as a 1 * F matrix, where F is the number of features. Eigen::TensorMap> - row_as_matrix() const { + RowAsMatrix() const { return Eigen::TensorMap>( data_matrix.data() + row_index * data_matrix.dimension(1), 1, data_matrix.dimension(1)); @@ -228,18 +228,26 @@ class FeatureWeightsDenseStorage { const Eigen::ThreadPoolDevice& device, const Example::DenseVector& dense_vector, const std::vector& normalized_bounded_dual_delta) { - // Transform the dual vector into a column matrix. - const Eigen::TensorMap> - dual_matrix(normalized_bounded_dual_delta.data(), - normalized_bounded_dual_delta.size(), 1); - const Eigen::array, 1> product_dims = { - Eigen::IndexPair(1, 0)}; - // This essentially computes delta_w += delta_vector / \lamdba * N. 
- deltas_.device(device) = - (deltas_.cast() + - dual_matrix.contract(dense_vector.row_as_matrix().cast(), - product_dims)) - .cast(); + const size_t num_weight_vectors = normalized_bounded_dual_delta.size(); + if (num_weight_vectors == 1) { + deltas_.device(device) = + deltas_ + + dense_vector.RowAsMatrix() * + deltas_.constant(normalized_bounded_dual_delta[0]); + } else { + // Transform the dual vector into a column matrix. + const Eigen::TensorMap> + dual_matrix(normalized_bounded_dual_delta.data(), num_weight_vectors, + 1); + const Eigen::array, 1> product_dims = { + Eigen::IndexPair(1, 0)}; + // This essentially computes delta_w += delta_vector / \lamdba * N. + deltas_.device(device) = + (deltas_.cast() + + dual_matrix.contract(dense_vector.RowAsMatrix().cast(), + product_dims)) + .cast(); + } } private: @@ -456,19 +464,37 @@ const ExampleStatistics Example::ComputeWxAndWeightedExampleNorm( dense_weights.nominals() + dense_weights.deltas() * dense_weights.deltas().constant(num_loss_partitions); - const Eigen::array, 1> product_dims = { - Eigen::IndexPair(1, 1)}; - const Eigen::Tensor prev_prediction = - regularization.EigenShrinkMatrix(dense_weights.nominals()) - .contract(dense_vector.row_as_matrix(), product_dims); - const Eigen::Tensor prediction = - regularization.EigenShrinkMatrix(feature_weights) - .contract(dense_vector.row_as_matrix(), product_dims); - // The result of "tensor contraction" (multiplication) in the code - // above is of dimension num_weight_vectors * 1. 
- for (int l = 0; l < num_weight_vectors; ++l) { - result.prev_wx[l] += prev_prediction(l, 0); - result.wx[l] += prediction(l, 0); + if (num_weight_vectors == 1) { + const Eigen::Tensor prev_prediction = + (dense_vector.Row() * + regularization.EigenShrinkVector( + Eigen::TensorMap>( + dense_weights.nominals().data(), + dense_weights.nominals().dimension(1)))) + .sum(); + const Eigen::Tensor prediction = + (dense_vector.Row() * + regularization.EigenShrinkVector( + Eigen::TensorMap>( + feature_weights.data(), feature_weights.dimension(1)))) + .sum(); + result.prev_wx[0] += prev_prediction(); + result.wx[0] += prediction(); + } else { + const Eigen::array, 1> product_dims = { + Eigen::IndexPair(1, 1)}; + const Eigen::Tensor prev_prediction = + regularization.EigenShrinkMatrix(dense_weights.nominals()) + .contract(dense_vector.RowAsMatrix(), product_dims); + const Eigen::Tensor prediction = + regularization.EigenShrinkMatrix(feature_weights) + .contract(dense_vector.RowAsMatrix(), product_dims); + // The result of "tensor contraction" (multiplication) in the code + // above is of dimension num_weight_vectors * 1. 
+ for (int l = 0; l < num_weight_vectors; ++l) { + result.prev_wx[l] += prev_prediction(l, 0); + result.wx[l] += prediction(l, 0); + } } } @@ -824,7 +850,7 @@ void Examples::ComputeSquaredNormPerExample( } for (int j = 0; j < num_dense_features; ++j) { const Eigen::Tensor sn = - example->dense_vectors_[j]->row().square().sum(); + example->dense_vectors_[j]->Row().square().sum(); squared_norm += sn(); } example->squared_norm_ = squared_norm; diff --git a/tensorflow/core/kernels/sdca_ops_test.cc b/tensorflow/core/kernels/sdca_ops_test.cc index 9ddbd817e19..400f330ce7b 100644 --- a/tensorflow/core/kernels/sdca_ops_test.cc +++ b/tensorflow/core/kernels/sdca_ops_test.cc @@ -232,6 +232,17 @@ void BM_SDCA(const int iters, const int num_examples) { test::Benchmark("cpu", train, GetSingleThreadedOptions(), init).Run(iters); } +void BM_SDCA_LARGE_DENSE(const int iters, const int num_examples) { + testing::StopTiming(); + Graph* init = nullptr; + Graph* train = nullptr; + GetGraphs(num_examples, 0 /* sparse feature groups */, + 0 /* sparse features per group */, 5 /* dense feature groups*/, + 200000 /* dense features per group */, &init, &train); + testing::StartTiming(); + test::Benchmark("cpu", train, GetSingleThreadedOptions(), init).Run(iters); +} + void BM_SDCA_LARGE_SPARSE(const int iters, const int num_examples) { testing::StopTiming(); Graph* init = nullptr; @@ -242,10 +253,10 @@ void BM_SDCA_LARGE_SPARSE(const int iters, const int num_examples) { testing::StartTiming(); test::Benchmark("cpu", train, GetMultiThreadedOptions(), init).Run(iters); } - } // namespace BENCHMARK(BM_SDCA)->Arg(128)->Arg(256)->Arg(512)->Arg(1024); +BENCHMARK(BM_SDCA_LARGE_DENSE)->Arg(128)->Arg(256)->Arg(512)->Arg(1024); BENCHMARK(BM_SDCA_LARGE_SPARSE)->Arg(128)->Arg(256)->Arg(512)->Arg(1024); } // namespace tensorflow diff --git a/tensorflow/core/kernels/softmax_op.cc b/tensorflow/core/kernels/softmax_op.cc index 8ec8409e21d..c7ae93852f8 100644 --- a/tensorflow/core/kernels/softmax_op.cc +++ 
b/tensorflow/core/kernels/softmax_op.cc @@ -65,6 +65,9 @@ REGISTER_KERNEL_BUILDER( REGISTER_KERNEL_BUILDER( Name("Softmax").Device(DEVICE_GPU).TypeConstraint("T"), SoftmaxOp); +REGISTER_KERNEL_BUILDER( + Name("Softmax").Device(DEVICE_GPU).TypeConstraint("T"), + SoftmaxOp); REGISTER_KERNEL_BUILDER( Name("LogSoftmax").Device(DEVICE_GPU).TypeConstraint("T"), SoftmaxOp); diff --git a/tensorflow/core/kernels/softmax_op_gpu.cu.cc b/tensorflow/core/kernels/softmax_op_gpu.cu.cc index 8c26a66a3c3..3f7dd383c60 100644 --- a/tensorflow/core/kernels/softmax_op_gpu.cu.cc +++ b/tensorflow/core/kernels/softmax_op_gpu.cu.cc @@ -41,6 +41,7 @@ struct SoftmaxFunctor { // Instantiate the GPU implementation for float. template struct functor::SoftmaxFunctor; template struct functor::SoftmaxFunctor; +template struct functor::SoftmaxFunctor; } // end namespace tensorflow diff --git a/tensorflow/core/kernels/sparse_matmul_op.cc b/tensorflow/core/kernels/sparse_matmul_op.cc index cf17efaf01e..e5b0b6fcd21 100644 --- a/tensorflow/core/kernels/sparse_matmul_op.cc +++ b/tensorflow/core/kernels/sparse_matmul_op.cc @@ -26,6 +26,7 @@ limitations under the License. #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/fill_functor.h" #include "tensorflow/core/lib/core/blocking_counter.h" #include "tensorflow/core/lib/core/threadpool.h" #include "tensorflow/core/lib/gtl/stl_util.h" @@ -852,6 +853,15 @@ class SparseMatMulOp : public OpKernel { b.shape().DebugString())); Tensor* output = nullptr; OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({m, n}), &output)); + + if (k == 0) { + // If the inner dimension k in the matrix multiplication is zero, we fill + // the output with zeros. 
+ functor::SetZeroFunctor f; + f(ctx->eigen_device(), output->flat()); + return; + } + auto out = output->matrix(); std::unique_ptr a_float; diff --git a/tensorflow/core/kernels/strided_slice_op.cc b/tensorflow/core/kernels/strided_slice_op.cc index c6c4f191b86..6cbcbf9fd95 100644 --- a/tensorflow/core/kernels/strided_slice_op.cc +++ b/tensorflow/core/kernels/strided_slice_op.cc @@ -295,21 +295,16 @@ class StridedSliceAssignOp : public OpKernel { // 0-dimensional case implies the left and right are exactly the same // scalar shape - if (processing_shape.dims() == 0) { - functor::DenseUpdate copy; - copy(context->eigen_device(), old_lhs.flat(), - input.flat()); - return; - } // Handle general dimensions -#define HANDLE_DIM(NDIM) \ - if (processing_dims == NDIM) { \ - HandleStridedSliceAssignCase(context, begin, end, \ - strides, processing_shape, \ - is_simple_slice, &old_lhs); \ - return; \ +#define HANDLE_DIM(NDIM) \ + if (processing_dims == NDIM) { \ + HandleStridedSliceAssignCase()( \ + context, begin, end, strides, processing_shape, is_simple_slice, \ + &old_lhs); \ + return; \ } + HANDLE_DIM(0); HANDLE_DIM(1); HANDLE_DIM(2); HANDLE_DIM(3); @@ -377,7 +372,15 @@ REGISTER_STRIDED_SLICE(bfloat16); .HostMemory("end") \ .HostMemory("strides") \ .TypeConstraint("Index"), \ - StridedSliceGradOp) + StridedSliceGradOp) \ + REGISTER_KERNEL_BUILDER(Name("StridedSliceAssign") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .HostMemory("begin") \ + .HostMemory("end") \ + .HostMemory("strides") \ + .TypeConstraint("Index"), \ + StridedSliceAssignOp) TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU); @@ -405,7 +408,15 @@ REGISTER_KERNEL_BUILDER(Name("StridedSliceGrad") .HostMemory("dy") .HostMemory("output"), StridedSliceGradOp); - +REGISTER_KERNEL_BUILDER(Name("StridedSliceAssign") + .Device(DEVICE_GPU) + .TypeConstraint("T") + .TypeConstraint("Index") + .HostMemory("ref") + .HostMemory("begin") + .HostMemory("end") + .HostMemory("strides"), + StridedSliceAssignOp) #undef 
REGISTER_GPU #endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/strided_slice_op.h b/tensorflow/core/kernels/strided_slice_op.h index 098f5379d5f..13128e67fb6 100644 --- a/tensorflow/core/kernels/strided_slice_op.h +++ b/tensorflow/core/kernels/strided_slice_op.h @@ -116,6 +116,14 @@ struct StridedSliceAssign { } }; +template +struct StridedSliceAssignScalar { + void operator()(const Device& d, typename TTypes::Tensor output, + typename TTypes::ConstTensor input) { + output.device(d) = input; + } +}; + } // namespace functor } // namespace tensorflow diff --git a/tensorflow/core/kernels/strided_slice_op_gpu.cu.cc b/tensorflow/core/kernels/strided_slice_op_gpu.cu.cc index 75b4b324190..e8f75cf38d0 100644 --- a/tensorflow/core/kernels/strided_slice_op_gpu.cu.cc +++ b/tensorflow/core/kernels/strided_slice_op_gpu.cu.cc @@ -45,7 +45,8 @@ typedef Eigen::GpuDevice GPUDevice; template struct functor::StridedSliceAssign; \ template struct functor::StridedSliceAssign; \ template struct functor::StridedSliceAssign; \ - template struct functor::StridedSliceAssign; + template struct functor::StridedSliceAssign; \ + template struct functor::StridedSliceAssignScalar; TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS); DEFINE_GPU_KERNELS(int32); diff --git a/tensorflow/core/kernels/strided_slice_op_impl.h b/tensorflow/core/kernels/strided_slice_op_impl.h index b1b5d2df3eb..e89d1920b9c 100644 --- a/tensorflow/core/kernels/strided_slice_op_impl.h +++ b/tensorflow/core/kernels/strided_slice_op_impl.h @@ -27,6 +27,7 @@ limitations under the License. 
#include "tensorflow/core/framework/register_types_traits.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/kernels/bounds_check.h" +#include "tensorflow/core/kernels/dense_update_ops.h" #include "tensorflow/core/kernels/ops_util.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/gtl/array_slice.h" @@ -51,12 +52,14 @@ void HandleStridedSliceGradCase(OpKernelContext* context, bool is_simple_slice, Tensor* result); template -void HandleStridedSliceAssignCase(OpKernelContext* context, - const gtl::ArraySlice& begin, - const gtl::ArraySlice& end, - const gtl::ArraySlice& strides, - const TensorShape& processing_shape, - bool is_simple_slice, Tensor* result); +class HandleStridedSliceAssignCase { + public: + void operator()(OpKernelContext* context, const gtl::ArraySlice& begin, + const gtl::ArraySlice& end, + const gtl::ArraySlice& strides, + const TensorShape& processing_shape, bool is_simple_slice, + Tensor* result); +}; } // namespace tensorflow // The actual implementation. 
This is designed so multiple @@ -134,12 +137,10 @@ void HandleStridedSliceGradCase(OpKernelContext* context, } template -void HandleStridedSliceAssignCase(OpKernelContext* context, - const gtl::ArraySlice& begin, - const gtl::ArraySlice& end, - const gtl::ArraySlice& strides, - const TensorShape& processing_shape, - bool is_simple_slice, Tensor* result) { +void HandleStridedSliceAssignCase::operator()( + OpKernelContext* context, const gtl::ArraySlice& begin, + const gtl::ArraySlice& end, const gtl::ArraySlice& strides, + const TensorShape& processing_shape, bool is_simple_slice, Tensor* result) { gtl::InlinedVector processing_dims = processing_shape.dim_sizes(); typedef typename proxy_type::type Proxy; Eigen::DSizes begin_di; @@ -156,14 +157,34 @@ void HandleStridedSliceAssignCase(OpKernelContext* context, begin_di, end_di, strides_di); } +template +class HandleStridedSliceAssignCase { + public: + enum { NDIM_PROXY = 1 }; + void operator()(OpKernelContext* context, const gtl::ArraySlice& begin, + const gtl::ArraySlice& end, + const gtl::ArraySlice& strides, + const TensorShape& processing_shape, bool is_simple_slice, + Tensor* result) { + gtl::InlinedVector processing_dims(1); + processing_dims[0] = 1; + + typedef typename proxy_type::type Proxy; + functor::StridedSliceAssignScalar()( + context->eigen_device(), + result->bit_casted_shaped(processing_dims), + context->input(4).bit_casted_shaped(processing_dims)); + } +}; + // NODE(aselle): according to bsteiner, we need this because otherwise // nvcc instantiates templates that are invalid. strided_slice_op_gpu.cu // handles instantiates externally. It is important that this is done# // before the HandleXXCase's are instantiated to avoid duplicate // specialization errors. 
-#if GOOGLE_CUDA -#define PREVENT_INSTANTIATE(T, NDIM) \ + +#define PREVENT_INSTANTIATE_DIM1_AND_UP(T, NDIM) \ namespace functor { \ template <> \ void StridedSlice::operator()( \ @@ -197,12 +218,28 @@ void HandleStridedSliceAssignCase(OpKernelContext* context, const Eigen::DSizes& strides); \ extern template struct StridedSliceAssign; \ } // namespace functor +#define PREVENT_INSTANTIATE_DIM0_ONLY(T, NDIM) \ + namespace functor { \ + template <> \ + void StridedSliceAssignScalar::operator()( \ + const GPUDevice& d, typename TTypes::Tensor output, \ + typename TTypes::ConstTensor input); \ + extern template struct StridedSliceAssignScalar; \ + } // namespace functor +// Dimension 0 only instantiates some functors. So we only need +// to prevent ones defined by PREVENT_INSTANTIATE_DIM0_ONLY +#if GOOGLE_CUDA +#if STRIDED_SLICE_INSTANTIATE_DIM == 0 +#define PREVENT_INSTANTIATE(T, NDIM) PREVENT_INSTANTIATE_DIM0_ONLY(T, NDIM) +#else +#define PREVENT_INSTANTIATE(T, NDIM) PREVENT_INSTANTIATE_DIM1_AND_UP(T, NDIM) +#endif #else #define PREVENT_INSTANTIATE(T, NDIM) #endif -#define INSTANTIATE(DEVICE, T, DIM) \ +#define INSTANTIATE_DIM1_AND_UP_HANDLERS(DEVICE, T, DIM) \ template void HandleStridedSliceCase( \ OpKernelContext * context, const gtl::ArraySlice& begin, \ const gtl::ArraySlice& end, \ @@ -210,18 +247,25 @@ void HandleStridedSliceAssignCase(OpKernelContext* context, const TensorShape& processing_shape, bool is_simple_slice, \ Tensor* result); \ template void HandleStridedSliceGradCase( \ - OpKernelContext * context, const gtl::ArraySlice& begin, \ - const gtl::ArraySlice& end, \ - const gtl::ArraySlice& strides, \ - const TensorShape& processing_shape, bool is_simple_slice, \ - Tensor* result); \ - template void HandleStridedSliceAssignCase( \ OpKernelContext * context, const gtl::ArraySlice& begin, \ const gtl::ArraySlice& end, \ const gtl::ArraySlice& strides, \ const TensorShape& processing_shape, bool is_simple_slice, \ Tensor* result); +#define 
INSTANTIATE_DIM0_AND_UP_HANDLERS(DEVICE, T, DIM) \ + template class HandleStridedSliceAssignCase; + +// Only some kernels need to be instantiated on dim 0. +#if STRIDED_SLICE_INSTANTIATE_DIM == 0 +#define INSTANTIATE(DEVICE, T, DIM) \ + INSTANTIATE_DIM0_AND_UP_HANDLERS(DEVICE, T, DIM) +#else +#define INSTANTIATE(DEVICE, T, DIM) \ + INSTANTIATE_DIM0_AND_UP_HANDLERS(DEVICE, T, DIM) \ + INSTANTIATE_DIM1_AND_UP_HANDLERS(DEVICE, T, DIM) +#endif + #define DECLARE_FOR_N_CPU(T) \ INSTANTIATE(CPUDevice, T, STRIDED_SLICE_INSTANTIATE_DIM) diff --git a/tensorflow/core/kernels/strided_slice_op_inst_0.cc b/tensorflow/core/kernels/strided_slice_op_inst_0.cc new file mode 100644 index 00000000000..48b52442d65 --- /dev/null +++ b/tensorflow/core/kernels/strided_slice_op_inst_0.cc @@ -0,0 +1,23 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#define EIGEN_USE_THREADS +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#endif + +#define STRIDED_SLICE_INSTANTIATE_DIM 0 +#include "tensorflow/core/kernels/strided_slice_op_impl.h" +#undef STRIDED_SLICE_INSTANTIATE_DIM diff --git a/tensorflow/core/lib/core/status.h b/tensorflow/core/lib/core/status.h index 814f76cb938..734ea91c80f 100644 --- a/tensorflow/core/lib/core/status.h +++ b/tensorflow/core/lib/core/status.h @@ -110,7 +110,7 @@ typedef std::function StatusCallback; // DEBUG only version of TF_CHECK_OK. 
Compiler still parses 'val' even in opt // mode. -#ifdef NDEBUG +#ifndef NDEBUG #define TF_DCHECK_OK(val) TF_CHECK_OK(val) #else #define TF_DCHECK_OK(val) \ diff --git a/tensorflow/core/lib/gtl/flatmap.h b/tensorflow/core/lib/gtl/flatmap.h new file mode 100644 index 00000000000..c66bc47168a --- /dev/null +++ b/tensorflow/core/lib/gtl/flatmap.h @@ -0,0 +1,349 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef THIRD_PARTY_TENSORFLOW_CORE_LIB_GTL_FLATMAP_H_ +#define THIRD_PARTY_TENSORFLOW_CORE_LIB_GTL_FLATMAP_H_ + +#include +#include +#include "tensorflow/core/lib/gtl/flatrep.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace gtl { + +// FlatMap provides a map from K to V. +// +// The map is implemented using an open-addressed hash table. A +// single array holds entire map contents and collisions are resolved +// by probing at a sequence of locations in the array. +template > +class FlatMap { + private: + // Forward declare some internal types needed in public section. 
+ struct Bucket; + + public: + typedef Key key_type; + typedef Val mapped_type; + typedef Hash hasher; + typedef Eq key_equal; + typedef size_t size_type; + typedef ptrdiff_t difference_type; + + // We cannot use std::pair<> since internal representation stores + // keys and values in separate arrays, so we make a custom struct + // that holds references to the internal key, value elements. + struct value_type { + typedef Key first_type; + typedef Val second_type; + + const Key& first; + Val& second; + value_type(const Key& k, Val& v) : first(k), second(v) {} + }; + typedef value_type* pointer; + typedef const value_type* const_pointer; + typedef value_type& reference; + typedef const value_type& const_reference; + + FlatMap() : FlatMap(1) {} + + explicit FlatMap(size_t N, const Hash& hf = Hash(), const Eq& eq = Eq()) + : rep_(N, hf, eq) {} + + FlatMap(const FlatMap& src) : rep_(src.rep_) {} + + template + FlatMap(InputIter first, InputIter last, size_t N = 1, + const Hash& hf = Hash(), const Eq& eq = Eq()) + : FlatMap(N, hf, eq) { + insert(first, last); + } + + FlatMap& operator=(const FlatMap& src) { + rep_.CopyFrom(src.rep_); + return *this; + } + + ~FlatMap() {} + + void swap(FlatMap& x) { rep_.swap(x.rep_); } + void clear_no_resize() { rep_.clear_no_resize(); } + void clear() { rep_.clear(); } + void reserve(size_t N) { rep_.Resize(std::max(N, size())); } + void rehash(size_t N) { rep_.Resize(std::max(N, size())); } + void resize(size_t N) { rep_.Resize(std::max(N, size())); } + size_t size() const { return rep_.size(); } + bool empty() const { return size() == 0; } + size_t bucket_count() const { return rep_.bucket_count(); } + hasher hash_function() const { return rep_.hash_function(); } + key_equal key_eq() const { return rep_.key_eq(); } + + class iterator { + public: + iterator() : b_(nullptr), end_(nullptr), i_(0) {} + + // Make iterator pointing at first element at or after b. 
+ explicit iterator(Bucket* b, Bucket* end) : b_(b), end_(end), i_(0) { + SkipUnused(); + } + + // Make iterator pointing exactly at ith element in b, which must exist. + iterator(Bucket* b, Bucket* end, uint32 i) : b_(b), end_(end), i_(i) { + FillValue(); + } + + value_type& operator*() { return *val(); } + value_type* operator->() { return val(); } + bool operator==(const iterator& x) const { + return b_ == x.b_ && i_ == x.i_; + } + bool operator!=(const iterator& x) const { return !(*this == x); } + iterator& operator++() { + DCHECK(b_ != end_); + i_++; + SkipUnused(); + return *this; + } + + private: + friend class FlatMap; + Bucket* b_; + Bucket* end_; + uint32 i_; + char space_[sizeof(value_type)]; + + value_type* val() { return reinterpret_cast(space_); } + void FillValue() { new (space_) value_type(b_->key(i_), b_->val(i_)); } + void SkipUnused() { + while (b_ < end_) { + if (i_ >= Rep::kWidth) { + i_ = 0; + b_++; + } else if (b_->marker[i_] < 2) { + i_++; + } else { + FillValue(); + break; + } + } + } + }; + + class const_iterator { + private: + mutable iterator rep_; // Share state and logic with non-const iterator. 
+ public: + const_iterator() : rep_() {} + explicit const_iterator(Bucket* start, Bucket* end) : rep_(start, end) {} + const_iterator(Bucket* b, Bucket* end, uint32 i) : rep_(b, end, i) {} + + const value_type& operator*() const { return *rep_.val(); } + const value_type* operator->() const { return rep_.val(); } + bool operator==(const const_iterator& x) const { return rep_ == x.rep_; } + bool operator!=(const const_iterator& x) const { return rep_ != x.rep_; } + const_iterator& operator++() { + ++rep_; + return *this; + } + }; + + iterator begin() { return iterator(rep_.start(), rep_.limit()); } + iterator end() { return iterator(rep_.limit(), rep_.limit()); } + const_iterator begin() const { + return const_iterator(rep_.start(), rep_.limit()); + } + const_iterator end() const { + return const_iterator(rep_.limit(), rep_.limit()); + } + + size_t count(const Key& k) const { return rep_.Find(k).found ? 1 : 0; } + iterator find(const Key& k) { + auto r = rep_.Find(k); + return r.found ? iterator(r.b, rep_.limit(), r.index) : end(); + } + const_iterator find(const Key& k) const { + auto r = rep_.Find(k); + return r.found ? const_iterator(r.b, rep_.limit(), r.index) : end(); + } + + Val& at(const Key& k) { + auto r = rep_.Find(k); + DCHECK(r.found); + return r.b->val(r.index); + } + const Val& at(const Key& k) const { + auto r = rep_.Find(k); + DCHECK(r.found); + return r.b->val(r.index); + } + + template + std::pair insert(const P& p) { + return Insert(p.first, p.second); + } + std::pair insert(const std::pair& p) { + return Insert(p.first, p.second); + } + template + void insert(InputIter first, InputIter last) { + for (; first != last; ++first) { + insert(*first); + } + } + + Val& operator[](const Key& k) { return IndexOp(k); } + Val& operator[](Key&& k) { return IndexOp(std::forward(k)); } + + template + std::pair emplace(Args&&... 
args) { + return InsertPair(std::make_pair(std::forward(args)...)); + } + + size_t erase(const Key& k) { + auto r = rep_.Find(k); + if (!r.found) return 0; + rep_.Erase(r.b, r.index); + return 1; + } + iterator erase(iterator pos) { + rep_.Erase(pos.b_, pos.i_); + ++pos; + return pos; + } + iterator erase(iterator pos, iterator last) { + for (; pos != last; ++pos) { + rep_.Erase(pos.b_, pos.i_); + } + return pos; + } + + std::pair equal_range(const Key& k) { + auto pos = find(k); + if (pos == end()) { + return std::make_pair(pos, pos); + } else { + auto next = pos; + ++next; + return std::make_pair(pos, next); + } + } + std::pair equal_range(const Key& k) const { + auto pos = find(k); + if (pos == end()) { + return std::make_pair(pos, pos); + } else { + auto next = pos; + ++next; + return std::make_pair(pos, next); + } + } + + bool operator==(const FlatMap& x) const { + if (size() != x.size()) return false; + for (auto& p : x) { + auto i = find(p.first); + if (i == end()) return false; + if (i->second != p.second) return false; + } + return true; + } + bool operator!=(const FlatMap& x) const { return !(*this == x); } + + // If key exists in the table, prefetch the associated value. This + // is a hint, and may have no effect. + void prefetch_value(const Key& key) const { rep_.Prefetch(key); } + + private: + using Rep = internal::FlatRep; + + // Bucket stores kWidth triples. + // The data is organized as three parallel arrays to reduce padding. + struct Bucket { + uint8 marker[Rep::kWidth]; + + // Wrap keys and values in union to control construction and destruction. 
+ union Storage { + struct { + Key key[Rep::kWidth]; + Val val[Rep::kWidth]; + }; + Storage() {} + ~Storage() {} + } storage; + + Key& key(uint32 i) { + DCHECK_GE(marker[i], 2); + return storage.key[i]; + } + Val& val(uint32 i) { + DCHECK_GE(marker[i], 2); + return storage.val[i]; + } + template + void InitVal(uint32 i, V&& v) { + new (&storage.val[i]) Val(std::forward(v)); + } + void Destroy(uint32 i) { + storage.key[i].Key::~Key(); + storage.val[i].Val::~Val(); + } + void MoveFrom(uint32 i, Bucket* src, uint32 src_index) { + new (&storage.key[i]) Key(std::move(src->storage.key[src_index])); + new (&storage.val[i]) Val(std::move(src->storage.val[src_index])); + } + void CopyFrom(uint32 i, Bucket* src, uint32 src_index) { + new (&storage.key[i]) Key(src->storage.key[src_index]); + new (&storage.val[i]) Val(src->storage.val[src_index]); + } + }; + + template + std::pair InsertPair(Pair&& p) { + return Insert(std::forward(p.first), + std::forward(p.second)); + } + + template + std::pair Insert(K&& k, V&& v) { + rep_.MaybeResize(); + auto r = rep_.FindOrInsert(std::forward(k)); + const bool inserted = !r.found; + if (inserted) { + r.b->InitVal(r.index, std::forward(v)); + } + return {iterator(r.b, rep_.limit(), r.index), inserted}; + } + + template + Val& IndexOp(K&& k) { + rep_.MaybeResize(); + auto r = rep_.FindOrInsert(std::forward(k)); + Val* vptr = &r.b->val(r.index); + if (!r.found) { + new (vptr) Val(); // Initialize value in new slot. + } + return *vptr; + } + + Rep rep_; +}; + +} // namespace gtl +} // namespace tensorflow + +#endif // THIRD_PARTY_TENSORFLOW_CORE_LIB_GTL_FLATMAP_H_ diff --git a/tensorflow/core/lib/gtl/flatmap_test.cc b/tensorflow/core/lib/gtl/flatmap_test.cc new file mode 100644 index 00000000000..2fa610b7e12 --- /dev/null +++ b/tensorflow/core/lib/gtl/flatmap_test.cc @@ -0,0 +1,576 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/lib/gtl/flatmap.h" + +#include +#include +#include +#include "tensorflow/core/lib/hash/hash.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace gtl { +namespace { + +typedef FlatMap NumMap; + +// If map has an entry for k, return the corresponding value, else return def. +int32 Get(const NumMap& map, int64 k, int32 def = -1) { + auto iter = map.find(k); + if (iter == map.end()) { + EXPECT_EQ(map.count(k), 0); + return def; + } else { + EXPECT_EQ(map.count(k), 1); + EXPECT_EQ(&map.at(k), &iter->second); + EXPECT_EQ(iter->first, k); + return iter->second; + } +} + +// Return contents of map as a sorted list of pairs. +typedef std::vector> NumMapContents; +NumMapContents Contents(const NumMap& map) { + NumMapContents result; + for (const auto& p : map) { + result.push_back({p.first, p.second}); + } + std::sort(result.begin(), result.end()); + return result; +} + +// Fill entries with keys [start,limit). 
+void Fill(NumMap* map, int64 start, int64 limit) { + for (int64 i = start; i < limit; i++) { + map->insert({i, i * 100}); + } +} + +TEST(FlatMapTest, Find) { + NumMap map; + EXPECT_EQ(Get(map, 1), -1); + map.insert({1, 100}); + map.insert({2, 200}); + EXPECT_EQ(Get(map, 1), 100); + EXPECT_EQ(Get(map, 2), 200); + EXPECT_EQ(Get(map, 3), -1); +} + +TEST(FlatMapTest, Insert) { + NumMap map; + EXPECT_EQ(Get(map, 1), -1); + + // New entry. + auto result = map.insert({1, 100}); + EXPECT_TRUE(result.second); + EXPECT_EQ(result.first->first, 1); + EXPECT_EQ(result.first->second, 100); + EXPECT_EQ(Get(map, 1), 100); + + // Attempt to insert over existing entry. + result = map.insert({1, 200}); + EXPECT_FALSE(result.second); + EXPECT_EQ(result.first->first, 1); + EXPECT_EQ(result.first->second, 100); + EXPECT_EQ(Get(map, 1), 100); + + // Overwrite through iterator. + result.first->second = 300; + EXPECT_EQ(result.first->second, 300); + EXPECT_EQ(Get(map, 1), 300); + + // Should get updated value. + result = map.insert({1, 400}); + EXPECT_FALSE(result.second); + EXPECT_EQ(result.first->first, 1); + EXPECT_EQ(result.first->second, 300); + EXPECT_EQ(Get(map, 1), 300); +} + +TEST(FlatMapTest, InsertGrowth) { + NumMap map; + const int n = 100; + Fill(&map, 0, 100); + EXPECT_EQ(map.size(), n); + for (int i = 0; i < n; i++) { + EXPECT_EQ(Get(map, i), i * 100) << i; + } +} + +TEST(FlatMapTest, Emplace) { + NumMap map; + + // New entry. + auto result = map.emplace(1, 100); + EXPECT_TRUE(result.second); + EXPECT_EQ(result.first->first, 1); + EXPECT_EQ(result.first->second, 100); + EXPECT_EQ(Get(map, 1), 100); + + // Attempt to insert over existing entry. + result = map.emplace(1, 200); + EXPECT_FALSE(result.second); + EXPECT_EQ(result.first->first, 1); + EXPECT_EQ(result.first->second, 100); + EXPECT_EQ(Get(map, 1), 100); + + // Overwrite through iterator. 
+ result.first->second = 300; + EXPECT_EQ(result.first->second, 300); + EXPECT_EQ(Get(map, 1), 300); + + // Update a second value + result = map.emplace(2, 400); + EXPECT_TRUE(result.second); + EXPECT_EQ(result.first->first, 2); + EXPECT_EQ(result.first->second, 400); + EXPECT_EQ(Get(map, 2), 400); +} + +TEST(FlatMapTest, EmplaceUniquePtr) { + FlatMap, HashInt64> smap; + smap.emplace(1, std::unique_ptr(new string("hello"))); +} + +TEST(FlatMapTest, Size) { + NumMap map; + EXPECT_EQ(map.size(), 0); + + map.insert({1, 100}); + map.insert({2, 200}); + EXPECT_EQ(map.size(), 2); +} + +TEST(FlatMapTest, Empty) { + NumMap map; + EXPECT_TRUE(map.empty()); + + map.insert({1, 100}); + map.insert({2, 200}); + EXPECT_FALSE(map.empty()); +} + +TEST(FlatMapTest, ArrayOperator) { + NumMap map; + + // Create new element if not found. + auto v1 = &map[1]; + EXPECT_EQ(*v1, 0); + EXPECT_EQ(Get(map, 1), 0); + + // Write through returned reference. + *v1 = 100; + EXPECT_EQ(map[1], 100); + EXPECT_EQ(Get(map, 1), 100); + + // Reuse existing element if found. + auto v1a = &map[1]; + EXPECT_EQ(v1, v1a); + EXPECT_EQ(*v1, 100); + + // Create another element. 
+ map[2] = 200; + EXPECT_EQ(Get(map, 1), 100); + EXPECT_EQ(Get(map, 2), 200); +} + +TEST(FlatMapTest, Count) { + NumMap map; + EXPECT_EQ(map.count(1), 0); + EXPECT_EQ(map.count(2), 0); + + map.insert({1, 100}); + EXPECT_EQ(map.count(1), 1); + EXPECT_EQ(map.count(2), 0); + + map.insert({2, 200}); + EXPECT_EQ(map.count(1), 1); + EXPECT_EQ(map.count(2), 1); +} + +TEST(FlatMapTest, Iter) { + NumMap map; + EXPECT_EQ(Contents(map), NumMapContents()); + + map.insert({1, 100}); + map.insert({2, 200}); + EXPECT_EQ(Contents(map), NumMapContents({{1, 100}, {2, 200}})); +} + +TEST(FlatMapTest, Erase) { + NumMap map; + EXPECT_EQ(map.erase(1), 0); + map[1] = 100; + map[2] = 200; + EXPECT_EQ(map.erase(3), 0); + EXPECT_EQ(map.erase(1), 1); + EXPECT_EQ(map.size(), 1); + EXPECT_EQ(Get(map, 2), 200); + EXPECT_EQ(Contents(map), NumMapContents({{2, 200}})); + EXPECT_EQ(map.erase(2), 1); + EXPECT_EQ(Contents(map), NumMapContents()); +} + +TEST(FlatMapTest, EraseIter) { + NumMap map; + Fill(&map, 1, 11); + size_t size = 10; + for (auto iter = map.begin(); iter != map.end();) { + iter = map.erase(iter); + size--; + EXPECT_EQ(map.size(), size); + } + EXPECT_EQ(Contents(map), NumMapContents()); +} + +TEST(FlatMapTest, EraseIterPair) { + NumMap map; + Fill(&map, 1, 11); + NumMap expected; + auto p1 = map.begin(); + expected.insert(*p1); + ++p1; + expected.insert(*p1); + ++p1; + auto p2 = map.end(); + EXPECT_EQ(map.erase(p1, p2), map.end()); + EXPECT_EQ(map.size(), 2); + EXPECT_EQ(Contents(map), Contents(expected)); +} + +TEST(FlatMapTest, EraseLongChains) { + // Make a map with lots of elements and erase a bunch of them to ensure + // that we are likely to hit them on future lookups. 
+ NumMap map; + const int num = 128; + Fill(&map, 0, num); + for (int i = 0; i < num; i += 3) { + EXPECT_EQ(map.erase(i), 1); + } + for (int i = 0; i < num; i++) { + if ((i % 3) != 0) { + EXPECT_EQ(Get(map, i), i * 100); + } else { + EXPECT_EQ(map.count(i), 0); + } + } + + // Erase remainder to trigger table shrinking. + const size_t orig_buckets = map.bucket_count(); + for (int i = 0; i < num; i++) { + map.erase(i); + } + EXPECT_TRUE(map.empty()); + EXPECT_EQ(map.bucket_count(), orig_buckets); + map[1] = 100; // Actual shrinking is triggered by an insert. + EXPECT_LT(map.bucket_count(), orig_buckets); +} + +TEST(FlatMap, AlternatingInsertRemove) { + NumMap map; + map.insert({1000, 1000}); + map.insert({2000, 1000}); + map.insert({3000, 1000}); + for (int i = 0; i < 10000; i++) { + map.insert({i, i}); + map.erase(i); + } +} + +TEST(FlatMap, ClearNoResize) { + NumMap map; + Fill(&map, 0, 100); + const size_t orig = map.bucket_count(); + map.clear_no_resize(); + EXPECT_EQ(map.size(), 0); + EXPECT_EQ(Contents(map), NumMapContents()); + EXPECT_EQ(map.bucket_count(), orig); +} + +TEST(FlatMap, Clear) { + NumMap map; + Fill(&map, 0, 100); + const size_t orig = map.bucket_count(); + map.clear(); + EXPECT_EQ(map.size(), 0); + EXPECT_EQ(Contents(map), NumMapContents()); + EXPECT_LT(map.bucket_count(), orig); +} + +TEST(FlatMap, Copy) { + for (int n = 0; n < 10; n++) { + NumMap src; + Fill(&src, 0, n); + NumMap copy = src; + EXPECT_EQ(Contents(src), Contents(copy)); + NumMap copy2; + copy2 = src; + EXPECT_EQ(Contents(src), Contents(copy2)); + copy2 = copy2; // Self-assignment + EXPECT_EQ(Contents(src), Contents(copy2)); + } +} + +TEST(FlatMap, InitFromIter) { + for (int n = 0; n < 10; n++) { + NumMap src; + Fill(&src, 0, n); + auto vec = Contents(src); + NumMap dst(vec.begin(), vec.end()); + EXPECT_EQ(Contents(dst), vec); + } +} + +TEST(FlatMap, InsertIter) { + NumMap a, b; + Fill(&a, 1, 10); + Fill(&b, 8, 20); + b[9] = 10000; // Should not get inserted into a since a 
already has 9 + a.insert(b.begin(), b.end()); + NumMap expected; + Fill(&expected, 1, 20); + EXPECT_EQ(Contents(a), Contents(expected)); +} + +TEST(FlatMap, Eq) { + NumMap empty; + + NumMap elems; + Fill(&elems, 0, 5); + EXPECT_FALSE(empty == elems); + EXPECT_TRUE(empty != elems); + + NumMap copy = elems; + EXPECT_TRUE(copy == elems); + EXPECT_FALSE(copy != elems); + + NumMap changed = elems; + changed[3] = 1; + EXPECT_FALSE(changed == elems); + EXPECT_TRUE(changed != elems); + + NumMap changed2 = elems; + changed2.erase(3); + EXPECT_FALSE(changed2 == elems); + EXPECT_TRUE(changed2 != elems); +} + +TEST(FlatMap, Swap) { + NumMap a, b; + Fill(&a, 1, 5); + Fill(&b, 100, 200); + NumMap c = a; + NumMap d = b; + EXPECT_EQ(c, a); + EXPECT_EQ(d, b); + c.swap(d); + EXPECT_EQ(c, b); + EXPECT_EQ(d, a); +} + +TEST(FlatMap, Reserve) { + NumMap src; + Fill(&src, 1, 100); + NumMap a = src; + a.reserve(10); + EXPECT_EQ(a, src); + NumMap b = src; + b.rehash(1000); + EXPECT_EQ(b, src); +} + +TEST(FlatMap, EqualRangeMutable) { + NumMap map; + Fill(&map, 1, 10); + + // Existing element + auto p1 = map.equal_range(3); + EXPECT_TRUE(p1.first != p1.second); + EXPECT_EQ(p1.first->first, 3); + EXPECT_EQ(p1.first->second, 300); + ++p1.first; + EXPECT_TRUE(p1.first == p1.second); + + // Missing element + auto p2 = map.equal_range(100); + EXPECT_TRUE(p2.first == p2.second); +} + +TEST(FlatMap, EqualRangeConst) { + NumMap tmp; + Fill(&tmp, 1, 10); + + const NumMap map = tmp; + + // Existing element + auto p1 = map.equal_range(3); + EXPECT_TRUE(p1.first != p1.second); + EXPECT_EQ(p1.first->first, 3); + EXPECT_EQ(p1.first->second, 300); + ++p1.first; + EXPECT_TRUE(p1.first == p1.second); + + // Missing element + auto p2 = map.equal_range(100); + EXPECT_TRUE(p2.first == p2.second); +} + +TEST(FlatMap, Prefetch) { + NumMap map; + Fill(&map, 0, 1000); + // Prefetch present and missing keys. + for (int i = 0; i < 2000; i++) { + map.prefetch_value(i); + } +} + +// Non-copyable values should work. 
+struct NC { + int64 value; + NC() : value(-1) {} + NC(int64 v) : value(v) {} + NC(const NC& x) : value(x.value) {} + bool operator==(const NC& x) const { return value == x.value; } +}; +struct HashNC { + size_t operator()(NC x) const { return x.value; } +}; + +TEST(FlatMap, NonCopyable) { + FlatMap map; + for (int i = 0; i < 100; i++) { + map[NC(i)] = NC(i * 100); + } + for (int i = 0; i < 100; i++) { + EXPECT_EQ(map.count(NC(i)), 1); + auto iter = map.find(NC(i)); + EXPECT_NE(iter, map.end()); + EXPECT_EQ(iter->first, NC(i)); + EXPECT_EQ(iter->second, NC(i * 100)); + EXPECT_EQ(map[NC(i)], NC(i * 100)); + } + map.erase(NC(10)); + EXPECT_EQ(map.count(NC(10)), 0); +} + +// Test with heap-allocated objects so that mismanaged constructions +// or destructions will show up as errors under a sanitizer or +// heap checker. +TEST(FlatMap, ConstructDestruct) { + FlatMap map; + string k1 = "the quick brown fox jumped over the lazy dog"; + string k2 = k1 + k1; + string k3 = k1 + k2; + map[k1] = k2; + map[k3] = k1; + EXPECT_EQ(k1, map.find(k1)->first); + EXPECT_EQ(k2, map.find(k1)->second); + EXPECT_EQ(k1, map[k3]); + map.erase(k3); + EXPECT_EQ(string(), map[k3]); + + map.clear(); + map[k1] = k2; + EXPECT_EQ(k2, map[k1]); + + map.reserve(100); + EXPECT_EQ(k2, map[k1]); +} + +// Type to use to ensure that custom equality operator is used +// that ignores extra value. +struct CustomCmpKey { + int64 a; + int64 b; + CustomCmpKey(int64 v1, int64 v2) : a(v1), b(v2) {} + bool operator==(const CustomCmpKey& x) const { return a == x.a && b == x.b; } +}; +struct HashA { + size_t operator()(CustomCmpKey x) const { return x.a; } +}; +struct EqA { + // Ignore b fields. + bool operator()(CustomCmpKey x, CustomCmpKey y) const { return x.a == y.a; } +}; +TEST(FlatMap, CustomCmp) { + FlatMap map; + map[CustomCmpKey(100, 200)] = 300; + EXPECT_EQ(300, map[CustomCmpKey(100, 200)]); + EXPECT_EQ(300, map[CustomCmpKey(100, 500)]); // Differences in key.b ignored +} + +// Test unique_ptr handling. 
+typedef std::unique_ptr UniqInt; +static UniqInt MakeUniq(int i) { return UniqInt(new int(i)); } + +struct HashUniq { + size_t operator()(const UniqInt& p) const { return *p; } +}; +struct EqUniq { + bool operator()(const UniqInt& a, const UniqInt& b) const { return *a == *b; } +}; +typedef FlatMap UniqMap; + +TEST(FlatMap, UniqueMap) { + UniqMap map; + + // Fill map + const int N = 10; + for (int i = 0; i < N; i++) { + if ((i % 2) == 0) { + map[MakeUniq(i)] = MakeUniq(i + 100); + } else { + map.emplace(MakeUniq(i), MakeUniq(i + 100)); + } + } + EXPECT_EQ(map.size(), N); + + // Lookups + for (int i = 0; i < N; i++) { + EXPECT_EQ(*map.at(MakeUniq(i)), i + 100); + } + + // find+erase + EXPECT_EQ(map.count(MakeUniq(2)), 1); + map.erase(MakeUniq(2)); + EXPECT_EQ(map.count(MakeUniq(2)), 0); + + // clear + map.clear(); + EXPECT_EQ(map.size(), 0); +} + +TEST(FlatMap, UniqueMapIter) { + UniqMap map; + const int kCount = 10; + const int kValueDelta = 100; + for (int i = 1; i <= kCount; i++) { + map[MakeUniq(i)] = MakeUniq(i + kValueDelta); + } + int key_sum = 0; + int val_sum = 0; + for (const auto& p : map) { + key_sum += *p.first; + val_sum += *p.second; + } + EXPECT_EQ(key_sum, (kCount * (kCount + 1)) / 2); + EXPECT_EQ(val_sum, key_sum + (kCount * kValueDelta)); +} + +} // namespace +} // namespace gtl +} // namespace tensorflow diff --git a/tensorflow/core/lib/gtl/flatrep.h b/tensorflow/core/lib/gtl/flatrep.h new file mode 100644 index 00000000000..ff590d41280 --- /dev/null +++ b/tensorflow/core/lib/gtl/flatrep.h @@ -0,0 +1,332 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef THIRD_PARTY_TENSORFLOW_CORE_LIB_GTL_FLATREP_H_ +#define THIRD_PARTY_TENSORFLOW_CORE_LIB_GTL_FLATREP_H_ + +#include +#include +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace gtl { +namespace internal { + +// Internal representation for FlatMap and FlatSet. +// +// The representation is an open-addressed hash table. Conceptually, +// the representation is a flat array of entries. However we +// structure it as an array of of buckets where each bucket holds +// kWidth entries along with metadata for the kWidth entries. The +// metadata marker is +// +// (a) kEmpty: the entry is empty +// (b) kDeleted: the entry has been deleted +// (c) other: the entry is occupied and has low-8 bits of its hash. +// These hash bits can be used to avoid potentially expensive +// key comparisons. +// +// FlatMap passes in a bucket that contains keys and values, FlatSet +// passes in a bucket that does not contain values. +template +class FlatRep { + public: + // kWidth is the number of entries stored in a bucket. + static const uint32 kBase = 3; + static const uint32 kWidth = (1 << kBase); + + FlatRep(size_t N, const Hash& hf, const Eq& eq) : hash_(hf), equal_(eq) { + Init(N); + } + explicit FlatRep(const FlatRep& src) : hash_(src.hash_), equal_(src.equal_) { + Init(src.size()); + CopyEntries(src.array_, src.end_, CopyEntry()); + } + ~FlatRep() { + clear_no_resize(); + delete[] array_; + } + + // Simple accessors. 
+ size_t size() const { return not_empty_ - deleted_; } + size_t bucket_count() const { return mask_ + 1; } + Bucket* start() const { return array_; } + Bucket* limit() const { return end_; } + const Hash& hash_function() const { return hash_; } + const Eq& key_eq() const { return equal_; } + + // Overwrite contents of *this with contents of src. + void CopyFrom(const FlatRep& src) { + if (this != &src) { + clear_no_resize(); + delete[] array_; + Init(src.size()); + CopyEntries(src.array_, src.end_, CopyEntry()); + } + } + + void clear_no_resize() { + for (Bucket* b = array_; b != end_; b++) { + for (uint32 i = 0; i < kWidth; i++) { + if (b->marker[i] >= 2) { + b->Destroy(i); + b->marker[i] = kEmpty; + } + } + } + not_empty_ = 0; + deleted_ = 0; + } + + void clear() { + clear_no_resize(); + grow_ = 0; // Consider shrinking in MaybeResize() + MaybeResize(); + } + + void swap(FlatRep& x) { + using std::swap; + swap(array_, x.array_); + swap(end_, x.end_); + swap(lglen_, x.lglen_); + swap(mask_, x.mask_); + swap(not_empty_, x.not_empty_); + swap(deleted_, x.deleted_); + swap(grow_, x.grow_); + swap(shrink_, x.shrink_); + } + + struct SearchResult { + bool found; + Bucket* b; + uint32 index; + }; + + // Hash value is partitioned as follows: + // 1. Bottom 8 bits are stored in bucket to help speed up comparisons. + // 2. Next 3 bits give index inside bucket. + // 3. Remaining bits give bucket number. + + // Find bucket/index for key k. + SearchResult Find(const Key& k) const { + size_t h = hash_(k); + const uint32 marker = Marker(h & 0xff); + size_t index = (h >> 8) & mask_; // Holds bucket num and index-in-bucket + uint32 num_probes = 1; // Needed for quadratic probing + while (true) { + uint32 bi = index & (kWidth - 1); + Bucket* b = &array_[index >> kBase]; + const uint32 x = b->marker[bi]; + if (x == marker && equal_(b->key(bi), k)) { + return {true, b, bi}; + } else if (x == kEmpty) { + return {false, nullptr, 0}; + } + // Quadratic probing. 
+ index = (index + num_probes) & mask_; + num_probes++; + } + } + + // Find bucket/index for key k, creating a new one if necessary. + // + // KeyType is a template parameter so that k's type is deduced and it + // becomes a universal reference which allows the key initialization + // below to use an rvalue constructor if available. + template + SearchResult FindOrInsert(KeyType&& k) { + size_t h = hash_(k); + const uint32 marker = Marker(h & 0xff); + size_t index = (h >> 8) & mask_; // Holds bucket num and index-in-bucket + uint32 num_probes = 1; // Needed for quadratic probing + Bucket* del = nullptr; // First encountered deletion for kInsert + uint32 di = 0; + while (true) { + uint32 bi = index & (kWidth - 1); + Bucket* b = &array_[index >> kBase]; + const uint32 x = b->marker[bi]; + if (x == marker && equal_(b->key(bi), k)) { + return {true, b, bi}; + } else if (!del && x == kDeleted) { + // Remember deleted index to use for insertion. + del = b; + di = bi; + } else if (x == kEmpty) { + if (del) { + // Store in the first deleted slot we encountered + b = del; + bi = di; + deleted_--; // not_empty_ does not change + } else { + not_empty_++; + } + b->marker[bi] = marker; + new (&b->key(bi)) Key(std::forward(k)); + return {false, b, bi}; + } + // Quadratic probing. + index = (index + num_probes) & mask_; + num_probes++; + } + } + + void Erase(Bucket* b, uint32 i) { + b->Destroy(i); + b->marker[i] = kDeleted; + deleted_++; + grow_ = 0; // Consider shrinking on next insert + } + + void Prefetch(const Key& k) const { + size_t h = hash_(k); + size_t index = (h >> 8) & mask_; // Holds bucket num and index-in-bucket + uint32 bi = index & (kWidth - 1); + Bucket* b = &array_[index >> kBase]; + prefetch(&b->storage.key[bi]); + } + void prefetch(const void* ptr) const { + // TODO(jeff,sanjay): Remove this routine when we add a + // prefetch(...) 
call to platform so that the Prefetch routine + // actually does something + } + + inline void MaybeResize() { + if (not_empty_ < grow_) { + return; // Nothing to do + } + if (grow_ == 0) { + // Special value set by erase to cause shrink on next insert. + if (size() >= shrink_) { + // Not small enough to shrink. + grow_ = static_cast(bucket_count() * 0.8); + if (not_empty_ < grow_) return; + } + } + Resize(size() + 1); + } + + void Resize(size_t N) { + Bucket* old = array_; + Bucket* old_end = end_; + Init(N); + CopyEntries(old, old_end, MoveEntry()); + delete[] old; + } + + private: + enum { kEmpty = 0, kDeleted = 1 }; // Special markers for an entry. + + Hash hash_; // User-supplied hasher + Eq equal_; // User-supplied comparator + uint8 lglen_; // lg(#buckets) + Bucket* array_; // array of length (1 << lglen_) + Bucket* end_; // Points just past last bucket in array_ + size_t mask_; // (# of entries in table) - 1 + size_t not_empty_; // Count of entries with marker != kEmpty + size_t deleted_; // Count of entries with marker == kDeleted + size_t grow_; // Grow array when not_empty_ >= grow_ + size_t shrink_; // Shrink array when size() < shrink_ + + // Avoid kEmpty and kDeleted markers when computing hash values to + // store in Bucket::marker[]. + static uint32 Marker(uint32 hb) { return hb + (hb < 2 ? 2 : 0); } + + void Init(size_t N) { + // Make enough room for N elements. + size_t lg = 0; // Smallest table is just one bucket. + while (N >= 0.8 * ((1 << lg) * kWidth)) { + lg++; + } + const size_t n = (1 << lg); + Bucket* array = new Bucket[n]; + for (size_t i = 0; i < n; i++) { + Bucket* b = &array[i]; + memset(b->marker, kEmpty, kWidth); + } + const size_t capacity = (1 << lg) * kWidth; + lglen_ = lg; + mask_ = capacity - 1; + array_ = array; + end_ = array + n; + not_empty_ = 0; + deleted_ = 0; + grow_ = static_cast(capacity * 0.8); + if (lg == 0) { + // Already down to one bucket; no more shrinking. 
+ shrink_ = 0; + } else { + shrink_ = static_cast(grow_ * 0.4); // Must be less than 0.5 + } + } + + // Used by FreshInsert when we should copy from source. + struct CopyEntry { + inline void operator()(Bucket* dst, uint32 dsti, Bucket* src, uint32 srci) { + dst->CopyFrom(dsti, src, srci); + } + }; + + // Used by FreshInsert when we should move from source. + struct MoveEntry { + inline void operator()(Bucket* dst, uint32 dsti, Bucket* src, uint32 srci) { + dst->MoveFrom(dsti, src, srci); + src->Destroy(srci); + src->marker[srci] = kDeleted; + } + }; + + template + void CopyEntries(Bucket* start, Bucket* end, Copier copier) { + for (Bucket* b = start; b != end; b++) { + for (uint32 i = 0; i < kWidth; i++) { + if (b->marker[i] >= 2) { + FreshInsert(b, i, copier); + } + } + } + } + + // Create an entry for the key numbered src_index in *src and return + // its bucket/index. Used for insertion into a fresh table. We + // assume that there are no deletions, and k does not already exist + // in the table. + template + void FreshInsert(Bucket* src, uint32 src_index, Copier copier) { + size_t h = hash_(src->key(src_index)); + const uint32 marker = Marker(h & 0xff); + size_t index = (h >> 8) & mask_; // Holds bucket num and index-in-bucket + uint32 num_probes = 1; // Needed for quadratic probing + while (true) { + uint32 bi = index & (kWidth - 1); + Bucket* b = &array_[index >> kBase]; + const uint32 x = b->marker[bi]; + if (x == 0) { + b->marker[bi] = marker; + not_empty_++; + copier(b, bi, src, src_index); + return; + } + // Quadratic probing. 
+ index = (index + num_probes) & mask_; + num_probes++; + } + } +}; + +} // namespace internal +} // namespace gtl +} // namespace tensorflow + +#endif // THIRD_PARTY_TENSORFLOW_CORE_LIB_GTL_FLATREP_H_ diff --git a/tensorflow/core/lib/gtl/flatset.h b/tensorflow/core/lib/gtl/flatset.h new file mode 100644 index 00000000000..b94d88cbc6a --- /dev/null +++ b/tensorflow/core/lib/gtl/flatset.h @@ -0,0 +1,277 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef THIRD_PARTY_TENSORFLOW_CORE_LIB_GTL_FLATSET_H_ +#define THIRD_PARTY_TENSORFLOW_CORE_LIB_GTL_FLATSET_H_ + +#include +#include +#include "tensorflow/core/lib/gtl/flatrep.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace gtl { + +// FlatSet provides a set of K. +// +// The map is implemented using an open-addressed hash table. A +// single array holds entire map contents and collisions are resolved +// by probing at a sequence of locations in the array. +template > +class FlatSet { + private: + // Forward declare some internal types needed in public section. 
+ struct Bucket; + + public: + typedef Key key_type; + typedef Key value_type; + typedef Hash hasher; + typedef Eq key_equal; + typedef size_t size_type; + typedef ptrdiff_t difference_type; + typedef value_type* pointer; + typedef const value_type* const_pointer; + typedef value_type& reference; + typedef const value_type& const_reference; + + FlatSet() : FlatSet(1) {} + + explicit FlatSet(size_t N, const Hash& hf = Hash(), const Eq& eq = Eq()) + : rep_(N, hf, eq) {} + + FlatSet(const FlatSet& src) : rep_(src.rep_) {} + + template + FlatSet(InputIter first, InputIter last, size_t N = 1, + const Hash& hf = Hash(), const Eq& eq = Eq()) + : FlatSet(N, hf, eq) { + insert(first, last); + } + + FlatSet& operator=(const FlatSet& src) { + rep_.CopyFrom(src.rep_); + return *this; + } + + ~FlatSet() {} + + void swap(FlatSet& x) { rep_.swap(x.rep_); } + void clear_no_resize() { rep_.clear_no_resize(); } + void clear() { rep_.clear(); } + void reserve(size_t N) { rep_.Resize(std::max(N, size())); } + void rehash(size_t N) { rep_.Resize(std::max(N, size())); } + void resize(size_t N) { rep_.Resize(std::max(N, size())); } + size_t size() const { return rep_.size(); } + bool empty() const { return size() == 0; } + size_t bucket_count() const { return rep_.bucket_count(); } + hasher hash_function() const { return rep_.hash_function(); } + key_equal key_eq() const { return rep_.key_eq(); } + + class iterator { + public: + iterator() : b_(nullptr), end_(nullptr), i_(0) {} + + // Make iterator pointing at first element at or after b. + explicit iterator(Bucket* b, Bucket* end) : b_(b), end_(end), i_(0) { + SkipUnused(); + } + + // Make iterator pointing exactly at ith element in b, which must exist. 
+ iterator(Bucket* b, Bucket* end, uint32 i) : b_(b), end_(end), i_(i) {} + + Key& operator*() { return key(); } + Key* operator->() { return &key(); } + bool operator==(const iterator& x) const { + return b_ == x.b_ && i_ == x.i_; + } + bool operator!=(const iterator& x) const { return !(*this == x); } + iterator& operator++() { + DCHECK(b_ != end_); + i_++; + SkipUnused(); + return *this; + } + + private: + friend class FlatSet; + Bucket* b_; + Bucket* end_; + uint32 i_; + + Key& key() const { return b_->key(i_); } + void SkipUnused() { + while (b_ < end_) { + if (i_ >= Rep::kWidth) { + i_ = 0; + b_++; + } else if (b_->marker[i_] < 2) { + i_++; + } else { + break; + } + } + } + }; + + class const_iterator { + private: + mutable iterator rep_; // Share state and logic with non-const iterator. + public: + const_iterator() : rep_() {} + explicit const_iterator(Bucket* start, Bucket* end) : rep_(start, end) {} + const_iterator(Bucket* b, Bucket* end, uint32 i) : rep_(b, end, i) {} + + const Key& operator*() const { return rep_.key(); } + const Key* operator->() const { return &rep_.key(); } + bool operator==(const const_iterator& x) const { return rep_ == x.rep_; } + bool operator!=(const const_iterator& x) const { return rep_ != x.rep_; } + const_iterator& operator++() { + ++rep_; + return *this; + } + }; + + iterator begin() { return iterator(rep_.start(), rep_.limit()); } + iterator end() { return iterator(rep_.limit(), rep_.limit()); } + const_iterator begin() const { + return const_iterator(rep_.start(), rep_.limit()); + } + const_iterator end() const { + return const_iterator(rep_.limit(), rep_.limit()); + } + + size_t count(const Key& k) const { return rep_.Find(k).found ? 1 : 0; } + iterator find(const Key& k) { + auto r = rep_.Find(k); + return r.found ? iterator(r.b, rep_.limit(), r.index) : end(); + } + const_iterator find(const Key& k) const { + auto r = rep_.Find(k); + return r.found ? 
const_iterator(r.b, rep_.limit(), r.index) : end(); + } + + std::pair insert(const Key& k) { return Insert(k); } + template + void insert(InputIter first, InputIter last) { + for (; first != last; ++first) { + insert(*first); + } + } + + template + std::pair emplace(Args&&... args) { + rep_.MaybeResize(); + auto r = rep_.FindOrInsert(std::forward(args)...); + const bool inserted = !r.found; + return {iterator(r.b, rep_.limit(), r.index), inserted}; + } + + size_t erase(const Key& k) { + auto r = rep_.Find(k); + if (!r.found) return 0; + rep_.Erase(r.b, r.index); + return 1; + } + iterator erase(iterator pos) { + rep_.Erase(pos.b_, pos.i_); + ++pos; + return pos; + } + iterator erase(iterator pos, iterator last) { + for (; pos != last; ++pos) { + rep_.Erase(pos.b_, pos.i_); + } + return pos; + } + + std::pair equal_range(const Key& k) { + auto pos = find(k); + if (pos == end()) { + return std::make_pair(pos, pos); + } else { + auto next = pos; + ++next; + return std::make_pair(pos, next); + } + } + std::pair equal_range(const Key& k) const { + auto pos = find(k); + if (pos == end()) { + return std::make_pair(pos, pos); + } else { + auto next = pos; + ++next; + return std::make_pair(pos, next); + } + } + + bool operator==(const FlatSet& x) const { + if (size() != x.size()) return false; + for (const auto& elem : x) { + auto i = find(elem); + if (i == end()) return false; + } + return true; + } + bool operator!=(const FlatSet& x) const { return !(*this == x); } + + // If key exists in the table, prefetch it. This is a hint, and may + // have no effect. + void prefetch_value(const Key& key) const { rep_.Prefetch(key); } + + private: + using Rep = internal::FlatRep; + + // Bucket stores kWidth triples. + // The data is organized as three parallel arrays to reduce padding. + struct Bucket { + uint8 marker[Rep::kWidth]; + + // Wrap keys in union to control construction and destruction. 
+ union Storage { + Key key[Rep::kWidth]; + Storage() {} + ~Storage() {} + } storage; + + Key& key(uint32 i) { + DCHECK_GE(marker[i], 2); + return storage.key[i]; + } + void Destroy(uint32 i) { storage.key[i].Key::~Key(); } + void MoveFrom(uint32 i, Bucket* src, uint32 src_index) { + new (&storage.key[i]) Key(std::move(src->storage.key[src_index])); + } + void CopyFrom(uint32 i, Bucket* src, uint32 src_index) { + new (&storage.key[i]) Key(src->storage.key[src_index]); + } + }; + + std::pair Insert(const Key& k) { + rep_.MaybeResize(); + auto r = rep_.FindOrInsert(k); + const bool inserted = !r.found; + return {iterator(r.b, rep_.limit(), r.index), inserted}; + } + + Rep rep_; +}; + +} // namespace gtl +} // namespace tensorflow + +#endif // THIRD_PARTY_TENSORFLOW_CORE_LIB_GTL_FLATSET_H_ diff --git a/tensorflow/core/lib/gtl/flatset_test.cc b/tensorflow/core/lib/gtl/flatset_test.cc new file mode 100644 index 00000000000..ea9c9c22b55 --- /dev/null +++ b/tensorflow/core/lib/gtl/flatset_test.cc @@ -0,0 +1,501 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/lib/gtl/flatset.h" + +#include +#include +#include +#include "tensorflow/core/lib/hash/hash.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace gtl { +namespace { + +typedef FlatSet NumSet; + +// Returns true iff set has an entry for k. +// Also verifies that find and count give consistent results. +bool Has(const NumSet& set, int64 k) { + auto iter = set.find(k); + if (iter == set.end()) { + EXPECT_EQ(set.count(k), 0); + return false; + } else { + EXPECT_EQ(set.count(k), 1); + EXPECT_EQ(*iter, k); + return true; + } +} + +// Return contents of set as a sorted list of numbers. +typedef std::vector NumSetContents; +NumSetContents Contents(const NumSet& set) { + NumSetContents result; + for (int64 n : set) { + result.push_back(n); + } + std::sort(result.begin(), result.end()); + return result; +} + +// Fill entries with keys [start,limit). +void Fill(NumSet* set, int64 start, int64 limit) { + for (int64 i = start; i < limit; i++) { + set->insert(i); + } +} + +TEST(FlatSetTest, Find) { + NumSet set; + EXPECT_FALSE(Has(set, 1)); + set.insert(1); + set.insert(2); + EXPECT_TRUE(Has(set, 1)); + EXPECT_TRUE(Has(set, 2)); + EXPECT_FALSE(Has(set, 3)); +} + +TEST(FlatSetTest, Insert) { + NumSet set; + EXPECT_FALSE(Has(set, 1)); + + // New entry. + auto result = set.insert(1); + EXPECT_TRUE(result.second); + EXPECT_EQ(*result.first, 1); + EXPECT_TRUE(Has(set, 1)); + + // Attempt to insert over existing entry. + result = set.insert(1); + EXPECT_FALSE(result.second); + EXPECT_EQ(*result.first, 1); + EXPECT_TRUE(Has(set, 1)); +} + +TEST(FlatSetTest, InsertGrowth) { + NumSet set; + const int n = 100; + Fill(&set, 0, 100); + EXPECT_EQ(set.size(), n); + for (int i = 0; i < n; i++) { + EXPECT_TRUE(Has(set, i)) << i; + } +} + +TEST(FlatSetTest, Emplace) { + NumSet set; + + // New entry. 
+ auto result = set.emplace(73); + EXPECT_TRUE(result.second); + EXPECT_EQ(*result.first, 73); + EXPECT_TRUE(Has(set, 73)); + + // Attempt to insert an existing entry. + result = set.emplace(73); + EXPECT_FALSE(result.second); + EXPECT_EQ(*result.first, 73); + EXPECT_TRUE(Has(set, 73)); + + // Add a second value + result = set.emplace(103); + EXPECT_TRUE(result.second); + EXPECT_EQ(*result.first, 103); + EXPECT_TRUE(Has(set, 103)); +} + +TEST(FlatSetTest, Size) { + NumSet set; + EXPECT_EQ(set.size(), 0); + + set.insert(1); + set.insert(2); + EXPECT_EQ(set.size(), 2); +} + +TEST(FlatSetTest, Empty) { + NumSet set; + EXPECT_TRUE(set.empty()); + + set.insert(1); + set.insert(2); + EXPECT_FALSE(set.empty()); +} + +TEST(FlatSetTest, Count) { + NumSet set; + EXPECT_EQ(set.count(1), 0); + EXPECT_EQ(set.count(2), 0); + + set.insert(1); + EXPECT_EQ(set.count(1), 1); + EXPECT_EQ(set.count(2), 0); + + set.insert(2); + EXPECT_EQ(set.count(1), 1); + EXPECT_EQ(set.count(2), 1); +} + +TEST(FlatSetTest, Iter) { + NumSet set; + EXPECT_EQ(Contents(set), NumSetContents()); + + set.insert(1); + set.insert(2); + EXPECT_EQ(Contents(set), NumSetContents({1, 2})); +} + +TEST(FlatSetTest, Erase) { + NumSet set; + EXPECT_EQ(set.erase(1), 0); + set.insert(1); + set.insert(2); + EXPECT_EQ(set.erase(3), 0); + EXPECT_EQ(set.erase(1), 1); + EXPECT_EQ(set.size(), 1); + EXPECT_TRUE(Has(set, 2)); + EXPECT_EQ(Contents(set), NumSetContents({2})); + EXPECT_EQ(set.erase(2), 1); + EXPECT_EQ(Contents(set), NumSetContents()); +} + +TEST(FlatSetTest, EraseIter) { + NumSet set; + Fill(&set, 1, 11); + size_t size = 10; + for (auto iter = set.begin(); iter != set.end();) { + iter = set.erase(iter); + size--; + EXPECT_EQ(set.size(), size); + } + EXPECT_EQ(Contents(set), NumSetContents()); +} + +TEST(FlatSetTest, EraseIterPair) { + NumSet set; + Fill(&set, 1, 11); + NumSet expected; + auto p1 = set.begin(); + expected.insert(*p1); + ++p1; + expected.insert(*p1); + ++p1; + auto p2 = set.end(); + 
EXPECT_EQ(set.erase(p1, p2), set.end()); + EXPECT_EQ(set.size(), 2); + EXPECT_EQ(Contents(set), Contents(expected)); +} + +TEST(FlatSetTest, EraseLongChains) { + // Make a set with lots of elements and erase a bunch of them to ensure + // that we are likely to hit them on future lookups. + NumSet set; + const int num = 128; + Fill(&set, 0, num); + for (int i = 0; i < num; i += 3) { + EXPECT_EQ(set.erase(i), 1); + } + for (int i = 0; i < num; i++) { + // Multiples of 3 should be not present. + EXPECT_EQ(Has(set, i), ((i % 3) != 0)) << i; + } + + // Erase remainder to trigger table shrinking. + const size_t orig_buckets = set.bucket_count(); + for (int i = 0; i < num; i++) { + set.erase(i); + } + EXPECT_TRUE(set.empty()); + EXPECT_EQ(set.bucket_count(), orig_buckets); + set.insert(1); // Actual shrinking is triggered by an insert. + EXPECT_LT(set.bucket_count(), orig_buckets); +} + +TEST(FlatSet, ClearNoResize) { + NumSet set; + Fill(&set, 0, 100); + const size_t orig = set.bucket_count(); + set.clear_no_resize(); + EXPECT_EQ(set.size(), 0); + EXPECT_EQ(Contents(set), NumSetContents()); + EXPECT_EQ(set.bucket_count(), orig); +} + +TEST(FlatSet, Clear) { + NumSet set; + Fill(&set, 0, 100); + const size_t orig = set.bucket_count(); + set.clear(); + EXPECT_EQ(set.size(), 0); + EXPECT_EQ(Contents(set), NumSetContents()); + EXPECT_LT(set.bucket_count(), orig); +} + +TEST(FlatSet, Copy) { + for (int n = 0; n < 10; n++) { + NumSet src; + Fill(&src, 0, n); + NumSet copy = src; + EXPECT_EQ(Contents(src), Contents(copy)); + NumSet copy2; + copy2 = src; + EXPECT_EQ(Contents(src), Contents(copy2)); + copy2 = copy2; // Self-assignment + EXPECT_EQ(Contents(src), Contents(copy2)); + } +} + +TEST(FlatSet, InitFromIter) { + for (int n = 0; n < 10; n++) { + NumSet src; + Fill(&src, 0, n); + auto vec = Contents(src); + NumSet dst(vec.begin(), vec.end()); + EXPECT_EQ(Contents(dst), vec); + } +} + +TEST(FlatSet, InsertIter) { + NumSet a, b; + Fill(&a, 1, 10); + Fill(&b, 8, 20); + 
b.insert(9); // Should not get inserted into a since a already has 9 + a.insert(b.begin(), b.end()); + NumSet expected; + Fill(&expected, 1, 20); + EXPECT_EQ(Contents(a), Contents(expected)); +} + +TEST(FlatSet, Eq) { + NumSet empty; + + NumSet elems; + Fill(&elems, 0, 5); + EXPECT_FALSE(empty == elems); + EXPECT_TRUE(empty != elems); + + NumSet copy = elems; + EXPECT_TRUE(copy == elems); + EXPECT_FALSE(copy != elems); + + NumSet changed = elems; + changed.insert(7); + EXPECT_FALSE(changed == elems); + EXPECT_TRUE(changed != elems); + + NumSet changed2 = elems; + changed2.erase(3); + EXPECT_FALSE(changed2 == elems); + EXPECT_TRUE(changed2 != elems); +} + +TEST(FlatSet, Swap) { + NumSet a, b; + Fill(&a, 1, 5); + Fill(&b, 100, 200); + NumSet c = a; + NumSet d = b; + EXPECT_EQ(c, a); + EXPECT_EQ(d, b); + c.swap(d); + EXPECT_EQ(c, b); + EXPECT_EQ(d, a); +} + +TEST(FlatSet, Reserve) { + NumSet src; + Fill(&src, 1, 100); + NumSet a = src; + a.reserve(10); + EXPECT_EQ(a, src); + NumSet b = src; + b.rehash(1000); + EXPECT_EQ(b, src); +} + +TEST(FlatSet, EqualRangeMutable) { + NumSet set; + Fill(&set, 1, 10); + + // Existing element + auto p1 = set.equal_range(3); + EXPECT_TRUE(p1.first != p1.second); + EXPECT_EQ(*p1.first, 3); + ++p1.first; + EXPECT_TRUE(p1.first == p1.second); + + // Missing element + auto p2 = set.equal_range(100); + EXPECT_TRUE(p2.first == p2.second); +} + +TEST(FlatSet, EqualRangeConst) { + NumSet tmp; + Fill(&tmp, 1, 10); + + const NumSet set = tmp; + + // Existing element + auto p1 = set.equal_range(3); + EXPECT_TRUE(p1.first != p1.second); + EXPECT_EQ(*p1.first, 3); + ++p1.first; + EXPECT_TRUE(p1.first == p1.second); + + // Missing element + auto p2 = set.equal_range(100); + EXPECT_TRUE(p2.first == p2.second); +} + +TEST(FlatSet, Prefetch) { + NumSet set; + Fill(&set, 0, 1000); + // Prefetch present and missing keys. + for (int i = 0; i < 2000; i++) { + set.prefetch_value(i); + } +} + +// Non-copyable values should work. 
+struct NC { + int64 value; + NC() : value(-1) {} + NC(int64 v) : value(v) {} + NC(const NC& x) : value(x.value) {} + bool operator==(const NC& x) const { return value == x.value; } +}; +struct HashNC { + size_t operator()(NC x) const { return x.value; } +}; + +TEST(FlatSet, NonCopyable) { + FlatSet set; + for (int i = 0; i < 100; i++) { + set.insert(NC(i)); + } + for (int i = 0; i < 100; i++) { + EXPECT_EQ(set.count(NC(i)), 1); + auto iter = set.find(NC(i)); + EXPECT_NE(iter, set.end()); + EXPECT_EQ(*iter, NC(i)); + } + set.erase(NC(10)); + EXPECT_EQ(set.count(NC(10)), 0); +} + +// Test with heap-allocated objects so that mismanaged constructions +// or destructions will show up as errors under a sanitizer or +// heap checker. +TEST(FlatSet, ConstructDestruct) { + FlatSet set; + string k1 = "the quick brown fox jumped over the lazy dog"; + string k2 = k1 + k1; + string k3 = k1 + k2; + set.insert(k1); + set.insert(k3); + EXPECT_EQ(set.count(k1), 1); + EXPECT_EQ(set.count(k2), 0); + EXPECT_EQ(set.count(k3), 1); + + set.erase(k3); + EXPECT_EQ(set.count(k3), 0); + + set.clear(); + set.insert(k1); + EXPECT_EQ(set.count(k1), 1); + EXPECT_EQ(set.count(k3), 0); + + set.reserve(100); + EXPECT_EQ(set.count(k1), 1); + EXPECT_EQ(set.count(k3), 0); +} + +// Type to use to ensure that custom equality operator is used +// that ignores extra value. +struct CustomCmpKey { + int64 a; + int64 b; + CustomCmpKey(int64 v1, int64 v2) : a(v1), b(v2) {} + bool operator==(const CustomCmpKey& x) const { return a == x.a && b == x.b; } +}; +struct HashA { + size_t operator()(CustomCmpKey x) const { return x.a; } +}; +struct EqA { + // Ignore b fields. + bool operator()(CustomCmpKey x, CustomCmpKey y) const { return x.a == y.a; } +}; +TEST(FlatSet, CustomCmp) { + FlatSet set; + set.insert(CustomCmpKey(100, 200)); + EXPECT_EQ(set.count(CustomCmpKey(100, 200)), 1); + EXPECT_EQ(set.count(CustomCmpKey(100, 500)), 1); // key.b ignored +} + +// Test unique_ptr handling. 
+typedef std::unique_ptr UniqInt; +static UniqInt MakeUniq(int i) { return UniqInt(new int(i)); } + +struct HashUniq { + size_t operator()(const UniqInt& p) const { return *p; } +}; +struct EqUniq { + bool operator()(const UniqInt& a, const UniqInt& b) const { return *a == *b; } +}; +typedef FlatSet UniqSet; + +TEST(FlatSet, UniqueSet) { + UniqSet set; + + // Fill set + const int N = 10; + for (int i = 0; i < N; i++) { + set.emplace(MakeUniq(i)); + } + EXPECT_EQ(set.size(), N); + + // Lookups + for (int i = 0; i < N; i++) { + EXPECT_EQ(set.count(MakeUniq(i)), 1); + } + + // erase + set.erase(MakeUniq(2)); + EXPECT_EQ(set.count(MakeUniq(2)), 0); + + // clear + set.clear(); + EXPECT_EQ(set.size(), 0); +} + +TEST(FlatSet, UniqueSetIter) { + UniqSet set; + const int kCount = 10; + for (int i = 1; i <= kCount; i++) { + set.emplace(MakeUniq(i)); + } + int sum = 0; + for (const auto& p : set) { + sum += *p; + } + EXPECT_EQ(sum, (kCount * (kCount + 1)) / 2); +} + +} // namespace +} // namespace gtl +} // namespace tensorflow diff --git a/tensorflow/core/lib/hash/hash.h b/tensorflow/core/lib/hash/hash.h index 3c71e7d6cce..4e64c90d629 100644 --- a/tensorflow/core/lib/hash/hash.h +++ b/tensorflow/core/lib/hash/hash.h @@ -42,6 +42,24 @@ inline uint64 Hash64Combine(uint64 a, uint64 b) { return a ^ (b + 0x9e3779b97f4a7800ULL + (a << 10) + (a >> 4)); } +// Convenience Hash functors +struct HashInt64 { + size_t operator()(int64 x) const { return static_cast(x); } +}; +struct HashStr { + size_t operator()(const string& s) const { + return static_cast(Hash64(s)); + } +}; +template +struct HashPtr { + size_t operator()(const PTR p) const { + // Hash pointers as integers, but bring more entropy to the lower bits. 
+ size_t k = static_cast(reinterpret_cast(p)); + return k + (k >> 6); + } +}; + } // namespace tensorflow #endif // TENSORFLOW_LIB_HASH_HASH_H_ diff --git a/tensorflow/core/lib/monitoring/collected_metrics.h b/tensorflow/core/lib/monitoring/collected_metrics.h index 42a80bf5b78..3dde55342ef 100644 --- a/tensorflow/core/lib/monitoring/collected_metrics.h +++ b/tensorflow/core/lib/monitoring/collected_metrics.h @@ -25,14 +25,12 @@ limitations under the License. #include #include +#include "tensorflow/core/framework/summary.pb.h" #include "tensorflow/core/lib/monitoring/metric_def.h" namespace tensorflow { namespace monitoring { -// The type of the metric values. -enum class ValueType : int { kInt64 = 0 }; - // A metric is a statistic about a monitorable entity. // // Metrics are named with path-like strings, which must conform to the regular @@ -89,6 +87,7 @@ struct Point { // The actual metric value, dependent on the value_type enum. ValueType value_type; int64 int64_value; + HistogramProto histogram_value; // start_timestamp and end_timestamp indicate the time period over which this // point's value measurement applies. diff --git a/tensorflow/core/lib/monitoring/collection_registry.cc b/tensorflow/core/lib/monitoring/collection_registry.cc index 47112279cff..d3fd7132de5 100644 --- a/tensorflow/core/lib/monitoring/collection_registry.cc +++ b/tensorflow/core/lib/monitoring/collection_registry.cc @@ -49,9 +49,8 @@ void Collector::CollectMetricDescriptor( metric_descriptor->label_names.push_back(label_name.ToString()); } - // Only cumulative int64 counter is implemented at the moment. 
- metric_descriptor->metric_kind = MetricKind::kCumulative; - metric_descriptor->value_type = ValueType::kInt64; + metric_descriptor->metric_kind = metric_def->kind(); + metric_descriptor->value_type = metric_def->value_type(); } } // namespace internal diff --git a/tensorflow/core/lib/monitoring/collection_registry.h b/tensorflow/core/lib/monitoring/collection_registry.h index 3da2439238f..2eff4684367 100644 --- a/tensorflow/core/lib/monitoring/collection_registry.h +++ b/tensorflow/core/lib/monitoring/collection_registry.h @@ -19,6 +19,7 @@ limitations under the License. #include #include +#include "tensorflow/core/framework/summary.pb.h" #include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/lib/monitoring/collected_metrics.h" #include "tensorflow/core/lib/monitoring/metric_def.h" @@ -217,6 +218,14 @@ inline void CollectValue(const int64& value, Point* const point) { point->int64_value = value; } +template <> +inline void CollectValue(const HistogramProto& value, Point* const point) { + point->value_type = ValueType::kHistogram; + // This is inefficient. If and when we hit snags, we can change the API to do + // this more efficiently. + point->histogram_value = value; +} + // Used by the CollectionRegistry class to collect all the values of all the // metrics in the registry. This is an implementation detail of the // CollectionRegistry class, please do not depend on this. diff --git a/tensorflow/core/lib/monitoring/collection_registry_test.cc b/tensorflow/core/lib/monitoring/collection_registry_test.cc index 04a4879da47..34a480b07db 100644 --- a/tensorflow/core/lib/monitoring/collection_registry_test.cc +++ b/tensorflow/core/lib/monitoring/collection_registry_test.cc @@ -16,6 +16,7 @@ limitations under the License. 
#include "tensorflow/core/lib/monitoring/collection_registry.h" #include "tensorflow/core/lib/monitoring/counter.h" +#include "tensorflow/core/lib/monitoring/sampler.h" #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/test.h" @@ -23,6 +24,8 @@ limitations under the License. namespace tensorflow { namespace monitoring { +using histogram::Histogram; + namespace test_util { class CollectionRegistryTestAccess { @@ -42,7 +45,7 @@ TEST(CollectionRegistryTest, RegistrationUnregistration) { auto* collection_registry = CollectionRegistry::Default(); const MetricDef metric_def0( "/tensorflow/metric0", "An example metric with no labels."); - const MetricDef metric_def1( + const MetricDef metric_def1( "/tensorflow/metric1", "An example metric with one label.", "LabelName"); { @@ -173,6 +176,112 @@ TEST(CollectMetricsTest, Counter) { } } +void EqHistograms(const Histogram& expected, + const HistogramProto& actual_proto) { + Histogram actual; + ASSERT_TRUE(actual.DecodeFromProto(actual_proto)); + + EXPECT_EQ(expected.ToString(), actual.ToString()); +} + +TEST(CollectMetricsTest, Sampler) { + auto sampler_with_labels = std::unique_ptr>( + Sampler<2>::New({"/tensorflow/test/sampler_with_labels", + "Sampler with labels.", "MyLabel0", "MyLabel1"}, + {1.0, 2.0})); + auto sampler_without_labels = std::unique_ptr>(Sampler<0>::New( + {"/tensorflow/test/sampler_without_labels", "Sampler without labels."}, + {0.0})); + + Histogram with_labels0({1.0, 2.0, DBL_MAX}); + sampler_with_labels->GetCell("Label00", "Label10")->Add(0.7); + with_labels0.Add(0.7); + + Histogram with_labels1({1.0, 2.0, DBL_MAX}); + sampler_with_labels->GetCell("Label01", "Label11")->Add(1.5); + with_labels1.Add(1.5); + + Histogram without_labels({0.0, DBL_MAX}); + sampler_without_labels->GetCell()->Add(0.5); + without_labels.Add(0.5); + + for (const bool collect_metric_descriptors : {true, false}) { + 
SCOPED_TRACE(strings::StrCat("collect_metric_descriptors: ", + collect_metric_descriptors)); + + auto* collection_registry = CollectionRegistry::Default(); + CollectionRegistry::CollectMetricsOptions options; + options.collect_metric_descriptors = collect_metric_descriptors; + const std::unique_ptr collected_metrics = + collection_registry->CollectMetrics(options); + + if (collect_metric_descriptors) { + ASSERT_EQ(2, collected_metrics->metric_descriptor_map.size()); + + const MetricDescriptor& ld = *collected_metrics->metric_descriptor_map.at( + "/tensorflow/test/sampler_with_labels"); + EXPECT_EQ("/tensorflow/test/sampler_with_labels", ld.name); + EXPECT_EQ("Sampler with labels.", ld.description); + ASSERT_EQ(2, ld.label_names.size()); + EXPECT_EQ("MyLabel0", ld.label_names[0]); + EXPECT_EQ("MyLabel1", ld.label_names[1]); + EXPECT_EQ(MetricKind::kCumulative, ld.metric_kind); + EXPECT_EQ(ValueType::kHistogram, ld.value_type); + + const MetricDescriptor& ud = *collected_metrics->metric_descriptor_map.at( + "/tensorflow/test/sampler_without_labels"); + EXPECT_EQ("/tensorflow/test/sampler_without_labels", ud.name); + EXPECT_EQ("Sampler without labels.", ud.description); + ASSERT_EQ(0, ud.label_names.size()); + EXPECT_EQ(MetricKind::kCumulative, ud.metric_kind); + EXPECT_EQ(ValueType::kHistogram, ud.value_type); + } else { + EXPECT_EQ(0, collected_metrics->metric_descriptor_map.size()); + } + + ASSERT_EQ(2, collected_metrics->point_set_map.size()); + + const PointSet& lps = *collected_metrics->point_set_map.at( + "/tensorflow/test/sampler_with_labels"); + EXPECT_EQ("/tensorflow/test/sampler_with_labels", lps.metric_name); + ASSERT_EQ(2, lps.points.size()); + ASSERT_EQ(2, lps.points[0]->labels.size()); + EXPECT_EQ("MyLabel0", lps.points[0]->labels[0].name); + EXPECT_EQ("Label00", lps.points[0]->labels[0].value); + EXPECT_EQ("MyLabel1", lps.points[0]->labels[1].name); + EXPECT_EQ("Label10", lps.points[0]->labels[1].value); + EXPECT_EQ(ValueType::kHistogram, 
lps.points[0]->value_type); + EqHistograms(with_labels0, lps.points[0]->histogram_value); + EXPECT_LT(0, lps.points[0]->start_timestamp_millis); + EXPECT_LT(0, lps.points[0]->end_timestamp_millis); + EXPECT_GE(lps.points[0]->end_timestamp_millis, + lps.points[0]->start_timestamp_millis); + ASSERT_EQ(2, lps.points[1]->labels.size()); + EXPECT_EQ("MyLabel0", lps.points[1]->labels[0].name); + EXPECT_EQ("Label01", lps.points[1]->labels[0].value); + EXPECT_EQ("MyLabel1", lps.points[1]->labels[1].name); + EXPECT_EQ("Label11", lps.points[1]->labels[1].value); + EXPECT_EQ(ValueType::kHistogram, lps.points[1]->value_type); + EqHistograms(with_labels1, lps.points[1]->histogram_value); + EXPECT_LT(0, lps.points[1]->start_timestamp_millis); + EXPECT_LT(0, lps.points[1]->end_timestamp_millis); + EXPECT_GE(lps.points[1]->end_timestamp_millis, + lps.points[1]->start_timestamp_millis); + + const PointSet& ups = *collected_metrics->point_set_map.at( + "/tensorflow/test/sampler_without_labels"); + EXPECT_EQ("/tensorflow/test/sampler_without_labels", ups.metric_name); + ASSERT_EQ(1, ups.points.size()); + EXPECT_EQ(0, ups.points[0]->labels.size()); + EXPECT_EQ(ValueType::kHistogram, ups.points[0]->value_type); + EqHistograms(without_labels, ups.points[0]->histogram_value); + EXPECT_LT(0, ups.points[0]->start_timestamp_millis); + EXPECT_LT(0, ups.points[0]->end_timestamp_millis); + EXPECT_GE(ups.points[0]->end_timestamp_millis, + ups.points[0]->start_timestamp_millis); + } +} + // A FakeClockEnv to manually advance time. class FakeClockEnv : public EnvWrapper { public: diff --git a/tensorflow/core/lib/monitoring/counter.h b/tensorflow/core/lib/monitoring/counter.h index e76057b980a..4b84e9d928c 100644 --- a/tensorflow/core/lib/monitoring/counter.h +++ b/tensorflow/core/lib/monitoring/counter.h @@ -155,7 +155,7 @@ CounterCell* Counter::GetCell(const Labels&... 
labels) "Mismatch between Counter and number of labels " "provided in GetCell(...)."); - const LabelArray& label_array = {labels...}; + const LabelArray& label_array = {{labels...}}; mutex_lock l(mu_); const auto found_it = cells_.find(label_array); if (found_it != cells_.end()) { diff --git a/tensorflow/core/lib/monitoring/metric_def.h b/tensorflow/core/lib/monitoring/metric_def.h index 8c7207b829f..116a73823d7 100644 --- a/tensorflow/core/lib/monitoring/metric_def.h +++ b/tensorflow/core/lib/monitoring/metric_def.h @@ -19,11 +19,25 @@ limitations under the License. #include #include +#include "tensorflow/core/framework/summary.pb.h" #include "tensorflow/core/lib/core/stringpiece.h" namespace tensorflow { namespace monitoring { +// The different metric kinds available. +// +// Gauge indicates that the metric's values are instantaneous measurements of a +// (typically) continuously varying quantity. Examples: a process's current heap +// size, a queue's current length. +// +// Cumulative indicates that the metric's values represent non-negative changes +// over specified time periods. Example: the number of rpc calls to a service. +enum class MetricKind : int { kGauge = 0, kCumulative }; + +// The type of the metric values. +enum class ValueType : int { kInt64 = 0, kHistogram }; + // Everything in the internal namespace is implementation details. Do not depend // on this. namespace internal { @@ -46,17 +60,20 @@ class StringLiteral { const StringPiece literal_; }; -} // namespace internal +template +ValueType GetValueType(); -// The different metric kinds available. -// -// Gauge indicates that the metric's values are instantaneous measurements of a -// (typically) continuously varying quantity. Examples: a process's current heap -// size, a queue's current length. -// -// Cumulative indicates that the metric's values represent non-negative changes -// over specified time periods. Example: the number of rpc calls to a service. 
-enum class MetricKind : int { kGauge = 0, kCumulative }; +template <> +inline ValueType GetValueType() { + return ValueType::kInt64; +} + +template <> +inline ValueType GetValueType() { + return ValueType::kHistogram; +} + +} // namespace internal // Abstract base class for a metric definition. // @@ -69,6 +86,8 @@ class AbstractMetricDef { public: MetricKind kind() const { return kind_; } + ValueType value_type() const { return value_type_; } + StringPiece name() const { return name_; } StringPiece description() const { return description_; } @@ -82,16 +101,19 @@ class AbstractMetricDef { friend class MetricDef; AbstractMetricDef( - const MetricKind kind, const internal::StringLiteral name, + const MetricKind kind, const ValueType value_type, + const internal::StringLiteral name, const internal::StringLiteral description, const std::vector& label_descriptions) : kind_(kind), + value_type_(value_type), name_(name), description_(description), - label_descriptions_( - {label_descriptions.begin(), label_descriptions.end()}) {} + label_descriptions_(std::vector( + label_descriptions.begin(), label_descriptions.end())) {} const MetricKind kind_; + const ValueType value_type_; const StringPiece name_; const StringPiece description_; const std::vector label_descriptions_; @@ -108,14 +130,12 @@ class AbstractMetricDef { template class MetricDef : public AbstractMetricDef { public: - using value_type = Value; - template MetricDef(const internal::StringLiteral name, const internal::StringLiteral description, const LabelDesc&... 
label_descriptions) - : AbstractMetricDef(metric_kind, name, description, - {label_descriptions...}) { + : AbstractMetricDef(metric_kind, internal::GetValueType(), name, + description, {label_descriptions...}) { static_assert(sizeof...(LabelDesc) == NumLabels, "Mismatch between Counter and number of label " "descriptions."); diff --git a/tensorflow/core/lib/monitoring/metric_def_test.cc b/tensorflow/core/lib/monitoring/metric_def_test.cc index 237be6f48c5..dc07a08e4fe 100644 --- a/tensorflow/core/lib/monitoring/metric_def_test.cc +++ b/tensorflow/core/lib/monitoring/metric_def_test.cc @@ -24,7 +24,7 @@ namespace { TEST(MetricDefTest, Simple) { const MetricDef metric_def0( "/tensorflow/metric0", "An example metric with no labels."); - const MetricDef metric_def1( + const MetricDef metric_def1( "/tensorflow/metric1", "An example metric with one label.", "LabelName"); EXPECT_EQ("/tensorflow/metric0", metric_def0.name()); diff --git a/tensorflow/core/lib/monitoring/sampler.h b/tensorflow/core/lib/monitoring/sampler.h index 9a08437bfdf..3932f8d1a72 100644 --- a/tensorflow/core/lib/monitoring/sampler.h +++ b/tensorflow/core/lib/monitoring/sampler.h @@ -28,13 +28,12 @@ limitations under the License. #include "tensorflow/core/framework/summary.pb.h" #include "tensorflow/core/lib/histogram/histogram.h" +#include "tensorflow/core/lib/monitoring/collection_registry.h" #include "tensorflow/core/lib/monitoring/metric_def.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/thread_annotations.h" -// TODO(vinuraja): Not ready yet. The collection part has to be plumbed in. - namespace tensorflow { namespace monitoring { @@ -68,9 +67,10 @@ class SamplerCell { // A stateful class for updating a cumulative histogram metric. // -// This class encapsulates a set of values (or a single value for a label-less -// metric). Each value is identified by a tuple of labels. 
The class allows the -// user to increment each value. +// This class encapsulates a set of histograms (or a single histogram for a +// label-less metric) configured with a list of increasing bucket boundaries. +// Each histogram is identified by a tuple of labels. The class allows the user +// to add a sample to each histogram value. // // Sampler allocates storage and maintains a cell for each value. You can // retrieve an individual cell using a label-tuple and update it separately. @@ -81,7 +81,10 @@ class SamplerCell { template class Sampler { public: - ~Sampler() {} + ~Sampler() { + // Deleted here, before the metric_def is destroyed. + registration_handle_.reset(); + } // Creates the metric based on the metric-definition arguments. // @@ -110,7 +113,17 @@ class Sampler { Sampler(const MetricDef& metric_def, const std::vector& bucket_limits) - : metric_def_(metric_def), bucket_limits_(bucket_limits) {} + : metric_def_(metric_def), + bucket_limits_(bucket_limits), + registration_handle_(CollectionRegistry::Default()->Register( + &metric_def_, [&](MetricCollectorGetter getter) { + auto metric_collector = getter.Get(&metric_def_); + + mutex_lock l(mu_); + for (const auto& cell : cells_) { + metric_collector.CollectValue(cell.first, cell.second.value()); + } + })) {} mutable mutex mu_; @@ -122,6 +135,9 @@ class Sampler { // Bucket limits for the histograms in the cells. const std::vector bucket_limits_; + // Registration handle with the CollectionRegistry. + std::unique_ptr registration_handle_; + // We use a std::map here because we give out pointers to the SamplerCells, // which need to remain valid even after more cells. using LabelArray = std::array; @@ -171,7 +187,7 @@ SamplerCell* Sampler::GetCell(const Labels&... 
labels) "Mismatch between Sampler and number of labels " "provided in GetCell(...)."); - const LabelArray& label_array = {labels...}; + const LabelArray& label_array = {{labels...}}; mutex_lock l(mu_); const auto found_it = cells_.find(label_array); if (found_it != cells_.end()) { diff --git a/tensorflow/core/lib/monitoring/sampler_test.cc b/tensorflow/core/lib/monitoring/sampler_test.cc index b018d020da9..27e1ccca3c9 100644 --- a/tensorflow/core/lib/monitoring/sampler_test.cc +++ b/tensorflow/core/lib/monitoring/sampler_test.cc @@ -23,10 +23,10 @@ namespace { using histogram::Histogram; -static void EqHistograms(const histogram::Histogram& expected, - const HistogramProto& actual_proto) { - histogram::Histogram actual; - EXPECT_TRUE(actual.DecodeFromProto(actual_proto)); +void EqHistograms(const Histogram& expected, + const HistogramProto& actual_proto) { + Histogram actual; + ASSERT_TRUE(actual.DecodeFromProto(actual_proto)); EXPECT_EQ(expected.ToString(), actual.ToString()); } diff --git a/tensorflow/core/lib/strings/numbers.cc b/tensorflow/core/lib/strings/numbers.cc index 4df0f54378e..fc07bd446c1 100644 --- a/tensorflow/core/lib/strings/numbers.cc +++ b/tensorflow/core/lib/strings/numbers.cc @@ -80,16 +80,12 @@ T locale_independent_strtonum(const char* str, const char** endptr) { // Set to result to what strto{f,d} functions would have returned. If the // number was outside the range, the stringstream sets the fail flag, but // returns the +/-max() value, whereas strto{f,d} functions return +/-INF. 
- bool real_fail = false; if (s.fail()) { - real_fail = true; if (result == std::numeric_limits::max()) { result = std::numeric_limits::infinity(); - real_fail = false; s.clear(s.rdstate() & ~std::ios::failbit); } else if (result == -std::numeric_limits::max()) { result = -std::numeric_limits::infinity(); - real_fail = false; s.clear(s.rdstate() & ~std::ios::failbit); } } @@ -97,10 +93,9 @@ T locale_independent_strtonum(const char* str, const char** endptr) { if (endptr) { *endptr = str + - (real_fail - ? static_cast(0) - : (s.eof() ? static_cast(strlen(str)) - : s.tellg())); + (s.fail() ? static_cast(0) + : (s.eof() ? static_cast(strlen(str)) + : s.tellg())); } return result; } diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc index 33695451dba..6e076a092e1 100644 --- a/tensorflow/core/ops/array_ops.cc +++ b/tensorflow/core/ops/array_ops.cc @@ -114,48 +114,49 @@ Status SetOutputShapeForReshape(InferenceContext* c) { ShapeHandle out; TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(1, &out)); - // If the rank and all dimensions of the input tensor are known, we may - // infer missing shape information or perform shape checks. - // NumElements conveniently returns kUnknownDim upon missing rank or - // dimension information. - // Additionally, if the rank of the out shape is unknown we have no shape - // information to go off of. + if (!c->RankKnown(out)) { + // We have no information about the shape of the output. + c->set_output(0, out); + return Status::OK(); + } DimensionHandle num_in_elems = c->NumElements(in); - DimensionHandle num_out_elems = c->NumElements(out); - if (!c->ValueKnown(num_in_elems) || !c->RankKnown(out)) { - // Do nothing. We have no shape information to infer from so we directly - // return out as our shape. - } else if (c->ValueKnown(num_out_elems)) { - // If we know the number of output elements, we ensure that they - // are equal to the number of input elements. 
- if (c->Value(num_in_elems) != c->Value(num_out_elems)) { + if (c->FullyDefined(out)) { + DimensionHandle num_out_elems = c->NumElements(out); + if (c->ValueKnown(num_in_elems) && + c->Value(num_in_elems) != c->Value(num_out_elems)) { return errors::InvalidArgument( "Cannot reshape a tensor with ", c->DebugString(num_in_elems), " elements to shape ", c->DebugString(out), " (", c->DebugString(num_out_elems), " elements)"); } - } else { - // If we don't know the number of output elements, we can infer + c->set_output(0, out); + return Status::OK(); + } + + if (c->ValueKnown(num_in_elems)) { + // We don't know the number of output elements, but we can try to infer // the missing dimension. int32 unknown_idx = -1; + bool too_many_unknown = false; DimensionHandle known_elems = c->MakeDim(1); for (int32 i = 0; i < c->Rank(out); ++i) { DimensionHandle dim = c->Dim(out, i); if (!c->ValueKnown(dim)) { if (unknown_idx >= 0) { - return errors::InvalidArgument( - "Cannot infer multiple unknown dimensions in shape ", - c->DebugString(out)); + too_many_unknown = true; + break; } unknown_idx = i; } else { TF_RETURN_IF_ERROR(c->Multiply(known_elems, dim, &known_elems)); } } - DimensionHandle inferred_dim; - TF_RETURN_IF_ERROR(c->Divide(num_in_elems, c->Value(known_elems), - true /* evenly_divisible */, &inferred_dim)); - TF_RETURN_IF_ERROR(c->ReplaceDim(out, unknown_idx, inferred_dim, &out)); + if (!too_many_unknown) { + DimensionHandle inferred_dim; + TF_RETURN_IF_ERROR(c->Divide(num_in_elems, c->Value(known_elems), + true /* evenly_divisible */, &inferred_dim)); + TF_RETURN_IF_ERROR(c->ReplaceDim(out, unknown_idx, inferred_dim, &out)); + } } c->set_output(0, out); @@ -2477,11 +2478,10 @@ REGISTER_OP("Placeholder") PartialTensorShape shape; TF_RETURN_IF_ERROR(c->GetAttr("shape", &shape)); - // Placeholder has a legacy bug where we cannot tell - // the difference between a scalar shape attribute and - // 'unknown shape'. 
So if the shape is a scalar, we return - // an unknown shape. - if (shape.dims() == 0) { + // Placeholder has legacy behavior where we cannot tell the difference + // between a scalar shape attribute and 'unknown shape'. So if the shape + // is a scalar, we return an unknown shape. + if (shape.dims() <= 0) { return shape_inference::UnknownShape(c); } @@ -4382,6 +4382,117 @@ output_min: This value is copied from input_min. output_max: This value is copied from input_max. )Doc"); +REGISTER_OP("FakeQuantWithMinMaxArgs") + .Attr("min: float = -6.0") + .Attr("max: float = 6.0") + .Input("inputs: float") + .Output("outputs: float") + .Doc(R"doc( +Fake-quantize the 'inputs' tensor, type float to 'outputs' tensor of same type. + +Attributes [min; max] define the clamping range for the 'inputs' data. Op +divides this range into 255 steps (total of 256 values), then replaces each +'inputs' value with the closest of the quantized step values. + +Quantization is called fake since the output is still in floating point. +)doc"); + +REGISTER_OP("FakeQuantWithMinMaxArgsGradient") + .Attr("min: float = -6.0") + .Attr("max: float = 6.0") + .Input("gradients: float") + .Input("inputs: float") + .Output("backprops: float") + .Doc(R"doc( +Compute gradients for a FakeQuantWithMinMaxArgs operation. + +gradients: Backpropagated gradients above the FakeQuantWithMinMaxArgs operation. +inputs: Values passed as inputs to the FakeQuantWithMinMaxArgs operation. +backprops: Backpropagated gradients below the FakeQuantWithMinMaxArgs operation: + `gradients * (inputs >= min && inputs <= max)`. +)doc"); + +REGISTER_OP("FakeQuantWithMinMaxVars") + .Input("inputs: float") + .Input("min: float") + .Input("max: float") + .Output("outputs: float") + .Doc(R"doc( +Fake-quantize the 'inputs' tensor of type float and shape `[b, h, w, d]` via +global float scalars `min` and `max` to 'outputs' tensor of same shape as +`inputs`. + +[min; max] is the clamping range for the 'inputs' data. 
Op divides this range +into 255 steps (total of 256 values), then replaces each 'inputs' value with the +closest of the quantized step values. + +This operation has a gradient and thus allows for training `min` and `max` values. +)doc"); + +REGISTER_OP("FakeQuantWithMinMaxVarsGradient") + .Input("gradients: float") + .Input("inputs: float") + .Input("min: float") + .Input("max: float") + .Output("backprops_wrt_input: float") + .Output("backprop_wrt_min: float") + .Output("backprop_wrt_max: float") + .Doc(R"doc( +Compute gradients for a FakeQuantWithMinMaxVars operation. + +gradients: Backpropagated gradients above the FakeQuantWithMinMaxVars operation. +inputs: Values passed as inputs to the FakeQuantWithMinMaxVars operation. +min, max: Quantization interval, scalar floats. +backprops_wrt_input: Backpropagated gradients w.r.t. inputs: + `gradients * (inputs >= min && inputs <= max)`. +backprop_wrt_min: Backpropagated gradients w.r.t. min parameter: + `sum(gradients * (inputs < min))`. +backprop_wrt_max: Backpropagated gradients w.r.t. max parameter: + `sum(gradients * (inputs > max))`. +)doc"); + +REGISTER_OP("FakeQuantWithMinMaxVarsPerChannel") + .Input("inputs: float") + .Input("min: float") + .Input("max: float") + .Output("outputs: float") + .Doc(R"doc( +Fake-quantize the 'inputs' tensor of type float and one of the shapes: `[d]`, +`[b, d]` `[b, h, w, d]` via per-channel floats `min` and `max` of shape `[d]` +to 'outputs' tensor of same shape as `inputs`. + +[min; max] is the clamping range for the 'inputs' data in the corresponding +depth channel. Op divides this range into 255 steps (total of 256 values), then +replaces each 'inputs' value with the closest of the quantized step values. + +This operation has a gradient and thus allows for training `min` and `max` values. 
+)doc"); + +REGISTER_OP("FakeQuantWithMinMaxVarsPerChannelGradient") + .Input("gradients: float") + .Input("inputs: float") + .Input("min: float") + .Input("max: float") + .Output("backprops_wrt_input: float") + .Output("backprop_wrt_min: float") + .Output("backprop_wrt_max: float") + .Doc(R"doc( +Compute gradients for a FakeQuantWithMinMaxVarsPerChannel operation. + +gradients: Backpropagated gradients above the FakeQuantWithMinMaxVars operation, + shape one of: `[d]`, `[b, d]`, `[b, h, w, d]`. +inputs: Values passed as inputs to the FakeQuantWithMinMaxVars operation, shape + same as `gradients`. +min, max: Quantization interval, floats of shape `[d]`. +backprops_wrt_input: Backpropagated gradients w.r.t. inputs, shape same as + `inputs`: + `gradients * (inputs >= min && inputs <= max)`. +backprop_wrt_min: Backpropagated gradients w.r.t. min parameter, shape `[d]`: + `sum_per_d(gradients * (inputs < min))`. +backprop_wrt_max: Backpropagated gradients w.r.t. max parameter, shape `[d]`: + `sum_per_d(gradients * (inputs > max))`. +)doc"); + // Deprecated op registrations: // The following can be deleted after 10mar2017. diff --git a/tensorflow/core/ops/array_ops_test.cc b/tensorflow/core/ops/array_ops_test.cc index 71491e8d669..8679739b70c 100644 --- a/tensorflow/core/ops/array_ops_test.cc +++ b/tensorflow/core/ops/array_ops_test.cc @@ -693,8 +693,7 @@ TEST(ArrayOpsTest, Reshape_ShapeFn) { "[7];[2]"); // Multiple missing dimensions cannot be inferred. new_shape = test::AsTensor({-1, -1, 2}); - INFER_ERROR("Cannot infer multiple unknown dimensions in shape [?,?,2]", op, - "[8];[3]"); + INFER_OK(op, "[8];[3]", "[?,?,2]"); // Reshaping to a scalar. 
new_shape = test::AsTensor({}); diff --git a/tensorflow/core/ops/compat/ops_history.v0.pbtxt b/tensorflow/core/ops/compat/ops_history.v0.pbtxt index fac856d6602..b5b056e41f6 100644 --- a/tensorflow/core/ops/compat/ops_history.v0.pbtxt +++ b/tensorflow/core/ops/compat/ops_history.v0.pbtxt @@ -11709,6 +11709,160 @@ op { type: DT_STRING } } +op { + name: "FakeQuantWithMinMaxArgs" + input_arg { + name: "inputs" + type: DT_FLOAT + } + output_arg { + name: "outputs" + type: DT_FLOAT + } + attr { + name: "min" + type: "float" + default_value { + f: -6 + } + } + attr { + name: "max" + type: "float" + default_value { + f: 6 + } + } +} +op { + name: "FakeQuantWithMinMaxArgsGradient" + input_arg { + name: "gradients" + type: DT_FLOAT + } + input_arg { + name: "inputs" + type: DT_FLOAT + } + output_arg { + name: "backprops" + type: DT_FLOAT + } + attr { + name: "min" + type: "float" + default_value { + f: -6 + } + } + attr { + name: "max" + type: "float" + default_value { + f: 6 + } + } +} +op { + name: "FakeQuantWithMinMaxVars" + input_arg { + name: "inputs" + type: DT_FLOAT + } + input_arg { + name: "min" + type: DT_FLOAT + } + input_arg { + name: "max" + type: DT_FLOAT + } + output_arg { + name: "outputs" + type: DT_FLOAT + } +} +op { + name: "FakeQuantWithMinMaxVarsGradient" + input_arg { + name: "gradients" + type: DT_FLOAT + } + input_arg { + name: "inputs" + type: DT_FLOAT + } + input_arg { + name: "min" + type: DT_FLOAT + } + input_arg { + name: "max" + type: DT_FLOAT + } + output_arg { + name: "backprops_wrt_input" + type: DT_FLOAT + } + output_arg { + name: "backprop_wrt_min" + type: DT_FLOAT + } + output_arg { + name: "backprop_wrt_max" + type: DT_FLOAT + } +} +op { + name: "FakeQuantWithMinMaxVarsPerChannel" + input_arg { + name: "inputs" + type: DT_FLOAT + } + input_arg { + name: "min" + type: DT_FLOAT + } + input_arg { + name: "max" + type: DT_FLOAT + } + output_arg { + name: "outputs" + type: DT_FLOAT + } +} +op { + name: 
"FakeQuantWithMinMaxVarsPerChannelGradient" + input_arg { + name: "gradients" + type: DT_FLOAT + } + input_arg { + name: "inputs" + type: DT_FLOAT + } + input_arg { + name: "min" + type: DT_FLOAT + } + input_arg { + name: "max" + type: DT_FLOAT + } + output_arg { + name: "backprops_wrt_input" + type: DT_FLOAT + } + output_arg { + name: "backprop_wrt_min" + type: DT_FLOAT + } + output_arg { + name: "backprop_wrt_max" + type: DT_FLOAT + } +} op { name: "Fill" input_arg { @@ -22466,6 +22620,42 @@ op { } } } +op { + name: "RequantizationRange" + input_arg { + name: "input" + type_attr: "Tinput" + } + input_arg { + name: "input_min" + type: DT_FLOAT + } + input_arg { + name: "input_max" + type: DT_FLOAT + } + output_arg { + name: "output_min" + type: DT_FLOAT + } + output_arg { + name: "output_max" + type: DT_FLOAT + } + attr { + name: "Tinput" + type: "type" + allowed_values { + list { + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT16 + type: DT_QUINT16 + type: DT_QINT32 + } + } + } +} op { name: "Requantize" input_arg { diff --git a/tensorflow/core/ops/data_flow_ops.cc b/tensorflow/core/ops/data_flow_ops.cc index d1f6d9ff0ae..3c13ca2bfbf 100644 --- a/tensorflow/core/ops/data_flow_ops.cc +++ b/tensorflow/core/ops/data_flow_ops.cc @@ -629,6 +629,10 @@ REGISTER_OP("SparseConditionalAccumulator") .Attr("container: string = ''") .Attr("shared_name: string = ''") .SetIsStateful() + .SetShapeFn([](InferenceContext* c) { + c->set_output(0, c->Vector(2)); + return Status::OK(); + }) .Doc(R"doc( A conditional accumulator for aggregating sparse gradients. 
The accumulator accepts gradients marked with local_step greater or equal to the most recent @@ -654,6 +658,11 @@ REGISTER_OP("SparseAccumulatorApplyGradient") .Input("gradient_shape: int64") .Attr("dtype: numbertype") .Attr("has_known_shape: bool") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); + return Status::OK(); + }) .Doc(R"doc( Applies a sparse gradient to a given accumulator. Does not add if local_step is lesser than the accumulator's global_step. @@ -679,6 +688,14 @@ REGISTER_OP("SparseAccumulatorTakeGradient") .Output("values: dtype") .Output("shape: int64") .Attr("dtype: numbertype") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); + // Shape of output is the shape of the accumulator referenced + // by 'handle', but which is not available here, so we lose + // shape information. + return shape_inference::UnknownShape(c); + }) .Doc(R"doc( Extracts the average sparse gradient in the given SparseConditionalAccumulator, provided that sufficient (i.e., more than num_required) gradients have been diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc index 3390e3661d6..8d3d9310a4d 100644 --- a/tensorflow/core/ops/math_ops.cc +++ b/tensorflow/core/ops/math_ops.cc @@ -2298,6 +2298,35 @@ out_type: The type of the output. Should be a lower bit depth than Tinput. 
)doc"); +REGISTER_OP("RequantizationRange") + .Input("input: Tinput") + .Input("input_min: float") + .Input("input_max: float") + .Output("output_min: float") + .Output("output_max: float") + .Attr("Tinput: quantizedtype") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); + c->set_output(0, c->Scalar()); + c->set_output(1, c->Scalar()); + return Status::OK(); + }) + .Doc(R"doc( +Given a quantized tensor described by (input, input_min, input_max), outputs a +range that covers the actual values present in that tensor. This op is +typically used to produce the requested_output_min and requested_output_max for +Requantize. + +input_min: The float value that the minimum quantized input value represents. +input_max: The float value that the maximum quantized input value represents. +Tinput: The type of the input. +output_min: The computed min output. +output_max: the computed max output. + +)doc"); + // Deprecated ops: REGISTER_OP("BatchFFT") .Input("input: complex64") diff --git a/tensorflow/core/ops/math_ops_test.cc b/tensorflow/core/ops/math_ops_test.cc index d5f56d7a174..79ae187342b 100644 --- a/tensorflow/core/ops/math_ops_test.cc +++ b/tensorflow/core/ops/math_ops_test.cc @@ -462,4 +462,15 @@ TEST(MathOpsTest, Requantize_ShapeFn) { INFER_ERROR("must be rank 0", op, "?;?;?;?;[4]"); } +TEST(MathOpstest, RequantizationRange_ShapeFn) { + ShapeInferenceTestOp op("RequantizationRange"); + + INFER_OK(op, "?;?;?", "[];[]"); + INFER_OK(op, "?;[];[]", "[];[]"); + + // Rank checks on input scalars. 
+ INFER_ERROR("must be rank 0", op, "?;[1];?"); + INFER_ERROR("must be rank 0", op, "?;?;[2]"); +} + } // end namespace tensorflow diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt index c8abfc04eb4..7a57f917e0a 100644 --- a/tensorflow/core/ops/ops.pbtxt +++ b/tensorflow/core/ops/ops.pbtxt @@ -6710,6 +6710,182 @@ op { } summary: "Output a fact about factorials." } +op { + name: "FakeQuantWithMinMaxArgs" + input_arg { + name: "inputs" + type: DT_FLOAT + } + output_arg { + name: "outputs" + type: DT_FLOAT + } + attr { + name: "min" + type: "float" + default_value { + f: -6 + } + } + attr { + name: "max" + type: "float" + default_value { + f: 6 + } + } + summary: "Fake-quantize the \'inputs\' tensor, type float to \'outputs\' tensor of same type." + description: "Attributes [min; max] define the clamping range for the \'inputs\' data. Op\ndivides this range into 255 steps (total of 256 values), then replaces each\n\'inputs\' value with the closest of the quantized step values.\n\nQuantization is called fake since the output is still in floating point." +} +op { + name: "FakeQuantWithMinMaxArgsGradient" + input_arg { + name: "gradients" + description: "Backpropagated gradients above the FakeQuantWithMinMaxArgs operation." + type: DT_FLOAT + } + input_arg { + name: "inputs" + description: "Values passed as inputs to the FakeQuantWithMinMaxArgs operation." + type: DT_FLOAT + } + output_arg { + name: "backprops" + description: "Backpropagated gradients below the FakeQuantWithMinMaxArgs operation:\n`gradients * (inputs >= min && inputs <= max)`." + type: DT_FLOAT + } + attr { + name: "min" + type: "float" + default_value { + f: -6 + } + } + attr { + name: "max" + type: "float" + default_value { + f: 6 + } + } + summary: "Compute gradients for a FakeQuantWithMinMaxArgs operation." 
+} +op { + name: "FakeQuantWithMinMaxVars" + input_arg { + name: "inputs" + type: DT_FLOAT + } + input_arg { + name: "min" + type: DT_FLOAT + } + input_arg { + name: "max" + type: DT_FLOAT + } + output_arg { + name: "outputs" + type: DT_FLOAT + } + summary: "Fake-quantize the \'inputs\' tensor of type float and shape `[b, h, w, d]` via" + description: "global float scalars `min` and `max` to \'outputs\' tensor of same shape as\n`inputs`.\n\n[min; max] is the clamping range for the \'inputs\' data. Op divides this range\ninto 255 steps (total of 256 values), then replaces each \'inputs\' value with the\nclosest of the quantized step values.\n\nThis operation has a gradient and thus allows for training `min` and `max` values." +} +op { + name: "FakeQuantWithMinMaxVarsGradient" + input_arg { + name: "gradients" + description: "Backpropagated gradients above the FakeQuantWithMinMaxVars operation." + type: DT_FLOAT + } + input_arg { + name: "inputs" + description: "Values passed as inputs to the FakeQuantWithMinMaxVars operation.\nmin, max: Quantization interval, scalar floats." + type: DT_FLOAT + } + input_arg { + name: "min" + type: DT_FLOAT + } + input_arg { + name: "max" + type: DT_FLOAT + } + output_arg { + name: "backprops_wrt_input" + description: "Backpropagated gradients w.r.t. inputs:\n`gradients * (inputs >= min && inputs <= max)`." + type: DT_FLOAT + } + output_arg { + name: "backprop_wrt_min" + description: "Backpropagated gradients w.r.t. min parameter:\n`sum(gradients * (inputs < min))`." + type: DT_FLOAT + } + output_arg { + name: "backprop_wrt_max" + description: "Backpropagated gradients w.r.t. max parameter:\n`sum(gradients * (inputs > max))`." + type: DT_FLOAT + } + summary: "Compute gradients for a FakeQuantWithMinMaxVars operation." 
+} +op { + name: "FakeQuantWithMinMaxVarsPerChannel" + input_arg { + name: "inputs" + type: DT_FLOAT + } + input_arg { + name: "min" + type: DT_FLOAT + } + input_arg { + name: "max" + type: DT_FLOAT + } + output_arg { + name: "outputs" + type: DT_FLOAT + } + summary: "Fake-quantize the \'inputs\' tensor of type float and one of the shapes: `[d]`," + description: "`[b, d]` `[b, h, w, d]` via per-channel floats `min` and `max` of shape `[d]`\nto \'outputs\' tensor of same shape as `inputs`.\n\n[min; max] is the clamping range for the \'inputs\' data in the corresponding\ndepth channel. Op divides this range into 255 steps (total of 256 values), then\nreplaces each \'inputs\' value with the closest of the quantized step values.\n\nThis operation has a gradient and thus allows for training `min` and `max` values." +} +op { + name: "FakeQuantWithMinMaxVarsPerChannelGradient" + input_arg { + name: "gradients" + description: "Backpropagated gradients above the FakeQuantWithMinMaxVars operation,\nshape one of: `[d]`, `[b, d]`, `[b, h, w, d]`." + type: DT_FLOAT + } + input_arg { + name: "inputs" + description: "Values passed as inputs to the FakeQuantWithMinMaxVars operation, shape\n same as `gradients`.\nmin, max: Quantization interval, floats of shape `[d]`." + type: DT_FLOAT + } + input_arg { + name: "min" + type: DT_FLOAT + } + input_arg { + name: "max" + type: DT_FLOAT + } + output_arg { + name: "backprops_wrt_input" + description: "Backpropagated gradients w.r.t. inputs, shape same as\n`inputs`:\n `gradients * (inputs >= min && inputs <= max)`." + type: DT_FLOAT + } + output_arg { + name: "backprop_wrt_min" + description: "Backpropagated gradients w.r.t. min parameter, shape `[d]`:\n`sum_per_d(gradients * (inputs < min))`." + type: DT_FLOAT + } + output_arg { + name: "backprop_wrt_max" + description: "Backpropagated gradients w.r.t. max parameter, shape `[d]`:\n`sum_per_d(gradients * (inputs > max))`." 
+ type: DT_FLOAT + } + summary: "Compute gradients for a FakeQuantWithMinMaxVarsPerChannel operation." +} op { name: "Fill" input_arg { @@ -14090,6 +14266,49 @@ op { } summary: "Computes rectified linear gradients for a Relu operation." } +op { + name: "RequantizationRange" + input_arg { + name: "input" + type_attr: "Tinput" + } + input_arg { + name: "input_min" + description: "The float value that the minimum quantized input value represents." + type: DT_FLOAT + } + input_arg { + name: "input_max" + description: "The float value that the maximum quantized input value represents." + type: DT_FLOAT + } + output_arg { + name: "output_min" + description: "The computed min output." + type: DT_FLOAT + } + output_arg { + name: "output_max" + description: "the computed max output." + type: DT_FLOAT + } + attr { + name: "Tinput" + type: "type" + description: "The type of the input." + allowed_values { + list { + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT16 + type: DT_QUINT16 + type: DT_QINT32 + } + } + } + summary: "Given a quantized tensor described by (input, input_min, input_max), outputs a" + description: "range that covers the actual values present in that tensor. This op is\ntypically used to produce the requested_output_min and requested_output_max for\nRequantize." +} op { name: "Requantize" input_arg { diff --git a/tensorflow/core/ops/state_ops.cc b/tensorflow/core/ops/state_ops.cc index 629a280cc8a..b9ac8b16ffb 100644 --- a/tensorflow/core/ops/state_ops.cc +++ b/tensorflow/core/ops/state_ops.cc @@ -28,7 +28,24 @@ REGISTER_OP("Variable") .Attr("container: string = ''") .Attr("shared_name: string = ''") .SetIsStateful() - .SetShapeFn(shape_inference::UnknownShape) + .SetShapeFn([](InferenceContext* c) { + PartialTensorShape shape; + TF_RETURN_IF_ERROR(c->GetAttr("shape", &shape)); + + // Variable has legacy behavior where we cannot tell the difference + // between a scalar shape attribute and 'unknown shape'. 
So if the shape + // is a scalar, we return an unknown shape. + if (shape.dims() <= 0) { + return shape_inference::UnknownShape(c); + } + + TensorShapeProto shape_proto; + shape.AsProto(&shape_proto); + ShapeHandle out; + TF_RETURN_IF_ERROR(c->MakeShapeFromShapeProto(shape_proto, &out)); + c->set_output(0, out); + return Status::OK(); + }) .Doc(R"doc( Holds state in the form of a tensor that persists across steps. diff --git a/tensorflow/core/ops/state_ops_test.cc b/tensorflow/core/ops/state_ops_test.cc index 586de77edc8..4c1ec67e9cf 100644 --- a/tensorflow/core/ops/state_ops_test.cc +++ b/tensorflow/core/ops/state_ops_test.cc @@ -71,4 +71,30 @@ TEST(StateOpsTest, TemporaryVariable_ShapeFn) { INFER_OK(op, "", "[1,2,3]"); } +TEST(StateOpsTest, Variable_ShapeFn) { + ShapeInferenceTestOp op("Variable"); + TensorShapeProto shape_proto; + + // Unknown rank. + PartialTensorShape().AsProto(&shape_proto); + TF_ASSERT_OK(NodeDefBuilder("test", "Variable") + .Attr("shape", shape_proto) + .Finalize(&op.node_def)); + INFER_OK(op, "", "?"); + + // For historical reasons an empty TensorShapeProto can be either an unknown + // rank or a scalar, so the shape function conservatively says "unknown" + shape_proto.Clear(); + TF_ASSERT_OK(NodeDefBuilder("test", "Variable") + .Attr("shape", shape_proto) + .Finalize(&op.node_def)); + INFER_OK(op, "", "?"); + + // Specified shape. + TensorShape({1, 2, 3}).AsProto(&shape_proto); + TF_ASSERT_OK(NodeDefBuilder("test", "Variable") + .Attr("shape", shape_proto) + .Finalize(&op.node_def)); + INFER_OK(op, "", "[1,2,3]"); +} } // end namespace tensorflow diff --git a/tensorflow/core/platform/cloud/gcs_file_system.cc b/tensorflow/core/platform/cloud/gcs_file_system.cc index 56e89277cc8..6641971ba07 100644 --- a/tensorflow/core/platform/cloud/gcs_file_system.cc +++ b/tensorflow/core/platform/cloud/gcs_file_system.cc @@ -52,6 +52,9 @@ constexpr uint64 kUploadRetryDelayMicros = 1000000L; // The HTTP response code "308 Resume Incomplete". 
constexpr uint64 HTTP_CODE_RESUME_INCOMPLETE = 308; +// The file statistics returned by Stat() for directories. +const FileStatistics DIRECTORY_STAT(0, 0, true); + Status GetTmpFilename(string* filename) { if (!filename) { return errors::Internal("'filename' cannot be nullptr."); @@ -80,19 +83,19 @@ Status ParseGcsPath(StringPiece fname, bool empty_object_ok, string* bucket, StringPiece scheme, bucketp, objectp; ParseURI(fname, &scheme, &bucketp, &objectp); if (scheme != "gs") { - return errors::InvalidArgument( - strings::StrCat("GCS path doesn't start with 'gs://': ", fname)); + return errors::InvalidArgument("GCS path doesn't start with 'gs://': ", + fname); } *bucket = bucketp.ToString(); if (bucket->empty() || *bucket == ".") { - return errors::InvalidArgument( - strings::StrCat("GCS path doesn't contain a bucket name: ", fname)); + return errors::InvalidArgument("GCS path doesn't contain a bucket name: ", + fname); } objectp.Consume("/"); *object = objectp.ToString(); if (!empty_object_ok && object->empty()) { - return errors::InvalidArgument( - strings::StrCat("GCS path doesn't contain an object name: ", fname)); + return errors::InvalidArgument("GCS path doesn't contain an object name: ", + fname); } return Status::OK(); } @@ -128,8 +131,8 @@ Status GetValue(const Json::Value& parent, const string& name, Json::Value* result) { *result = parent.get(name, Json::Value::null); if (*result == Json::Value::null) { - return errors::Internal(strings::StrCat( - "The field '", name, "' was expected in the JSON response.")); + return errors::Internal("The field '", name, + "' was expected in the JSON response."); } return Status::OK(); } @@ -141,8 +144,8 @@ Status GetStringValue(const Json::Value& parent, const string& name, TF_RETURN_IF_ERROR(GetValue(parent, name, &result_value)); if (!result_value.isString()) { return errors::Internal( - strings::StrCat("The field '", name, - "' in the JSON response was expected to be a string.")); + "The field '", name, + "' in 
the JSON response was expected to be a string."); } *result = result_value.asString(); return Status::OK(); @@ -162,8 +165,8 @@ Status GetInt64Value(const Json::Value& parent, const string& name, return Status::OK(); } return errors::Internal( - strings::StrCat("The field '", name, - "' in the JSON response was expected to be a number.")); + "The field '", name, + "' in the JSON response was expected to be a number."); } /// Reads a boolean JSON value with the given name from a parent JSON value. @@ -172,9 +175,9 @@ Status GetBoolValue(const Json::Value& parent, const string& name, Json::Value result_value; TF_RETURN_IF_ERROR(GetValue(parent, name, &result_value)); if (!result_value.isBool()) { - return errors::Internal(strings::StrCat( + return errors::Internal( "The field '", name, - "' in the JSON response was expected to be a boolean.")); + "' in the JSON response was expected to be a boolean."); } *result = result_value.asBool(); return Status::OK(); @@ -233,9 +236,9 @@ class GcsRandomAccessFile : public RandomAccessFile { if (result->size() < n) { // This is not an error per se. The RandomAccessFile interface expects // that Read returns OutOfRange if fewer bytes were read than requested. - return errors::OutOfRange(strings::StrCat("EOF reached, ", result->size(), - " bytes were read out of ", n, - " bytes requested.")); + return errors::OutOfRange("EOF reached, ", result->size(), + " bytes were read out of ", n, + " bytes requested."); } return Status::OK(); } @@ -378,8 +381,8 @@ class GcsWritableFile : public WritableFile { case errors::Code::NOT_FOUND: // GCS docs recommend retrying the whole upload. We're relying on the // RetryingFileSystem to retry the Sync() call. - return errors::Unavailable( - strings::StrCat("Could not upload gs://", bucket_, "/", object_)); + return errors::Unavailable("Could not upload gs://", bucket_, "/", + object_); case errors::Code::UNAVAILABLE: // The upload can be resumed, but GCS docs recommend an exponential // back-off. 
@@ -391,8 +394,7 @@ class GcsWritableFile : public WritableFile { return upload_status; } } - return errors::Aborted( - strings::StrCat("Upload gs://", bucket_, "/", object_, " failed.")); + return errors::Aborted("Upload gs://", bucket_, "/", object_, " failed."); } private: @@ -445,9 +447,9 @@ class GcsWritableFile : public WritableFile { request->Send(), " when initiating an upload to ", GetGcsPath()); *session_uri = request->GetResponseHeader("Location"); if (session_uri->empty()) { - return errors::Internal( - strings::StrCat("Unexpected response from GCS when writing to ", - GetGcsPath(), ": 'Location' header not returned.")); + return errors::Internal("Unexpected response from GCS when writing to ", + GetGcsPath(), + ": 'Location' header not returned."); } return Status::OK(); } @@ -495,15 +497,14 @@ class GcsWritableFile : public WritableFile { std::vector range_parts; if (!str_util::SplitAndParseAsInts(range_piece, '-', &range_parts) || range_parts.size() != 2) { - return errors::Internal(strings::StrCat( - "Unexpected response from GCS when writing ", GetGcsPath(), - ": Range header '", received_range, "' could not be parsed.")); + return errors::Internal("Unexpected response from GCS when writing ", + GetGcsPath(), ": Range header '", + received_range, "' could not be parsed."); } if (range_parts[0] != 0) { - return errors::Internal( - strings::StrCat("Unexpected response from GCS when writing to ", - GetGcsPath(), ": the returned range '", - received_range, "' does not start at zero.")); + return errors::Internal("Unexpected response from GCS when writing to ", + GetGcsPath(), ": the returned range '", + received_range, "' does not start at zero."); } // If GCS returned "Range: 0-10", this means 11 bytes were uploaded. 
*uploaded = range_parts[1] + 1; @@ -655,14 +656,31 @@ bool GcsFileSystem::FileExists(const string& fname) { return false; } if (object.empty()) { - return BucketExists(bucket).ok(); + bool result; + return BucketExists(bucket, &result).ok() && result; } - return ObjectExists(bucket, object).ok() || FolderExists(fname).ok(); + bool result; + return (ObjectExists(bucket, object, &result).ok() && result) || + (FolderExists(fname, &result).ok() && result); } -Status GcsFileSystem::ObjectExists(const string& bucket, const string& object) { - FileStatistics stat; - return StatForObject(bucket, object, &stat); +Status GcsFileSystem::ObjectExists(const string& bucket, const string& object, + bool* result) { + if (!result) { + return errors::Internal("'result' cannot be nullptr."); + } + FileStatistics not_used_stat; + const Status status = StatForObject(bucket, object, ¬_used_stat); + switch (status.code()) { + case errors::Code::OK: + *result = true; + return Status::OK(); + case errors::Code::NOT_FOUND: + *result = false; + return Status::OK(); + default: + return status; + } } Status GcsFileSystem::StatForObject(const string& bucket, const string& object, @@ -707,7 +725,10 @@ Status GcsFileSystem::StatForObject(const string& bucket, const string& object, return Status::OK(); } -Status GcsFileSystem::BucketExists(const string& bucket) { +Status GcsFileSystem::BucketExists(const string& bucket, bool* result) { + if (!result) { + return errors::Internal("'result' cannot be nullptr."); + } string auth_token; TF_RETURN_IF_ERROR(AuthProvider::GetToken(auth_provider_.get(), &auth_token)); @@ -715,15 +736,26 @@ Status GcsFileSystem::BucketExists(const string& bucket) { TF_RETURN_IF_ERROR(request->Init()); request->SetUri(strings::StrCat(kGcsUriBase, "b/", bucket)); request->AddAuthBearerHeader(auth_token); - return request->Send(); + const Status status = request->Send(); + switch (status.code()) { + case errors::Code::OK: + *result = true; + return Status::OK(); + case 
errors::Code::NOT_FOUND: + *result = false; + return Status::OK(); + default: + return status; + } } -Status GcsFileSystem::FolderExists(const string& dirname) { +Status GcsFileSystem::FolderExists(const string& dirname, bool* result) { + if (!result) { + return errors::Internal("'result' cannot be nullptr."); + } std::vector children; TF_RETURN_IF_ERROR(GetChildrenBounded(dirname, 1, &children, true)); - if (children.empty()) { - return errors::NotFound("Folder does not exist."); - } + *result = !children.empty(); return Status::OK(); } @@ -740,8 +772,8 @@ Status GcsFileSystem::GetMatchingPaths(const string& pattern, pattern.substr(0, pattern.find_first_of("*?[\\")); const string& dir = io::Dirname(fixed_prefix).ToString(); if (dir.empty()) { - return errors::InvalidArgument( - strings::StrCat("A GCS pattern doesn't have a bucket name: ", pattern)); + return errors::InvalidArgument("A GCS pattern doesn't have a bucket name: ", + pattern); } std::vector all_files; TF_RETURN_IF_ERROR(GetChildrenBounded(dir, UINT64_MAX, &all_files, true)); @@ -854,9 +886,9 @@ Status GcsFileSystem::GetChildrenBounded(const string& dirname, const string& prefix_str = prefix.asString(); StringPiece relative_path(prefix_str); if (!relative_path.Consume(object_prefix)) { - return errors::Internal(strings::StrCat( + return errors::Internal( "Unexpected response: the returned folder name ", prefix_str, - " doesn't match the prefix ", object_prefix)); + " doesn't match the prefix ", object_prefix); } result->emplace_back(relative_path.ToString()); if (++retrieved_results >= max_results) { @@ -882,18 +914,30 @@ Status GcsFileSystem::Stat(const string& fname, FileStatistics* stat) { } string bucket, object; TF_RETURN_IF_ERROR(ParseGcsPath(fname, true, &bucket, &object)); - if (StatForObject(bucket, object, stat).ok()) { + if (object.empty()) { + bool is_bucket; + TF_RETURN_IF_ERROR(BucketExists(bucket, &is_bucket)); + if (is_bucket) { + *stat = DIRECTORY_STAT; + return Status::OK(); + } + 
return errors::NotFound("The specified bucket ", fname, " was not found."); + } + + const Status status = StatForObject(bucket, object, stat); + if (status.ok()) { return Status::OK(); } - if ((object.empty() && BucketExists(bucket).ok()) || - (!object.empty() && FolderExists(fname).ok())) { - stat->length = 0; - stat->mtime_nsec = 0; - stat->is_directory = true; + if (status.code() != errors::Code::NOT_FOUND) { + return status; + } + bool is_folder; + TF_RETURN_IF_ERROR(FolderExists(fname, &is_folder)); + if (is_folder) { + *stat = DIRECTORY_STAT; return Status::OK(); } - return errors::NotFound( - strings::StrCat("The specified path ", fname, " was not found.")); + return errors::NotFound("The specified path ", fname, " was not found."); } Status GcsFileSystem::DeleteFile(const string& fname) { @@ -917,11 +961,11 @@ Status GcsFileSystem::CreateDir(const string& dirname) { string bucket, object; TF_RETURN_IF_ERROR(ParseGcsPath(dirname, true, &bucket, &object)); if (object.empty()) { - if (BucketExists(bucket).ok()) { - return Status::OK(); - } - return errors::NotFound( - strings::StrCat("The specified bucket ", dirname, " was not found.")); + bool is_bucket; + TF_RETURN_IF_ERROR(BucketExists(bucket, &is_bucket)); + return is_bucket ? Status::OK() + : errors::NotFound("The specified bucket ", dirname, + " was not found."); } // Create a zero-length directory marker object. std::unique_ptr file; @@ -1014,9 +1058,9 @@ Status GcsFileSystem::RenameObject(const string& src, const string& target) { // which requires multiple rewrite calls. // TODO(surkov): implement multi-step rewrites. 
return errors::Unimplemented( - strings::StrCat("Couldn't rename ", src, " to ", target, - ": moving large files between buckets with different " - "locations or storage classes is not supported.")); + "Couldn't rename ", src, " to ", target, + ": moving large files between buckets with different " + "locations or storage classes is not supported."); } TF_RETURN_IF_ERROR(DeleteFile(src)); @@ -1027,21 +1071,26 @@ Status GcsFileSystem::IsDirectory(const string& fname) { string bucket, object; TF_RETURN_IF_ERROR(ParseGcsPath(fname, true, &bucket, &object)); if (object.empty()) { - if (BucketExists(bucket).ok()) { + bool is_bucket; + TF_RETURN_IF_ERROR(BucketExists(bucket, &is_bucket)); + if (is_bucket) { return Status::OK(); } - return errors::NotFound(strings::StrCat("The specified bucket gs://", - bucket, " was not found.")); + return errors::NotFound("The specified bucket gs://", bucket, + " was not found."); } - if (FolderExists(fname).ok()) { + bool is_folder; + TF_RETURN_IF_ERROR(FolderExists(fname, &is_folder)); + if (is_folder) { return Status::OK(); } - if (ObjectExists(bucket, object).ok()) { - return errors::FailedPrecondition( - strings::StrCat("The specified path ", fname, " is not a directory.")); + bool is_object; + TF_RETURN_IF_ERROR(ObjectExists(bucket, object, &is_object)); + if (is_object) { + return errors::FailedPrecondition("The specified path ", fname, + " is not a directory."); } - return errors::NotFound( - strings::StrCat("The specified path ", fname, " was not found.")); + return errors::NotFound("The specified path ", fname, " was not found."); } Status GcsFileSystem::DeleteRecursively(const string& dirname, diff --git a/tensorflow/core/platform/cloud/gcs_file_system.h b/tensorflow/core/platform/cloud/gcs_file_system.h index 618be5934ea..c98a50cc879 100644 --- a/tensorflow/core/platform/cloud/gcs_file_system.h +++ b/tensorflow/core/platform/cloud/gcs_file_system.h @@ -76,9 +76,21 @@ class GcsFileSystem : public FileSystem { int64* 
undeleted_dirs) override; private: - Status BucketExists(const string& bucket); - Status ObjectExists(const string& bucket, const string& object); - Status FolderExists(const string& dirname); + /// \brief Checks if the bucket exists. Returns OK if the check succeeded. + /// + /// 'result' is set if the function returns OK. 'result' cannot be nullptr. + Status BucketExists(const string& bucket, bool* result); + + /// \brief Checks if the object exists. Returns OK if the check succeeded. + /// + /// 'result' is set if the function returns OK. 'result' cannot be nullptr. + Status ObjectExists(const string& bucket, const string& object, bool* result); + + /// \brief Checks if the folder exists. Returns OK if the check succeeded. + /// + /// 'result' is set if the function returns OK. 'result' cannot be nullptr. + Status FolderExists(const string& dirname, bool* result); + Status GetChildrenBounded(const string& dir, uint64 max_results, std::vector* result, bool recursively); /// Retrieves file statistics assuming fname points to a GCS object. diff --git a/tensorflow/core/platform/default/build_config/BUILD b/tensorflow/core/platform/default/build_config/BUILD index fe51a698d15..a63aa4d7a97 100644 --- a/tensorflow/core/platform/default/build_config/BUILD +++ b/tensorflow/core/platform/default/build_config/BUILD @@ -86,6 +86,14 @@ cc_library( ], ) +cc_library( + name = "jpeg", + copts = tf_copts(), + deps = [ + "@jpeg_archive//:jpeg", + ], +) + cc_library( name = "protos_cc", copts = tf_copts(), diff --git a/tensorflow/core/platform/env_test.cc b/tensorflow/core/platform/env_test.cc index 36586d3f822..d3e9e08c46c 100644 --- a/tensorflow/core/platform/env_test.cc +++ b/tensorflow/core/platform/env_test.cc @@ -303,178 +303,4 @@ TEST_F(DefaultEnvTest, RecursivelyCreateDirWithUri) { EXPECT_TRUE(env->FileExists(create_path)); } -// Creates a new TestEnv that uses Env::Default for all basic ops but -// uses the default implementation for the GetMatchingFiles function instead. 
-class TestEnv : public EnvWrapper { - public: - explicit TestEnv(Env* env) : EnvWrapper(env) {} - - ~TestEnv() override = default; -}; - -Env* GetTestEnv() { - static Env* default_env = new TestEnv(Env::Default()); - return default_env; -} - -class InterPlanetaryFileSystem : public NullFileSystem { - public: - Status IsDirectory(const string& dirname) override { - if (dirname == "ipfs://solarsystem" || - dirname == "ipfs://solarsystem/Earth" || - dirname == "ipfs://solarsystem/Jupiter") { - return Status::OK(); - } - return Status(tensorflow::error::FAILED_PRECONDITION, "Not a directory"); - } - - Status GetChildren(const string& dir, std::vector* result) override { - std::vector celestial_bodies; - if (dir == "ipfs://solarsystem") { - celestial_bodies = {"Mercury", "Venus", "Earth", "Mars", - "Jupiter", "Saturn", "Uranus", "Neptune", - ".PlanetX", "Planet0", "Planet1"}; - - } else if (dir == "ipfs://solarsystem/Earth") { - celestial_bodies = {"Moon"}; - } else if (dir == "ipfs://solarsystem/Jupiter") { - celestial_bodies = {"Europa", "Io", "Ganymede"}; - } - result->insert(result->end(), celestial_bodies.begin(), - celestial_bodies.end()); - return Status::OK(); - } -}; - -REGISTER_FILE_SYSTEM_ENV(GetTestEnv(), "ipfs", InterPlanetaryFileSystem); - -class TestEnvTest : public ::testing::Test { - protected: - void SetUp() override { env_->CreateDir(BaseDir()); } - - void TearDown() override { - int64 undeleted_files, undeleted_dirs; - env_->DeleteRecursively(BaseDir(), &undeleted_files, &undeleted_dirs); - } - - // Returns all the matched entries as a comma separated string removing the - // common prefix of BaseDir(). 
- string Match(const string& base_dir, const string& suffix_pattern) { - std::vector results; - Status s = env_->GetMatchingPaths(io::JoinPath(base_dir, suffix_pattern), - &results); - if (!s.ok()) { - return s.ToString(); - } else { - std::vector trimmed_results; - std::sort(results.begin(), results.end()); - for (const string& result : results) { - StringPiece trimmed_result(result); - EXPECT_TRUE(trimmed_result.Consume(base_dir + "/")); - trimmed_results.push_back(trimmed_result); - } - return str_util::Join(trimmed_results, ","); - } - } - - Env* env_ = GetTestEnv(); -}; - -TEST_F(TestEnvTest, IPFS) { - std::vector matched_planets; - TF_EXPECT_OK(env_->GetChildren("ipfs://solarsystem", &matched_planets)); - std::vector planets = {"Mercury", "Venus", "Earth", "Mars", - "Jupiter", "Saturn", "Uranus", "Neptune", - ".PlanetX", "Planet0", "Planet1"}; - int c = 0; - for (auto p : matched_planets) { - EXPECT_EQ(p, planets[c++]); - } -} - -TEST_F(TestEnvTest, MatchNonExistentFile) { - EXPECT_EQ(Match(BaseDir(), "thereisnosuchfile"), ""); -} - -TEST_F(TestEnvTest, MatchSimple) { - // Create a few files. - TF_EXPECT_OK( - WriteStringToFile(env_, io::JoinPath(BaseDir(), "match-00"), "")); - TF_EXPECT_OK( - WriteStringToFile(env_, io::JoinPath(BaseDir(), "match-0a"), "")); - TF_EXPECT_OK( - WriteStringToFile(env_, io::JoinPath(BaseDir(), "match-01"), "")); - TF_EXPECT_OK( - WriteStringToFile(env_, io::JoinPath(BaseDir(), "match-aaa"), "")); - - EXPECT_EQ(Match(BaseDir(), "match-*"), - "match-00,match-01,match-0a,match-aaa"); - EXPECT_EQ(Match(BaseDir(), "match-0[0-9]"), "match-00,match-01"); - EXPECT_EQ(Match(BaseDir(), "match-?[0-9]"), "match-00,match-01"); - EXPECT_EQ(Match(BaseDir(), "match-?a*"), "match-0a,match-aaa"); - EXPECT_EQ(Match(BaseDir(), "match-??"), "match-00,match-01,match-0a"); -} - -TEST_F(TestEnvTest, MatchDirectory) { - // Create some directories. 
- TF_EXPECT_OK( - env_->RecursivelyCreateDir(io::JoinPath(BaseDir(), "match-00/abc"))); - TF_EXPECT_OK( - env_->RecursivelyCreateDir(io::JoinPath(BaseDir(), "match-0a/abc"))); - TF_EXPECT_OK( - env_->RecursivelyCreateDir(io::JoinPath(BaseDir(), "match-01/abc"))); - TF_EXPECT_OK( - env_->RecursivelyCreateDir(io::JoinPath(BaseDir(), "match-aaa/abc"))); - - // Create a few files. - TF_EXPECT_OK( - WriteStringToFile(env_, io::JoinPath(BaseDir(), "match-00/abc/x"), "")); - TF_EXPECT_OK( - WriteStringToFile(env_, io::JoinPath(BaseDir(), "match-0a/abc/x"), "")); - TF_EXPECT_OK( - WriteStringToFile(env_, io::JoinPath(BaseDir(), "match-01/abc/x"), "")); - TF_EXPECT_OK( - WriteStringToFile(env_, io::JoinPath(BaseDir(), "match-aaa/abc/x"), "")); - - EXPECT_EQ(Match(BaseDir(), "match-*/abc/x"), - "match-00/abc/x,match-01/abc/x,match-0a/abc/x,match-aaa/abc/x"); - EXPECT_EQ(Match(BaseDir(), "match-0[0-9]/abc/x"), - "match-00/abc/x,match-01/abc/x"); - EXPECT_EQ(Match(BaseDir(), "match-?[0-9]/abc/x"), - "match-00/abc/x,match-01/abc/x"); - EXPECT_EQ(Match(BaseDir(), "match-?a*/abc/x"), - "match-0a/abc/x,match-aaa/abc/x"); - EXPECT_EQ(Match(BaseDir(), "match-?[^a]/abc/x"), - "match-00/abc/x,match-01/abc/x"); -} - -TEST_F(TestEnvTest, MatchMultipleWildcards) { - // Create some directories. - TF_EXPECT_OK( - env_->RecursivelyCreateDir(io::JoinPath(BaseDir(), "match-00/abc"))); - TF_EXPECT_OK( - env_->RecursivelyCreateDir(io::JoinPath(BaseDir(), "match-01/abc"))); - TF_EXPECT_OK( - env_->RecursivelyCreateDir(io::JoinPath(BaseDir(), "match-02/abc"))); - - // Create a few files. 
- TF_EXPECT_OK( - WriteStringToFile(env_, io::JoinPath(BaseDir(), "match-00/abc/00"), "")); - TF_EXPECT_OK( - WriteStringToFile(env_, io::JoinPath(BaseDir(), "match-00/abc/01"), "")); - TF_EXPECT_OK( - WriteStringToFile(env_, io::JoinPath(BaseDir(), "match-00/abc/09"), "")); - TF_EXPECT_OK( - WriteStringToFile(env_, io::JoinPath(BaseDir(), "match-01/abc/00"), "")); - TF_EXPECT_OK( - WriteStringToFile(env_, io::JoinPath(BaseDir(), "match-01/abc/04"), "")); - TF_EXPECT_OK( - WriteStringToFile(env_, io::JoinPath(BaseDir(), "match-01/abc/10"), "")); - TF_EXPECT_OK( - WriteStringToFile(env_, io::JoinPath(BaseDir(), "match-02/abc/00"), "")); - - EXPECT_EQ(Match(BaseDir(), "match-0[0-1]/abc/0[0-8]"), - "match-00/abc/00,match-00/abc/01,match-01/abc/00,match-01/abc/04"); -} - } // namespace tensorflow diff --git a/tensorflow/core/platform/file_statistics.h b/tensorflow/core/platform/file_statistics.h index 6bb34c19dd0..7629db6ef9e 100644 --- a/tensorflow/core/platform/file_statistics.h +++ b/tensorflow/core/platform/file_statistics.h @@ -29,6 +29,8 @@ struct FileStatistics { bool is_directory = false; FileStatistics() {} + FileStatistics(int64 length, int64 mtime_nsec, bool is_directory) + : length(length), mtime_nsec(mtime_nsec), is_directory(is_directory) {} ~FileStatistics() {} }; diff --git a/tensorflow/core/platform/file_system.cc b/tensorflow/core/platform/file_system.cc index 3e68f48eb17..62167b4f768 100644 --- a/tensorflow/core/platform/file_system.cc +++ b/tensorflow/core/platform/file_system.cc @@ -17,6 +17,7 @@ limitations under the License. #include #include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/threadpool.h" #include "tensorflow/core/lib/gtl/map_util.h" #include "tensorflow/core/lib/gtl/stl_util.h" #include "tensorflow/core/lib/io/path.h" @@ -29,6 +30,12 @@ limitations under the License. 
namespace tensorflow { +namespace { + +constexpr int32 kNumThreads = 8; + +} // anonymous namespace + FileSystem::~FileSystem() {} string FileSystem::TranslateName(const string& name) const { @@ -105,16 +112,32 @@ Status FileSystem::GetMatchingPaths(const string& pattern, std::deque dir_q; dir_q.push_back(dir); Status ret; // Status to return. + std::vector children_dir_status; // holds is_dir status for children. while (!dir_q.empty()) { string current_dir = dir_q.front(); dir_q.pop_front(); std::vector children; Status s = GetChildren(current_dir, &children); ret.Update(s); - for (const string& child : children) { - const string child_path = io::JoinPath(current_dir, child); + if (children.empty()) continue; + // This IsDirectory call can be expensive for some FS. Parallelizing it. + thread::ThreadPool* children_threads = + new thread::ThreadPool(Env::Default(), "TraverseChildren", kNumThreads); + children_dir_status.resize(children.size()); + for (int i = 0; i < children.size(); ++i) { + const string child_path = io::JoinPath(current_dir, children[i]); + children_threads->Schedule([this, child_path, i, &children_dir_status] { + children_dir_status[i] = this->IsDirectory(child_path).ok(); + }); + } + delete children_threads; + for (int i = 0; i < children.size(); ++i) { + const string child_path = io::JoinPath(current_dir, children[i]); + // In case the child_path doesn't start with the fixed_prefix then we bail + // and don't add it to the queue / candidates. + if (!StringPiece(child_path).starts_with(fixed_prefix)) continue; // If the child is a directory add it to the queue. 
- if (IsDirectory(child_path).ok()) { + if (children_dir_status[i]) { dir_q.push_back(child_path); } all_files.push_back(child_path); diff --git a/tensorflow/core/platform/file_system_test.cc b/tensorflow/core/platform/file_system_test.cc index 1a37251177f..600af91206b 100644 --- a/tensorflow/core/platform/file_system_test.cc +++ b/tensorflow/core/platform/file_system_test.cc @@ -25,42 +25,125 @@ limitations under the License. namespace tensorflow { +static const char* const kPrefix = "ipfs://solarsystem"; + +// A file system that has Planets, Satellites and Sub Satellites. Sub satellites +// cannot have children further. class InterPlanetaryFileSystem : public NullFileSystem { public: - Status IsDirectory(const string& dirname) override { - if (dirname == "ipfs://solarsystem" || - dirname == "ipfs://solarsystem/Earth" || - dirname == "ipfs://solarsystem/Jupiter") { + bool FileExists(const string& fname) override { + string parsed_path; + ParsePath(fname, &parsed_path); + return BodyExists(parsed_path); + } + + // Adds the dir to the parent's children list and creates an entry for itself. + Status CreateDir(const string& dirname) override { + string parsed_path; + ParsePath(dirname, &parsed_path); + // If the directory already exists then ignore. + if (celestial_bodies_.find(parsed_path) != celestial_bodies_.end()) { return Status::OK(); } - return Status(tensorflow::error::FAILED_PRECONDITION, "Not a directory"); + std::vector split_path = str_util::Split(parsed_path, '/'); + // If the path is too long then we don't support it. 
+ if (split_path.size() > 3) { + return Status(tensorflow::error::INVALID_ARGUMENT, "Bad dirname"); + } + if (split_path.empty()) { + return Status::OK(); + } + if (split_path.size() == 1) { + celestial_bodies_[""].insert(parsed_path); + celestial_bodies_.insert( + std::pair>(parsed_path, {})); + return Status::OK(); + } + if (split_path.size() == 2) { + if (!BodyExists(split_path[0])) { + return Status(tensorflow::error::FAILED_PRECONDITION, + "Base dir not created"); + } + celestial_bodies_[split_path[0]].insert(split_path[1]); + celestial_bodies_.insert( + std::pair>(parsed_path, {})); + return Status::OK(); + } + if (split_path.size() == 3) { + const string& parent_path = io::JoinPath(split_path[0], split_path[1]); + if (!BodyExists(parent_path)) { + return Status(tensorflow::error::FAILED_PRECONDITION, + "Base dir not created"); + } + celestial_bodies_[parent_path].insert(split_path[2]); + celestial_bodies_.insert( + std::pair>(parsed_path, {})); + return Status::OK(); + } + return Status(tensorflow::error::FAILED_PRECONDITION, "Failed to create"); + } + + Status IsDirectory(const string& dirname) override { + string parsed_path; + ParsePath(dirname, &parsed_path); + std::vector split_path = str_util::Split(parsed_path, '/'); + if (split_path.size() > 2) { + return Status(tensorflow::error::FAILED_PRECONDITION, "Not a dir"); + } + if (celestial_bodies_.find(parsed_path) != celestial_bodies_.end()) { + return Status::OK(); + } + return Status(tensorflow::error::FAILED_PRECONDITION, "Not a dir"); } Status GetChildren(const string& dir, std::vector* result) override { - std::vector celestial_bodies; - if (dir == "ipfs://solarsystem") { - celestial_bodies = {"Mercury", "Venus", "Earth", "Mars", - "Jupiter", "Saturn", "Uranus", "Neptune", - ".PlanetX", "Planet0", "Planet1"}; - - } else if (dir == "ipfs://solarsystem/Earth") { - celestial_bodies = {"Moon"}; - } else if (dir == "ipfs://solarsystem/Jupiter") { - celestial_bodies = {"Europa", "Io", "Ganymede"}; - } - 
result->insert(result->end(), celestial_bodies.begin(), - celestial_bodies.end()); + TF_RETURN_IF_ERROR(IsDirectory(dir)); + string parsed_path; + ParsePath(dir, &parsed_path); + result->insert(result->begin(), celestial_bodies_[parsed_path].begin(), + celestial_bodies_[parsed_path].end()); return Status::OK(); } + + private: + bool BodyExists(const string& name) { + return celestial_bodies_.find(name) != celestial_bodies_.end(); + } + + void ParsePath(const string& name, string* parsed_path) { + StringPiece scheme, host, path; + ParseURI(name, &scheme, &host, &path); + ASSERT_EQ(scheme, "ipfs"); + ASSERT_EQ(host, "solarsystem"); + path.Consume("/"); + *parsed_path = path.ToString(); + } + + std::map> celestial_bodies_ = { + std::pair>( + "", {"Mercury", "Venus", "Earth", "Mars", "Jupiter", "Saturn", + "Uranus", "Neptune"}), + std::pair>("Mercury", {}), + std::pair>("Venus", {}), + std::pair>("Earth", {"Moon"}), + std::pair>("Mars", {}), + std::pair>("Jupiter", + {"Europa", "Io", "Ganymede"}), + std::pair>("Saturn", {}), + std::pair>("Uranus", {}), + std::pair>("Neptune", {}), + std::pair>("Earth/Moon", {}), + std::pair>("Jupiter/Europa", {}), + std::pair>("Jupiter/Io", {}), + std::pair>("Jupiter/Ganymede", {})}; }; // Returns all the matched entries as a comma separated string removing the // common prefix of BaseDir(). 
-string Match(const string& base_dir, const string& suffix_pattern) { - InterPlanetaryFileSystem fs; +string Match(InterPlanetaryFileSystem* ipfs, const string& suffix_pattern) { std::vector results; Status s = - fs.GetMatchingPaths(io::JoinPath(base_dir, suffix_pattern), &results); + ipfs->GetMatchingPaths(io::JoinPath(kPrefix, suffix_pattern), &results); if (!s.ok()) { return s.ToString(); } else { @@ -68,7 +151,7 @@ string Match(const string& base_dir, const string& suffix_pattern) { std::sort(results.begin(), results.end()); for (const string& result : results) { StringPiece trimmed_result(result); - EXPECT_TRUE(trimmed_result.Consume(base_dir + "/")); + EXPECT_TRUE(trimmed_result.Consume(strings::StrCat(kPrefix, "/"))); trimmed_results.push_back(trimmed_result); } return str_util::Join(trimmed_results, ","); @@ -76,17 +159,76 @@ string Match(const string& base_dir, const string& suffix_pattern) { } TEST(TestFileSystem, IPFSMatch) { - // Make sure we only get the 11 planets and not all their children. - EXPECT_EQ(Match("ipfs://solarsystem", "*"), - ".PlanetX,Earth,Jupiter,Mars,Mercury,Neptune,Planet0,Planet1," - "Saturn,Uranus,Venus"); + InterPlanetaryFileSystem ipfs; + EXPECT_EQ(Match(&ipfs, "thereisnosuchfile"), ""); + EXPECT_EQ(Match(&ipfs, "*"), + "Earth,Jupiter,Mars,Mercury,Neptune,Saturn,Uranus,Venus"); // Returns Jupiter's moons. - EXPECT_EQ(Match("ipfs://solarsystem", "Jupiter/*"), + EXPECT_EQ(Match(&ipfs, "Jupiter/*"), "Jupiter/Europa,Jupiter/Ganymede,Jupiter/Io"); // Returns Jupiter's and Earth's moons. 
- EXPECT_EQ(Match("ipfs://solarsystem", "*/*"), + EXPECT_EQ(Match(&ipfs, "*/*"), "Earth/Moon,Jupiter/Europa,Jupiter/Ganymede,Jupiter/Io"); - EXPECT_EQ(Match("ipfs://solarsystem", "Planet[0-1]"), "Planet0,Planet1"); + TF_EXPECT_OK(ipfs.CreateDir(io::JoinPath(kPrefix, "Planet0"))); + TF_EXPECT_OK(ipfs.CreateDir(io::JoinPath(kPrefix, "Planet1"))); + EXPECT_EQ(Match(&ipfs, "Planet[0-1]"), "Planet0,Planet1"); + EXPECT_EQ(Match(&ipfs, "Planet?"), "Planet0,Planet1"); +} + +TEST(TestFileSystem, MatchSimple) { + InterPlanetaryFileSystem ipfs; + TF_EXPECT_OK(ipfs.CreateDir(io::JoinPath(kPrefix, "match-00"))); + TF_EXPECT_OK(ipfs.CreateDir(io::JoinPath(kPrefix, "match-0a"))); + TF_EXPECT_OK(ipfs.CreateDir(io::JoinPath(kPrefix, "match-01"))); + TF_EXPECT_OK(ipfs.CreateDir(io::JoinPath(kPrefix, "match-aaa"))); + + EXPECT_EQ(Match(&ipfs, "match-*"), "match-00,match-01,match-0a,match-aaa"); + EXPECT_EQ(Match(&ipfs, "match-0[0-9]"), "match-00,match-01"); + EXPECT_EQ(Match(&ipfs, "match-?[0-9]"), "match-00,match-01"); + EXPECT_EQ(Match(&ipfs, "match-?a*"), "match-0a,match-aaa"); + EXPECT_EQ(Match(&ipfs, "match-??"), "match-00,match-01,match-0a"); +} + +TEST(TestFileSystem, MatchDirectory) { + InterPlanetaryFileSystem ipfs; + TF_EXPECT_OK( + ipfs.RecursivelyCreateDir(io::JoinPath(kPrefix, "match-00/abc/x"))); + TF_EXPECT_OK( + ipfs.RecursivelyCreateDir(io::JoinPath(kPrefix, "match-0a/abc/x"))); + TF_EXPECT_OK( + ipfs.RecursivelyCreateDir(io::JoinPath(kPrefix, "match-01/abc/x"))); + TF_EXPECT_OK( + ipfs.RecursivelyCreateDir(io::JoinPath(kPrefix, "match-aaa/abc/x"))); + + EXPECT_EQ(Match(&ipfs, "match-*/abc/x"), + "match-00/abc/x,match-01/abc/x,match-0a/abc/x,match-aaa/abc/x"); + EXPECT_EQ(Match(&ipfs, "match-0[0-9]/abc/x"), + "match-00/abc/x,match-01/abc/x"); + EXPECT_EQ(Match(&ipfs, "match-?[0-9]/abc/x"), + "match-00/abc/x,match-01/abc/x"); + EXPECT_EQ(Match(&ipfs, "match-?a*/abc/x"), "match-0a/abc/x,match-aaa/abc/x"); + EXPECT_EQ(Match(&ipfs, "match-?[^a]/abc/x"), 
"match-00/abc/x,match-01/abc/x"); +} + +TEST(TestFileSystem, MatchMultipleWildcards) { + InterPlanetaryFileSystem ipfs; + TF_EXPECT_OK( + ipfs.RecursivelyCreateDir(io::JoinPath(kPrefix, "match-00/abc/00"))); + TF_EXPECT_OK( + ipfs.RecursivelyCreateDir(io::JoinPath(kPrefix, "match-00/abc/01"))); + TF_EXPECT_OK( + ipfs.RecursivelyCreateDir(io::JoinPath(kPrefix, "match-00/abc/09"))); + TF_EXPECT_OK( + ipfs.RecursivelyCreateDir(io::JoinPath(kPrefix, "match-01/abc/00"))); + TF_EXPECT_OK( + ipfs.RecursivelyCreateDir(io::JoinPath(kPrefix, "match-01/abc/04"))); + TF_EXPECT_OK( + ipfs.RecursivelyCreateDir(io::JoinPath(kPrefix, "match-01/abc/10"))); + TF_EXPECT_OK( + ipfs.RecursivelyCreateDir(io::JoinPath(kPrefix, "match-02/abc/00"))); + + EXPECT_EQ(Match(&ipfs, "match-0[0-1]/abc/0[0-8]"), + "match-00/abc/00,match-00/abc/01,match-01/abc/00,match-01/abc/04"); } } // namespace tensorflow diff --git a/tensorflow/core/protobuf/worker.proto b/tensorflow/core/protobuf/worker.proto index 7a50aa3e649..81ff1047e7b 100644 --- a/tensorflow/core/protobuf/worker.proto +++ b/tensorflow/core/protobuf/worker.proto @@ -22,6 +22,7 @@ option java_multiple_files = true; option java_package = "org.tensorflow.distruntime"; import "google/protobuf/any.proto"; +import "tensorflow/core/framework/cost_graph.proto"; import "tensorflow/core/framework/step_stats.proto"; import "tensorflow/core/framework/device_attributes.proto"; import "tensorflow/core/framework/graph.proto"; @@ -181,8 +182,10 @@ message RunGraphResponse { // `RunGraphRequest.recv_key`. repeated NamedTensor recv = 1; - // If the request asked for execution stats, these are returned here. + // If the request asked for execution stats or cost graph, these are returned + // here. 
StepStats step_stats = 2; + CostGraphDef cost_graph = 3; } //////////////////////////////////////////////////////////////////////////////// diff --git a/tensorflow/core/util/example_proto_fast_parsing.cc b/tensorflow/core/util/example_proto_fast_parsing.cc index 1a2c4aeedab..abf8d77f869 100644 --- a/tensorflow/core/util/example_proto_fast_parsing.cc +++ b/tensorflow/core/util/example_proto_fast_parsing.cc @@ -92,7 +92,8 @@ class Feature { return Status::OK(); } - bool ParseBytesList(SmallVector* bytes_list) { + template + bool ParseBytesList(Result* bytes_list) { DCHECK(bytes_list != nullptr); protobuf::io::CodedInputStream stream( reinterpret_cast(serialized_.data()), serialized_.size()); @@ -116,7 +117,8 @@ class Feature { return true; } - bool ParseFloatList(SmallVector* float_list) { + template + bool ParseFloatList(Result* float_list) { DCHECK(float_list != nullptr); protobuf::io::CodedInputStream stream( reinterpret_cast(serialized_.data()), serialized_.size()); @@ -158,7 +160,8 @@ class Feature { return true; } - bool ParseInt64List(SmallVector* int64_list) { + template + bool ParseInt64List(Result* int64_list) { DCHECK(int64_list != nullptr); protobuf::io::CodedInputStream stream( reinterpret_cast(serialized_.data()), serialized_.size()); @@ -181,7 +184,7 @@ class Feature { while (!stream.ExpectAtEnd()) { protobuf_uint64 n; // There is no API for int64 if (!stream.ReadVarint64(&n)) return false; - int64_list->push_back(n); + int64_list->push_back(static_cast(n)); } stream.PopLimit(packed_limit); @@ -190,7 +193,7 @@ class Feature { if (!stream.ExpectTag(kVarintTag(1))) return false; protobuf_uint64 n; // There is no API for int64 if (!stream.ReadVarint64(&n)) return false; - int64_list->push_back(n); + int64_list->push_back(static_cast(n)); } } } @@ -392,6 +395,28 @@ struct SeededHasher { uint64 seed{0xDECAFCAFFE}; }; +template +class LimitedArraySlice { + public: + LimitedArraySlice(T* begin, size_t num_elements) + : current_(begin), end_(begin + 
num_elements) {} + + // May return negative if there were push_back calls after slice was filled. + int64 EndDistance() const { return end_ - current_; } + + // Attempts to push value to the back of this. If the slice has + // already been filled, this method has no effect on the underlying data, but + // it changes the number returned by EndDistance into negative values. + void push_back(T&& value) { + if (EndDistance() > 0) *current_ = std::move(value); + ++current_; + } + + private: + T* current_; + T* end_; +}; + Status FastParseSerializedExample( const string& serialized_example, const string& example_name, const size_t example_index, const Config& config, @@ -487,37 +512,29 @@ Status FastParseSerializedExample( switch (config.dense[d].dtype) { case DT_INT64: { - SmallVector list; - list.reserve(num_elements); - if (!feature.ParseInt64List(&list)) return parse_error(); - if (list.size() != num_elements) { - return shape_error(list.size(), "int64"); - } auto out_p = out.flat().data() + offset; - std::copy_n(list.begin(), list.size(), out_p); + LimitedArraySlice slice(out_p, num_elements); + if (!feature.ParseInt64List(&slice)) return parse_error(); + if (slice.EndDistance() != 0) { + return shape_error(num_elements - slice.EndDistance(), "int64"); + } break; } case DT_FLOAT: { - SmallVector list; - list.reserve(num_elements); - if (!feature.ParseFloatList(&list)) return parse_error(); - if (list.size() != num_elements) { - return shape_error(list.size(), "float"); - } auto out_p = out.flat().data() + offset; - std::copy_n(list.begin(), list.size(), out_p); + LimitedArraySlice slice(out_p, num_elements); + if (!feature.ParseFloatList(&slice)) return parse_error(); + if (slice.EndDistance() != 0) { + return shape_error(num_elements - slice.EndDistance(), "float"); + } break; } case DT_STRING: { - SmallVector list; - list.reserve(num_elements); - if (!feature.ParseBytesList(&list)) return parse_error(); - if (list.size() != num_elements) { - return 
shape_error(list.size(), "bytes"); - } auto out_p = out.flat().data() + offset; - for (size_t i = 0; i < list.size(); ++i) { - out_p[i] = std::move(list[i]); + LimitedArraySlice slice(out_p, num_elements); + if (!feature.ParseBytesList(&slice)) return parse_error(); + if (slice.EndDistance() != 0) { + return shape_error(num_elements - slice.EndDistance(), "bytes"); } break; } diff --git a/tensorflow/examples/android/AndroidManifest.xml b/tensorflow/examples/android/AndroidManifest.xml index 3cb18ab73ce..0a48d3d50b7 100644 --- a/tensorflow/examples/android/AndroidManifest.xml +++ b/tensorflow/examples/android/AndroidManifest.xml @@ -33,9 +33,9 @@ android:icon="@drawable/ic_launcher" android:theme="@style/MaterialTheme"> - + android:label="@string/activity_name_classification"> diff --git a/tensorflow/examples/android/res/values/base-strings.xml b/tensorflow/examples/android/res/values/base-strings.xml index 992ba2dc987..93cfe0dac28 100644 --- a/tensorflow/examples/android/res/values/base-strings.xml +++ b/tensorflow/examples/android/res/values/base-strings.xml @@ -17,4 +17,5 @@ TensorFlow Demo + TF Classification diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/CameraActivity.java b/tensorflow/examples/android/src/org/tensorflow/demo/CameraActivity.java index 82c37ac757d..ede3af1467f 100644 --- a/tensorflow/examples/android/src/org/tensorflow/demo/CameraActivity.java +++ b/tensorflow/examples/android/src/org/tensorflow/demo/CameraActivity.java @@ -18,13 +18,14 @@ package org.tensorflow.demo; import android.Manifest; import android.app.Activity; +import android.app.Fragment; import android.content.pm.PackageManager; import android.os.Build; import android.os.Bundle; import android.view.WindowManager; import android.widget.Toast; -public class CameraActivity extends Activity { +public abstract class CameraActivity extends Activity { private static final int PERMISSIONS_REQUEST = 1; private static final String PERMISSION_CAMERA = 
Manifest.permission.CAMERA; @@ -48,7 +49,8 @@ public class CameraActivity extends Activity { } @Override - public void onRequestPermissionsResult(int requestCode, String permissions[], int[] grantResults) { + public void onRequestPermissionsResult( + final int requestCode, final String[] permissions, final int[] grantResults) { switch (requestCode) { case PERMISSIONS_REQUEST: { if (grantResults.length > 0 @@ -79,10 +81,12 @@ public class CameraActivity extends Activity { } } - private void setFragment() { + protected void setFragment() { getFragmentManager() - .beginTransaction() - .replace(R.id.container, CameraConnectionFragment.newInstance()) - .commit(); + .beginTransaction() + .replace(R.id.container, createFragment()) + .commit(); } + + protected abstract Fragment createFragment(); } diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/CameraConnectionFragment.java b/tensorflow/examples/android/src/org/tensorflow/demo/CameraConnectionFragment.java index e73278ed608..0bd963b39ef 100644 --- a/tensorflow/examples/android/src/org/tensorflow/demo/CameraConnectionFragment.java +++ b/tensorflow/examples/android/src/org/tensorflow/demo/CameraConnectionFragment.java @@ -69,7 +69,7 @@ public class CameraConnectionFragment extends Fragment { */ private static final int MINIMUM_PREVIEW_SIZE = 320; - private RecognitionScoreView scoreView; + private ResultsView resultsView; /** * Conversion from screen rotation to JPEG orientation. @@ -132,10 +132,10 @@ public class CameraConnectionFragment extends Fragment { private CameraDevice cameraDevice; /** - * The rotation in degrees of the camera sensor from the display. + * The rotation in degrees of the camera sensor from the display. */ private Integer sensorOrientation; - + /** * The {@link android.util.Size} of camera preview. 
*/ @@ -214,6 +214,27 @@ public class CameraConnectionFragment extends Fragment { */ private final Semaphore cameraOpenCloseLock = new Semaphore(1); + /** + * A {@link Classifier} object wrapping TensorFlow to pass frames to. + */ + private final Classifier classifier; + /** + * The input size in pixels desired by TensorFlow (width and height of a square bitmap). + */ + private final int inputSize; + + /** + * The layout identifier to inflate for this Fragment. + */ + private final int layout; + + private CameraConnectionFragment( + final Classifier classifier, final int layout, final int inputSize) { + this.classifier = classifier; + this.layout = layout; + this.inputSize = inputSize; + } + /** * Shows a {@link Toast} on the UI thread. * @@ -267,20 +288,21 @@ public class CameraConnectionFragment extends Fragment { } } - public static CameraConnectionFragment newInstance() { - return new CameraConnectionFragment(); + public static CameraConnectionFragment newInstance( + final Classifier classifier, final int layout, final int inputSize) { + return new CameraConnectionFragment(classifier, layout, inputSize); } @Override public View onCreateView( final LayoutInflater inflater, final ViewGroup container, final Bundle savedInstanceState) { - return inflater.inflate(R.layout.camera_connection_fragment, container, false); + return inflater.inflate(layout, container, false); } @Override public void onViewCreated(final View view, final Bundle savedInstanceState) { textureView = (AutoFitTextureView) view.findViewById(R.id.texture); - scoreView = (RecognitionScoreView) view.findViewById(R.id.results); + resultsView = (ResultsView) view.findViewById(R.id.results); } @Override @@ -344,7 +366,7 @@ public class CameraConnectionFragment extends Fragment { new CompareSizesByArea()); sensorOrientation = characteristics.get(CameraCharacteristics.SENSOR_ORIENTATION); - + // Danger, W.R.! 
Attempting to use too large a preview size could exceed the camera // bus' bandwidth limitation, resulting in gorgeous previews but the storage of // garbage capture data. @@ -538,7 +560,7 @@ public class CameraConnectionFragment extends Fragment { LOGGER.i("Getting assets."); tfPreviewListener.initialize( - getActivity().getAssets(), scoreView, inferenceHandler, sensorOrientation); + classifier, resultsView, inputSize, inferenceHandler, sensorOrientation); LOGGER.i("TensorFlow initialized."); } diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/ClassifierActivity.java b/tensorflow/examples/android/src/org/tensorflow/demo/ClassifierActivity.java new file mode 100644 index 00000000000..104ffbbd088 --- /dev/null +++ b/tensorflow/examples/android/src/org/tensorflow/demo/ClassifierActivity.java @@ -0,0 +1,58 @@ +/* + * Copyright 2016 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.tensorflow.demo; + +import java.io.IOException; + +import android.app.Fragment; +import org.tensorflow.demo.env.Logger; + +public class ClassifierActivity extends CameraActivity { + private static final Logger LOGGER = new Logger(); + + // These are the settings for the original v1 Inception model. 
If you want to + // use a model that's been produced from the TensorFlow for Poets codelab, + // you'll need to set IMAGE_SIZE = 299, IMAGE_MEAN = 128, IMAGE_STD = 128, + // INPUT_NAME = "Mul:0", and OUTPUT_NAME = "final_result:0". + // You'll also need to update the MODEL_FILE and LABEL_FILE paths to point to + // the ones you produced. + private static final int NUM_CLASSES = 1001; + private static final int INPUT_SIZE = 224; + private static final int IMAGE_MEAN = 117; + private static final float IMAGE_STD = 1; + private static final String INPUT_NAME = "input:0"; + private static final String OUTPUT_NAME = "output:0"; + + private static final String MODEL_FILE = "file:///android_asset/tensorflow_inception_graph.pb"; + private static final String LABEL_FILE = + "file:///android_asset/imagenet_comp_graph_label_strings.txt"; + + @Override + protected Fragment createFragment() { + final TensorFlowImageClassifier classifier = new TensorFlowImageClassifier(); + try { + classifier.initializeTensorFlow( + getAssets(), MODEL_FILE, LABEL_FILE, NUM_CLASSES, INPUT_SIZE, IMAGE_MEAN, IMAGE_STD, + INPUT_NAME, OUTPUT_NAME); + } catch (final IOException e) { + LOGGER.e(e, "Exception!"); + } + + return CameraConnectionFragment.newInstance( + classifier, R.layout.camera_connection_fragment, INPUT_SIZE); + } +} diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/RecognitionScoreView.java b/tensorflow/examples/android/src/org/tensorflow/demo/RecognitionScoreView.java index c20afcc22e4..764c16433c3 100644 --- a/tensorflow/examples/android/src/org/tensorflow/demo/RecognitionScoreView.java +++ b/tensorflow/examples/android/src/org/tensorflow/demo/RecognitionScoreView.java @@ -26,7 +26,7 @@ import org.tensorflow.demo.Classifier.Recognition; import java.util.List; -public class RecognitionScoreView extends View { +public class RecognitionScoreView extends View implements ResultsView { private static final float TEXT_SIZE_DIP = 24; private List results; private final float 
textSizePx; @@ -46,6 +46,7 @@ public class RecognitionScoreView extends View { bgPaint.setColor(0xcc4285f4); } + @Override public void setResults(final List results) { this.results = results; postInvalidate(); diff --git a/tensorflow/tensorboard/gulp_tasks/tslint.js b/tensorflow/examples/android/src/org/tensorflow/demo/ResultsView.java similarity index 58% rename from tensorflow/tensorboard/gulp_tasks/tslint.js rename to tensorflow/examples/android/src/org/tensorflow/demo/ResultsView.java index 726001fc906..662495202b3 100644 --- a/tensorflow/tensorboard/gulp_tasks/tslint.js +++ b/tensorflow/examples/android/src/org/tensorflow/demo/ResultsView.java @@ -1,4 +1,4 @@ -/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,19 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -var gulp = require('gulp'); -var tslint = require('gulp-tslint'); +package org.tensorflow.demo; -module.exports = function(strict) { - return function() { - return gulp.src([ - 'components/tf-*/**/*.ts', - 'components/vz-*/**/*.ts', - '!./components/**/deps.d.ts' - ]) - .pipe(tslint()) - .pipe(tslint.report('verbose', { - emitError: strict, - })); - }; +import org.tensorflow.demo.Classifier.Recognition; + +import java.util.List; + +public interface ResultsView { + public void setResults(final List results); } diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/TensorFlowImageListener.java b/tensorflow/examples/android/src/org/tensorflow/demo/TensorFlowImageListener.java index f60652ffcff..33da3d40807 100644 --- a/tensorflow/examples/android/src/org/tensorflow/demo/TensorFlowImageListener.java +++ b/tensorflow/examples/android/src/org/tensorflow/demo/TensorFlowImageListener.java @@ -15,7 +15,6 @@ limitations under the License. package org.tensorflow.demo; -import android.content.res.AssetManager; import android.graphics.Bitmap; import android.graphics.Bitmap.Config; import android.graphics.Canvas; @@ -26,13 +25,12 @@ import android.media.ImageReader; import android.media.ImageReader.OnImageAvailableListener; import android.os.Handler; import android.os.Trace; - -import java.io.IOException; -import java.util.List; import junit.framework.Assert; import org.tensorflow.demo.env.ImageUtils; import org.tensorflow.demo.env.Logger; +import java.util.List; + /** * Class that takes in preview frames and converts the image to Bitmaps to process with Tensorflow. */ @@ -41,29 +39,13 @@ public class TensorFlowImageListener implements OnImageAvailableListener { private static final boolean SAVE_PREVIEW_BITMAP = false; - // These are the settings for the original v1 Inception model. 
If you want to - // use a model that's been produced from the TensorFlow for Poets codelab, - // you'll need to set IMAGE_SIZE = 299, IMAGE_MEAN = 128, IMAGE_STD = 128, - // INPUT_NAME = "Mul:0", and OUTPUT_NAME = "final_result:0". - // You'll also need to update the MODEL_FILE and LABEL_FILE paths to point to - // the ones you produced. - private static final int NUM_CLASSES = 1001; - private static final int INPUT_SIZE = 224; - private static final int IMAGE_MEAN = 117; - private static final float IMAGE_STD = 1; - private static final String INPUT_NAME = "input:0"; - private static final String OUTPUT_NAME = "output:0"; - - private static final String MODEL_FILE = "file:///android_asset/tensorflow_inception_graph.pb"; - private static final String LABEL_FILE = - "file:///android_asset/imagenet_comp_graph_label_strings.txt"; - private Integer sensorOrientation; - private final TensorFlowImageClassifier tensorflow = new TensorFlowImageClassifier(); + private Classifier tensorflow; private int previewWidth = 0; private int previewHeight = 0; + private int inputSize = 0; private byte[][] yuvBytes; private int[] rgbBytes = null; private Bitmap rgbFrameBitmap = null; @@ -72,22 +54,18 @@ public class TensorFlowImageListener implements OnImageAvailableListener { private boolean computing = false; private Handler handler; - private RecognitionScoreView scoreView; + private ResultsView resultsView; public void initialize( - final AssetManager assetManager, - final RecognitionScoreView scoreView, + final Classifier tensorflow, + final ResultsView resultsView, + final int inputSize, final Handler handler, final Integer sensorOrientation) { Assert.assertNotNull(sensorOrientation); - try { - tensorflow.initializeTensorFlow( - assetManager, MODEL_FILE, LABEL_FILE, NUM_CLASSES, INPUT_SIZE, IMAGE_MEAN, IMAGE_STD, - INPUT_NAME, OUTPUT_NAME); - } catch (IOException e) { - LOGGER.e(e, "Exception!"); - } - this.scoreView = scoreView; + this.tensorflow = tensorflow; + 
this.resultsView = resultsView; + this.inputSize = inputSize; this.handler = handler; this.sensorOrientation = sensorOrientation; } @@ -146,7 +124,7 @@ public class TensorFlowImageListener implements OnImageAvailableListener { LOGGER.i("Initializing at size %dx%d", previewWidth, previewHeight); rgbBytes = new int[previewWidth * previewHeight]; rgbFrameBitmap = Bitmap.createBitmap(previewWidth, previewHeight, Config.ARGB_8888); - croppedBitmap = Bitmap.createBitmap(INPUT_SIZE, INPUT_SIZE, Config.ARGB_8888); + croppedBitmap = Bitmap.createBitmap(inputSize, inputSize, Config.ARGB_8888); yuvBytes = new byte[planes.length][]; for (int i = 0; i < planes.length; ++i) { @@ -201,7 +179,7 @@ public class TensorFlowImageListener implements OnImageAvailableListener { for (final Classifier.Recognition result : results) { LOGGER.v("Result: " + result.getTitle()); } - scoreView.setResults(results); + resultsView.setResults(results); computing = false; } }); diff --git a/tensorflow/examples/how_tos/reading_data/convert_to_records.py b/tensorflow/examples/how_tos/reading_data/convert_to_records.py index c3555a882d6..5457b27ecac 100644 --- a/tensorflow/examples/how_tos/reading_data/convert_to_records.py +++ b/tensorflow/examples/how_tos/reading_data/convert_to_records.py @@ -20,6 +20,7 @@ from __future__ import print_function import argparse import os +import sys import tensorflow as tf @@ -102,6 +103,5 @@ if __name__ == '__main__': set.\ """ ) - FLAGS = parser.parse_args() - - tf.app.run() + FLAGS, unparsed = parser.parse_known_args() + tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) diff --git a/tensorflow/examples/how_tos/reading_data/fully_connected_preloaded.py b/tensorflow/examples/how_tos/reading_data/fully_connected_preloaded.py index 7795248f82d..888da421bfa 100644 --- a/tensorflow/examples/how_tos/reading_data/fully_connected_preloaded.py +++ b/tensorflow/examples/how_tos/reading_data/fully_connected_preloaded.py @@ -31,6 +31,7 @@ from __future__ import division from 
__future__ import print_function import argparse +import sys import time import tensorflow as tf @@ -184,6 +185,5 @@ if __name__ == '__main__': help='If true, uses fake data for unit testing.', action='store_true' ) - FLAGS = parser.parse_args() - - tf.app.run() + FLAGS, unparsed = parser.parse_known_args() + tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) diff --git a/tensorflow/examples/how_tos/reading_data/fully_connected_preloaded_var.py b/tensorflow/examples/how_tos/reading_data/fully_connected_preloaded_var.py index 5325afbe60e..f19c3f38fd5 100644 --- a/tensorflow/examples/how_tos/reading_data/fully_connected_preloaded_var.py +++ b/tensorflow/examples/how_tos/reading_data/fully_connected_preloaded_var.py @@ -30,6 +30,7 @@ from __future__ import division from __future__ import print_function import argparse +import sys import time import tensorflow as tf @@ -194,6 +195,5 @@ if __name__ == '__main__': help='If true, uses fake data for unit testing.', action='store_true' ) - FLAGS = parser.parse_args() - - tf.app.run() + FLAGS, unparsed = parser.parse_known_args() + tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) diff --git a/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py b/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py index 127153a00bb..4c5dbc65c6f 100644 --- a/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py +++ b/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py @@ -29,6 +29,7 @@ from __future__ import print_function import argparse import os.path +import sys import time import tensorflow as tf @@ -224,6 +225,5 @@ if __name__ == '__main__': default='/tmp/data', help='Directory with the training data.' 
) - FLAGS = parser.parse_args() - - tf.app.run() + FLAGS, unparsed = parser.parse_known_args() + tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) diff --git a/tensorflow/examples/image_retraining/retrain.py b/tensorflow/examples/image_retraining/retrain.py index 4f06cb8add1..392f0176d37 100644 --- a/tensorflow/examples/image_retraining/retrain.py +++ b/tensorflow/examples/image_retraining/retrain.py @@ -1009,6 +1009,5 @@ if __name__ == '__main__': input pixels up or down by.\ """ ) - FLAGS = parser.parse_args() - - tf.app.run() + FLAGS, unparsed = parser.parse_known_args() + tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) diff --git a/tensorflow/examples/learn/random_forest_mnist.py b/tensorflow/examples/learn/random_forest_mnist.py index c20965fff6e..a34d52275ac 100644 --- a/tensorflow/examples/learn/random_forest_mnist.py +++ b/tensorflow/examples/learn/random_forest_mnist.py @@ -18,13 +18,20 @@ from __future__ import division from __future__ import print_function import argparse +import sys import tempfile import tensorflow as tf # pylint: disable=g-backslash-continuation +from tensorflow.contrib.learn.python.learn\ + import metric_spec from tensorflow.contrib.learn.python.learn.estimators\ import random_forest +from tensorflow.contrib.tensor_forest.client\ + import eval_metrics +from tensorflow.contrib.tensor_forest.python\ + import tensor_forest from tensorflow.examples.tutorials.mnist import input_data FLAGS = None @@ -35,7 +42,12 @@ def build_estimator(model_dir): params = tf.contrib.tensor_forest.python.tensor_forest.ForestHParams( num_classes=10, num_features=784, num_trees=FLAGS.num_trees, max_nodes=FLAGS.max_nodes) - return random_forest.TensorForestEstimator(params, model_dir=model_dir) + graph_builder_class = tensor_forest.RandomForestGraphs + if FLAGS.use_training_loss: + graph_builder_class = tensor_forest.TrainingLossForest + return random_forest.TensorForestEstimator( + params, graph_builder_class=graph_builder_class, + 
model_dir=model_dir) def train_and_eval(): @@ -45,20 +57,25 @@ def train_and_eval(): estimator = build_estimator(model_dir) - # TensorForest's LossMonitor allows training to terminate early if the + # TensorForest's loss hook allows training to terminate early if the # forest is no longer growing. early_stopping_rounds = 100 - check_every_n_steps = 100 - monitor = random_forest.TensorForestLossMonitor(early_stopping_rounds, - check_every_n_steps) + monitor = random_forest.TensorForestLossHook(early_stopping_rounds) mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=False) estimator.fit(x=mnist.train.images, y=mnist.train.labels, batch_size=FLAGS.batch_size, monitors=[monitor]) + metric_name = 'accuracy' + metric = {metric_name: + metric_spec.MetricSpec( + eval_metrics.get_metric(metric_name), + prediction_key=eval_metrics.get_prediction_key(metric_name))} + results = estimator.evaluate(x=mnist.test.images, y=mnist.test.labels, - batch_size=FLAGS.batch_size) + batch_size=FLAGS.batch_size, + metrics=metric) for key in sorted(results): print('%s: %s' % (key, results[key])) @@ -105,6 +122,11 @@ if __name__ == '__main__': default=1000, help='Max total nodes in a single tree.' ) - FLAGS = parser.parse_args() - - tf.app.run() + parser.add_argument( + '--use_training_loss', + type=bool, + default=False, + help='If true, use training loss as termination criteria.' 
+ ) + FLAGS, unparsed = parser.parse_known_args() + tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) diff --git a/tensorflow/examples/learn/text_classification.py b/tensorflow/examples/learn/text_classification.py index e0997cf921a..87a23831f35 100644 --- a/tensorflow/examples/learn/text_classification.py +++ b/tensorflow/examples/learn/text_classification.py @@ -18,6 +18,7 @@ from __future__ import division from __future__ import print_function import argparse +import sys import numpy as np import pandas @@ -117,6 +118,5 @@ if __name__ == '__main__': help='Test the example code with fake data.', action='store_true' ) - FLAGS = parser.parse_args() - - tf.app.run() + FLAGS, unparsed = parser.parse_known_args() + tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) diff --git a/tensorflow/examples/learn/text_classification_builtin_rnn_model.py b/tensorflow/examples/learn/text_classification_builtin_rnn_model.py index 865ce12516a..6a1c05b86b1 100644 --- a/tensorflow/examples/learn/text_classification_builtin_rnn_model.py +++ b/tensorflow/examples/learn/text_classification_builtin_rnn_model.py @@ -16,6 +16,7 @@ from __future__ import division from __future__ import print_function import argparse +import sys import numpy as np import pandas @@ -84,6 +85,5 @@ if __name__ == '__main__': help='Test the example code with fake data.', action='store_true' ) - FLAGS = parser.parse_args() - - tf.app.run() + FLAGS, unparsed = parser.parse_known_args() + tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) diff --git a/tensorflow/examples/learn/text_classification_character_cnn.py b/tensorflow/examples/learn/text_classification_character_cnn.py index dbf34f35945..e84790471b5 100644 --- a/tensorflow/examples/learn/text_classification_character_cnn.py +++ b/tensorflow/examples/learn/text_classification_character_cnn.py @@ -29,6 +29,7 @@ from __future__ import division from __future__ import print_function import argparse +import sys import numpy as np import pandas @@ -114,6 
+115,5 @@ if __name__ == '__main__': help='Test the example code with fake data.', action='store_true' ) - FLAGS = parser.parse_args() - - tf.app.run() + FLAGS, unparsed = parser.parse_known_args() + tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) diff --git a/tensorflow/examples/learn/text_classification_character_rnn.py b/tensorflow/examples/learn/text_classification_character_rnn.py index 68b15505a67..e62663aa8af 100644 --- a/tensorflow/examples/learn/text_classification_character_rnn.py +++ b/tensorflow/examples/learn/text_classification_character_rnn.py @@ -29,6 +29,7 @@ from __future__ import division from __future__ import print_function import argparse +import sys import numpy as np import pandas @@ -94,6 +95,5 @@ if __name__ == '__main__': help='Test the example code with fake data.', action='store_true' ) - FLAGS = parser.parse_args() - - tf.app.run() + FLAGS, unparsed = parser.parse_known_args() + tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) diff --git a/tensorflow/examples/learn/text_classification_cnn.py b/tensorflow/examples/learn/text_classification_cnn.py index e1836720cca..f71df272ead 100644 --- a/tensorflow/examples/learn/text_classification_cnn.py +++ b/tensorflow/examples/learn/text_classification_cnn.py @@ -18,6 +18,7 @@ from __future__ import division from __future__ import print_function import argparse +import sys import numpy as np import pandas @@ -114,6 +115,5 @@ if __name__ == '__main__': help='Test the example code with fake data.', action='store_true' ) - FLAGS = parser.parse_args() - - tf.app.run() + FLAGS, unparsed = parser.parse_known_args() + tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) diff --git a/tensorflow/examples/tutorials/mnist/fully_connected_feed.py b/tensorflow/examples/tutorials/mnist/fully_connected_feed.py index 5147801f961..7e4d4081102 100644 --- a/tensorflow/examples/tutorials/mnist/fully_connected_feed.py +++ b/tensorflow/examples/tutorials/mnist/fully_connected_feed.py @@ -21,6 +21,7 @@ from 
__future__ import print_function # pylint: disable=missing-docstring import argparse import os.path +import sys import time from six.moves import xrange # pylint: disable=redefined-builtin @@ -271,5 +272,6 @@ if __name__ == '__main__': help='If true, uses fake data for unit testing.', action='store_true' ) - FLAGS = parser.parse_args() - tf.app.run() \ No newline at end of file + + FLAGS, unparsed = parser.parse_known_args() + tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) diff --git a/tensorflow/examples/tutorials/mnist/mnist_softmax.py b/tensorflow/examples/tutorials/mnist/mnist_softmax.py index c42d1eff15d..beb184f7755 100644 --- a/tensorflow/examples/tutorials/mnist/mnist_softmax.py +++ b/tensorflow/examples/tutorials/mnist/mnist_softmax.py @@ -23,6 +23,7 @@ from __future__ import division from __future__ import print_function import argparse +import sys # Import data from tensorflow.examples.tutorials.mnist import input_data @@ -73,5 +74,5 @@ if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--data_dir', type=str, default='/tmp/tensorflow/mnist/input_data', help='Directory for storing input data') - FLAGS = parser.parse_args() - tf.app.run() \ No newline at end of file + FLAGS, unparsed = parser.parse_known_args() + tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) diff --git a/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py b/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py index 66946f72849..fc91ac4ddd3 100644 --- a/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py +++ b/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py @@ -25,6 +25,7 @@ from __future__ import division from __future__ import print_function import argparse +import sys import tensorflow as tf @@ -200,5 +201,5 @@ if __name__ == '__main__': help='Directory for storing input data') parser.add_argument('--log_dir', type=str, default='/tmp/tensorflow/mnist/logs/mnist_with_summaries', help='Summaries log directory') 
- FLAGS = parser.parse_args() - tf.app.run() \ No newline at end of file + FLAGS, unparsed = parser.parse_known_args() + tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) diff --git a/tensorflow/examples/udacity/Dockerfile b/tensorflow/examples/udacity/Dockerfile index b7b094621a8..9f5ef1aca3e 100644 --- a/tensorflow/examples/udacity/Dockerfile +++ b/tensorflow/examples/udacity/Dockerfile @@ -1,5 +1,13 @@ FROM gcr.io/tensorflow/tensorflow:latest MAINTAINER Vincent Vanhoucke + +# Pillow needs libjpeg by default as of 3.0. +RUN apt-get update && apt-get install -y --no-install-recommends \ + libjpeg8-dev \ + && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + RUN pip install scikit-learn pyreadline Pillow RUN rm -rf /notebooks/* ADD *.ipynb /notebooks/ diff --git a/tensorflow/examples/udacity/README.md b/tensorflow/examples/udacity/README.md index 1b0e5df5ee4..2814e5c62a8 100644 --- a/tensorflow/examples/udacity/README.md +++ b/tensorflow/examples/udacity/README.md @@ -6,7 +6,7 @@ Course information can be found at https://www.udacity.com/course/deep-learning- Running the Docker container from the Google Cloud repository ------------------------------------------------------------- - docker run -p 8888:8888 --name tensorflow-udacity -it b.gcr.io/tensorflow-udacity/assignments:0.5.0 + docker run -p 8888:8888 --name tensorflow-udacity -it gcr.io/tensorflow/udacity-assignments:0.6.0 Note that if you ever exit the container, you can return to it using: @@ -82,11 +82,11 @@ This will allow you to save work and have access to generated files on the host Pushing a Google Cloud release ------------------------------ - V=0.5.0 - docker tag $USER/assignments b.gcr.io/tensorflow-udacity/assignments:$V - gcloud docker push b.gcr.io/tensorflow-udacity/assignments - docker tag -f $USER/assignments b.gcr.io/tensorflow-udacity/assignments:latest - gcloud docker push b.gcr.io/tensorflow-udacity/assignments + V=0.6.0 + docker tag $USER/assignments 
gcr.io/tensorflow/udacity-assignments:$V + gcloud docker push gcr.io/tensorflow/udacity-assignments + docker tag -f $USER/assignments gcr.io/tensorflow/udacity-assignments:latest + gcloud docker push gcr.io/tensorflow/udacity-assignments History ------- @@ -96,3 +96,4 @@ History * 0.3.0: Use 0.7.1 release. * 0.4.0: Move notMMNIST data for Google Cloud. * 0.5.0: Actually use 0.7.1 release. +* 0.6.0: Update to TF 0.10.0, add libjpeg (for Pillow). diff --git a/tensorflow/g3doc/api_docs/python/contrib.distributions.bijector.md b/tensorflow/g3doc/api_docs/python/contrib.distributions.bijector.md index 37d95f969ed..8577cd012e6 100644 --- a/tensorflow/g3doc/api_docs/python/contrib.distributions.bijector.md +++ b/tensorflow/g3doc/api_docs/python/contrib.distributions.bijector.md @@ -5,7 +5,7 @@ Bijector Ops. -An API for reversible (bijective) transformations of random variables. +An API for invertible, differentiable transformations of random variables. ## Background @@ -24,11 +24,13 @@ To apply a `Bijector`, use `distributions.TransformedDistribution`. ### `class tf.contrib.distributions.bijector.Bijector` {#Bijector} -Interface for transforming a `Distribution` via `TransformedDistribution`. +Interface for transforming a `Distribution` sample. -A `Bijector` implements a bijective, differentiable function by transforming -an input `Tensor`. The output `Tensor` shape is constrained by the input -`sample`, `batch`, and `event` shape. A `Bijector` is characterized by three +A `Bijector` implements a +[diffeomorphism](https://en.wikipedia.org/wiki/Diffeomorphism), i.e., a +bijective, differentiable function. A `Bijector` is used by +`TransformedDistribution` but can be generally used for transforming a +`Distribution` generated `Tensor`. A `Bijector` is characterized by three operations: 1. 
Forward Evaluation @@ -169,7 +171,8 @@ Tips for implementing `_inverse` and `_inverse_log_det_jacobian`: - The inverse `log o det o Jacobian` can be implemented as the negative of the forward `log o det o Jacobian`. This is useful if the `inverse` is implemented as a cache or the inverse Jacobian is computationally more - expensive. The following demonstrates the suggested implementation. + expensive (e.g., `CholeskyOuterProduct` `Bijector`). The following + demonstrates the suggested implementation. ```python def _inverse_and_log_det_jacobian(self, y): @@ -476,8 +479,8 @@ Instantiates `Chain` bijector. * `bijectors`: Python list of bijector instances. An empty list makes this bijector equivalent to the `Identity` bijector. -* `validate_args`: `Boolean` indicated whether arguments should be checked for - correctness. +* `validate_args`: `Boolean` indicating whether arguments should be checked + for correctness. * `name`: `String`, name given to ops managed by this object. Default: E.g., `Chain([Exp(), Softplus()]).name == "chain_of_exp_of_softplus"`. @@ -681,6 +684,234 @@ Returns True if Tensor arguments will be validated. +- - - + +### `class tf.contrib.distributions.bijector.CholeskyOuterProduct` {#CholeskyOuterProduct} + +Bijector which computes Y = g(X) = X X^T where X is a lower-triangular, positive-diagonal matrix. + +`event_ndims` must be 0 or 2, i.e., scalar or matrix. + +Note: the upper-triangular part of X is ignored (whether or not its zero). + +Examples: + +```python +bijector.CholeskyOuterProduct(event_ndims=2).forward(x=[[1., 0], [2, 1]]) +# Result: [[1, 1], [1, 5]], i.e., x x^T + +bijector.SoftmaxCentered(event_ndims=2).inverse(y=[[1., 1], [1, 5]]) +# Result: [[1, 0], [2, 1]], i.e., chol(y). +``` +- - - + +#### `tf.contrib.distributions.bijector.CholeskyOuterProduct.__init__(event_ndims=2, validate_args=False, name='cholesky_outer_product')` {#CholeskyOuterProduct.__init__} + +Instantiates the `CholeskyOuterProduct` bijector. 
+ +##### Args: + + +* `event_ndims`: `constant` `int32` scalar `Tensor` indicating the number of + dimensions associated with a particular draw from the distribution. Must + be 0 or 2. +* `validate_args`: `Boolean` indicating whether arguments should be checked + for correctness. +* `name`: `String` name given to ops managed by this object. + +##### Raises: + + +* `ValueError`: if event_ndims is neither 0 or 2. + + +- - - + +#### `tf.contrib.distributions.bijector.CholeskyOuterProduct.dtype` {#CholeskyOuterProduct.dtype} + +dtype of `Tensor`s transformable by this distribution. + + +- - - + +#### `tf.contrib.distributions.bijector.CholeskyOuterProduct.forward(x, name='forward', **condition_kwargs)` {#CholeskyOuterProduct.forward} + +Returns the forward `Bijector` evaluation, i.e., X = g(Y). + +##### Args: + + +* `x`: `Tensor`. The input to the "forward" evaluation. +* `name`: The name to give this op. +* `**condition_kwargs`: Named arguments forwarded to subclass implementation. + +##### Returns: + + `Tensor`. + +##### Raises: + + +* `TypeError`: if `self.dtype` is specified and `x.dtype` is not + `self.dtype`. +* `NotImplementedError`: if `_forward` is not implemented. + + +- - - + +#### `tf.contrib.distributions.bijector.CholeskyOuterProduct.forward_log_det_jacobian(x, name='forward_log_det_jacobian', **condition_kwargs)` {#CholeskyOuterProduct.forward_log_det_jacobian} + +Returns both the forward_log_det_jacobian. + +##### Args: + + +* `x`: `Tensor`. The input to the "forward" Jacobian evaluation. +* `name`: The name to give this op. +* `**condition_kwargs`: Named arguments forwarded to subclass implementation. + +##### Returns: + + `Tensor`. + +##### Raises: + + +* `TypeError`: if `self.dtype` is specified and `y.dtype` is not + `self.dtype`. +* `NotImplementedError`: if neither `_forward_log_det_jacobian` + nor {`_inverse`, `_inverse_log_det_jacobian`} are implemented. 
+ + +- - - + +#### `tf.contrib.distributions.bijector.CholeskyOuterProduct.inverse(y, name='inverse', **condition_kwargs)` {#CholeskyOuterProduct.inverse} + +Returns the inverse `Bijector` evaluation, i.e., X = g^{-1}(Y). + +##### Args: + + +* `y`: `Tensor`. The input to the "inverse" evaluation. +* `name`: The name to give this op. +* `**condition_kwargs`: Named arguments forwarded to subclass implementation. + +##### Returns: + + `Tensor`. + +##### Raises: + + +* `TypeError`: if `self.dtype` is specified and `y.dtype` is not + `self.dtype`. +* `NotImplementedError`: if neither `_inverse` nor + `_inverse_and_inverse_log_det_jacobian` are implemented. + + +- - - + +#### `tf.contrib.distributions.bijector.CholeskyOuterProduct.inverse_and_inverse_log_det_jacobian(y, name='inverse_and_inverse_log_det_jacobian', **condition_kwargs)` {#CholeskyOuterProduct.inverse_and_inverse_log_det_jacobian} + +Returns both the inverse evaluation and inverse_log_det_jacobian. + +Enables possibly more efficient calculation when both inverse and +corresponding Jacobian are needed. + +See `inverse()`, `inverse_log_det_jacobian()` for more details. + +##### Args: + + +* `y`: `Tensor`. The input to the "inverse" Jacobian evaluation. +* `name`: The name to give this op. +* `**condition_kwargs`: Named arguments forwarded to subclass implementation. + +##### Returns: + + `Tensor`. + +##### Raises: + + +* `TypeError`: if `self.dtype` is specified and `y.dtype` is not + `self.dtype`. +* `NotImplementedError`: if neither `_inverse_and_inverse_log_det_jacobian` + nor {`_inverse`, `_inverse_log_det_jacobian`} are implemented. + + +- - - + +#### `tf.contrib.distributions.bijector.CholeskyOuterProduct.inverse_log_det_jacobian(y, name='inverse_log_det_jacobian', **condition_kwargs)` {#CholeskyOuterProduct.inverse_log_det_jacobian} + +Returns the (log o det o Jacobian o inverse)(y). + +Mathematically, returns: `log(det(dX/dY))(Y)`. (Recall that: `X=g^{-1}(Y)`.) 
+ +Note that `forward_log_det_jacobian` is the negative of this function. + +##### Args: + + +* `y`: `Tensor`. The input to the "inverse" Jacobian evaluation. +* `name`: The name to give this op. +* `**condition_kwargs`: Named arguments forwarded to subclass implementation. + +##### Returns: + + `Tensor`. + +##### Raises: + + +* `TypeError`: if `self.dtype` is specified and `y.dtype` is not + `self.dtype`. +* `NotImplementedError`: if neither `_inverse_log_det_jacobian` nor + `_inverse_and_inverse_log_det_jacobian` are implemented. + + +- - - + +#### `tf.contrib.distributions.bijector.CholeskyOuterProduct.is_constant_jacobian` {#CholeskyOuterProduct.is_constant_jacobian} + +Returns true iff the Jacobian is not a function of x. + +Note: Jacobian is either constant for both forward and inverse or neither. + +##### Returns: + + `Boolean`. + + +- - - + +#### `tf.contrib.distributions.bijector.CholeskyOuterProduct.name` {#CholeskyOuterProduct.name} + +Returns the string name of this `Bijector`. + + +- - - + +#### `tf.contrib.distributions.bijector.CholeskyOuterProduct.parameters` {#CholeskyOuterProduct.parameters} + +Returns this `Bijector`'s parameters as a name/value dictionary. + + +- - - + +#### `tf.contrib.distributions.bijector.CholeskyOuterProduct.shaper` {#CholeskyOuterProduct.shaper} + +Returns shape object used to manage shape constraints. + + +- - - + +#### `tf.contrib.distributions.bijector.CholeskyOuterProduct.validate_args` {#CholeskyOuterProduct.validate_args} + +Returns True if Tensor arguments will be validated. + + + - - - ### `class tf.contrib.distributions.bijector.Exp` {#Exp} @@ -714,8 +945,8 @@ Instantiates the `Exp` bijector. * `event_ndims`: Scalar `int32` `Tensor` indicating the number of dimensions associated with a particular draw from the distribution. -* `validate_args`: `Boolean` indicated whether arguments should be checked for - correctness. +* `validate_args`: `Boolean` indicating whether arguments should be checked + for correctness. 
* `name`: `String` name given to ops managed by this object. @@ -1130,7 +1361,7 @@ exp = Inline( inverse_fn=tf.log, inverse_log_det_jacobian_fn=( lambda y: -tf.reduce_sum(tf.log(y), reduction_indices=-1)), - name="Exp") + name="exp") ``` The above example is equivalent to the `Bijector` `Exp(event_ndims=1)`. @@ -1151,8 +1382,8 @@ Creates a `Bijector` from callables. log o det o jacobian of the forward transformation. * `is_constant_jacobian`: `Boolean` indicating that the Jacobian is constant for all input arguments. -* `validate_args`: `Boolean` indicated whether arguments should be checked for - correctness. +* `validate_args`: `Boolean` indicating whether arguments should be checked + for correctness. * `name`: `String`, name given to ops managed by this object. @@ -1378,8 +1609,8 @@ return -self.inverse_log_det_jacobian(y, **condition_kwargs) * `bijector`: Bijector instance. -* `validate_args`: `Boolean` indicated whether arguments should be checked for - correctness. +* `validate_args`: `Boolean` indicating whether arguments should be checked + for correctness. * `name`: `String`, name given to ops managed by this object. @@ -1634,8 +1865,8 @@ Instantiates the `Exp` bijector. * `scale`: `Tensor` used to scale input, i.e., `Y = g(X) = scale * X + shift`. * `event_ndims`: Scalar `int32` `Tensor` indicating the number of dimensions associated with a particular draw from the distribution. -* `validate_args`: `Boolean` indicated whether arguments should be checked for - correctness. +* `validate_args`: `Boolean` indicating whether arguments should be checked + for correctness. * `name`: `String` name given to ops managed by this object. 
diff --git a/tensorflow/g3doc/api_docs/python/contrib.distributions.md b/tensorflow/g3doc/api_docs/python/contrib.distributions.md index 0011737c600..bc4a79cf85f 100644 --- a/tensorflow/g3doc/api_docs/python/contrib.distributions.md +++ b/tensorflow/g3doc/api_docs/python/contrib.distributions.md @@ -231,6 +231,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.Distribution.copy(**override_parameters_kwargs)` {#Distribution.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.Distribution.dtype` {#Distribution.dtype} @@ -840,6 +863,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.Binomial.copy(**override_parameters_kwargs)` {#Binomial.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.Binomial.dtype` {#Binomial.dtype} @@ -1442,6 +1488,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. 
+- - - + +#### `tf.contrib.distributions.Bernoulli.copy(**override_parameters_kwargs)` {#Bernoulli.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.Bernoulli.dtype` {#Bernoulli.dtype} @@ -1987,6 +2056,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.BernoulliWithSigmoidP.copy(**override_parameters_kwargs)` {#BernoulliWithSigmoidP.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.BernoulliWithSigmoidP.dtype` {#BernoulliWithSigmoidP.dtype} @@ -2642,6 +2734,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.Beta.copy(**override_parameters_kwargs)` {#Beta.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. 
+ +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.Beta.dtype` {#Beta.dtype} @@ -3206,6 +3321,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.BetaWithSoftplusAB.copy(**override_parameters_kwargs)` {#BetaWithSoftplusAB.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.BetaWithSoftplusAB.dtype` {#BetaWithSoftplusAB.dtype} @@ -3809,6 +3947,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.Categorical.copy(**override_parameters_kwargs)` {#Categorical.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.Categorical.dtype` {#Categorical.dtype} @@ -4388,6 +4549,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. 
+- - - + +#### `tf.contrib.distributions.Chi2.copy(**override_parameters_kwargs)` {#Chi2.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.Chi2.df` {#Chi2.df} @@ -4951,6 +5135,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.Chi2WithAbsDf.copy(**override_parameters_kwargs)` {#Chi2WithAbsDf.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.Chi2WithAbsDf.df` {#Chi2WithAbsDf.df} @@ -5536,6 +5743,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.Exponential.copy(**override_parameters_kwargs)` {#Exponential.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. 
+ +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.Exponential.dtype` {#Exponential.dtype} @@ -6099,6 +6329,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.ExponentialWithSoftplusLam.copy(**override_parameters_kwargs)` {#ExponentialWithSoftplusLam.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.ExponentialWithSoftplusLam.dtype` {#ExponentialWithSoftplusLam.dtype} @@ -6711,6 +6964,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.Gamma.copy(**override_parameters_kwargs)` {#Gamma.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.Gamma.dtype` {#Gamma.dtype} @@ -7267,6 +7543,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. 
+- - - + +#### `tf.contrib.distributions.GammaWithSoftplusAlphaBeta.copy(**override_parameters_kwargs)` {#GammaWithSoftplusAlphaBeta.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.GammaWithSoftplusAlphaBeta.dtype` {#GammaWithSoftplusAlphaBeta.dtype} @@ -7868,6 +8167,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.InverseGamma.copy(**override_parameters_kwargs)` {#InverseGamma.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.InverseGamma.dtype` {#InverseGamma.dtype} @@ -8434,6 +8756,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.InverseGammaWithSoftplusAlphaBeta.copy(**override_parameters_kwargs)` {#InverseGammaWithSoftplusAlphaBeta.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. 
+ +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.InverseGammaWithSoftplusAlphaBeta.dtype` {#InverseGammaWithSoftplusAlphaBeta.dtype} @@ -9019,6 +9364,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.Laplace.copy(**override_parameters_kwargs)` {#Laplace.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.Laplace.dtype` {#Laplace.dtype} @@ -9553,6 +9921,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.LaplaceWithSoftplusScale.copy(**override_parameters_kwargs)` {#LaplaceWithSoftplusScale.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. 
+ + - - - #### `tf.contrib.distributions.LaplaceWithSoftplusScale.dtype` {#LaplaceWithSoftplusScale.dtype} @@ -10151,6 +10542,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.Normal.copy(**override_parameters_kwargs)` {#Normal.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.Normal.dtype` {#Normal.dtype} @@ -10685,6 +11099,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.NormalWithSoftplusSigma.copy(**override_parameters_kwargs)` {#NormalWithSoftplusSigma.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.NormalWithSoftplusSigma.dtype` {#NormalWithSoftplusSigma.dtype} @@ -11243,6 +11680,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.Poisson.copy(**override_parameters_kwargs)` {#Poisson.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. 
+ +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.Poisson.dtype` {#Poisson.dtype} @@ -11862,6 +12322,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.StudentT.copy(**override_parameters_kwargs)` {#StudentT.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.StudentT.df` {#StudentT.df} @@ -12419,6 +12902,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.StudentTWithAbsDfSoftplusSigma.copy(**override_parameters_kwargs)` {#StudentTWithAbsDfSoftplusSigma.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. 
+ + - - - #### `tf.contrib.distributions.StudentTWithAbsDfSoftplusSigma.df` {#StudentTWithAbsDfSoftplusSigma.df} @@ -13032,6 +13538,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.Uniform.copy(**override_parameters_kwargs)` {#Uniform.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.Uniform.dtype` {#Uniform.dtype} @@ -13633,6 +14162,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiag.copy(**override_parameters_kwargs)` {#MultivariateNormalDiag.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.MultivariateNormalDiag.dtype` {#MultivariateNormalDiag.dtype} @@ -14274,6 +14826,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.MultivariateNormalFull.copy(**override_parameters_kwargs)` {#MultivariateNormalFull.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. 
+ +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.MultivariateNormalFull.dtype` {#MultivariateNormalFull.dtype} @@ -14924,6 +15499,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.MultivariateNormalCholesky.copy(**override_parameters_kwargs)` {#MultivariateNormalCholesky.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.MultivariateNormalCholesky.dtype` {#MultivariateNormalCholesky.dtype} @@ -15600,6 +16198,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.copy(**override_parameters_kwargs)` {#MultivariateNormalDiagPlusVDVT.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. 
+ + - - - #### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.dtype` {#MultivariateNormalDiagPlusVDVT.dtype} @@ -16180,6 +16801,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagWithSoftplusStDev.copy(**override_parameters_kwargs)` {#MultivariateNormalDiagWithSoftplusStDev.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.MultivariateNormalDiagWithSoftplusStDev.dtype` {#MultivariateNormalDiagWithSoftplusStDev.dtype} @@ -16920,6 +17564,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.Dirichlet.copy(**override_parameters_kwargs)` {#Dirichlet.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.Dirichlet.dtype` {#Dirichlet.dtype} @@ -17576,6 +18243,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.DirichletMultinomial.copy(**override_parameters_kwargs)` {#DirichletMultinomial.copy} + +Creates a deep copy of the distribution. 
+ +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.DirichletMultinomial.dtype` {#DirichletMultinomial.dtype} @@ -18247,6 +18937,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.Multinomial.copy(**override_parameters_kwargs)` {#Multinomial.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.Multinomial.dtype` {#Multinomial.dtype} @@ -18906,6 +19619,29 @@ cdf(x) := P[X <= x] Boolean indicating if `Tensor` input/outputs are Cholesky factorized. +- - - + +#### `tf.contrib.distributions.WishartCholesky.copy(**override_parameters_kwargs)` {#WishartCholesky.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. 
+ +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.WishartCholesky.df` {#WishartCholesky.df} @@ -19550,6 +20286,29 @@ cdf(x) := P[X <= x] Boolean indicating if `Tensor` input/outputs are Cholesky factorized. +- - - + +#### `tf.contrib.distributions.WishartFull.copy(**override_parameters_kwargs)` {#WishartFull.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.WishartFull.df` {#WishartFull.df} @@ -20227,8 +20986,8 @@ Additional documentation from `TransformedDistribution`: ##### `condition_kwargs`: -* `distribution_kwargs`: Python dictionary of arg names/values forwarded to the distribution. * `bijector_kwargs`: Python dictionary of arg names/values forwarded to the bijector. +* `distribution_kwargs`: Python dictionary of arg names/values forwarded to the distribution. ##### Args: @@ -20244,6 +21003,29 @@ Additional documentation from `TransformedDistribution`: values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.TransformedDistribution.copy(**override_parameters_kwargs)` {#TransformedDistribution.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. 
+ +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.TransformedDistribution.distribution` {#TransformedDistribution.distribution} @@ -20345,8 +21127,8 @@ Additional documentation from `TransformedDistribution`: ##### `condition_kwargs`: -* `distribution_kwargs`: Python dictionary of arg names/values forwarded to the distribution. * `bijector_kwargs`: Python dictionary of arg names/values forwarded to the bijector. +* `distribution_kwargs`: Python dictionary of arg names/values forwarded to the distribution. ##### Args: @@ -20429,8 +21211,8 @@ Implements `(log o p o g^{-1})(y) + (log o det o J o g^{-1})(y)`, ##### `condition_kwargs`: -* `distribution_kwargs`: Python dictionary of arg names/values forwarded to the distribution. * `bijector_kwargs`: Python dictionary of arg names/values forwarded to the bijector. +* `distribution_kwargs`: Python dictionary of arg names/values forwarded to the distribution. ##### Args: @@ -20468,8 +21250,8 @@ Additional documentation from `TransformedDistribution`: ##### `condition_kwargs`: -* `distribution_kwargs`: Python dictionary of arg names/values forwarded to the distribution. * `bijector_kwargs`: Python dictionary of arg names/values forwarded to the bijector. +* `distribution_kwargs`: Python dictionary of arg names/values forwarded to the distribution. ##### Args: @@ -20621,8 +21403,8 @@ Implements `p(g^{-1}(y)) det|J(g^{-1}(y))|`, where `g^{-1}` is the ##### `condition_kwargs`: -* `distribution_kwargs`: Python dictionary of arg names/values forwarded to the distribution. * `bijector_kwargs`: Python dictionary of arg names/values forwarded to the bijector. 
+* `distribution_kwargs`: Python dictionary of arg names/values forwarded to the distribution. ##### Args: @@ -20675,8 +21457,8 @@ Samples from the base distribution and then passes through ##### `condition_kwargs`: -* `distribution_kwargs`: Python dictionary of arg names/values forwarded to the distribution. * `bijector_kwargs`: Python dictionary of arg names/values forwarded to the bijector. +* `distribution_kwargs`: Python dictionary of arg names/values forwarded to the distribution. ##### Args: @@ -20724,8 +21506,8 @@ Additional documentation from `TransformedDistribution`: ##### `condition_kwargs`: -* `distribution_kwargs`: Python dictionary of arg names/values forwarded to the distribution. * `bijector_kwargs`: Python dictionary of arg names/values forwarded to the bijector. +* `distribution_kwargs`: Python dictionary of arg names/values forwarded to the distribution. ##### Args: @@ -20931,6 +21713,29 @@ The base distribution's `cdf` method must be defined on `y - 1`. values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.QuantizedDistribution.copy(**override_parameters_kwargs)` {#QuantizedDistribution.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.QuantizedDistribution.distribution` {#QuantizedDistribution.distribution} @@ -21612,6 +22417,29 @@ cdf(x) := P[X <= x] +- - - + +#### `tf.contrib.distributions.Mixture.copy(**override_parameters_kwargs)` {#Mixture.copy} + +Creates a deep copy of the distribution. 
+ +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.Mixture.dtype` {#Mixture.dtype} @@ -22403,6 +23231,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.beta_aa.copy(**override_parameters_kwargs)` {#beta_aa.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.beta_aa.dtype` {#beta_aa.dtype} @@ -22967,6 +23818,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.beta_bb.copy(**override_parameters_kwargs)` {#beta_bb.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. 
+ + - - - #### `tf.contrib.distributions.beta_bb.dtype` {#beta_bb.dtype} diff --git a/tensorflow/g3doc/api_docs/python/contrib.layers.md b/tensorflow/g3doc/api_docs/python/contrib.layers.md index 72c61191485..604c215b213 100644 --- a/tensorflow/g3doc/api_docs/python/contrib.layers.md +++ b/tensorflow/g3doc/api_docs/python/contrib.layers.md @@ -977,7 +977,11 @@ Various ways of passing optimizers, include: * `gradient_multipliers`: dict of variables or variable names to floats. If present, gradients for specified variables will be multiplied by given constant. -* `clip_gradients`: float or `None`, clips gradients by this value. +* `clip_gradients`: float, callable or `None`. If a float is provided, a global + clipping is applied to prevent the norm of the gradient from exceeding this + value. Alternatively, a callable can be provided e.g.: adaptive_clipping. + This callable takes a `list` of `(gradients, variables)` `tuple`s and + returns the same thing with the gradients modified. * `learning_rate_decay_fn`: function, takes `learning_rate` and `global_step` `Tensor`s, returns `Tensor`. Can be used to implement any learning rate decay @@ -1008,6 +1012,7 @@ Various ways of passing optimizers, include: * `global_step` is an invalid type or shape. * `learning_rate` is an invalid type or value. * `optimizer` is wrong type. + * `clip_gradients` is not float or callable. * `learning_rate` and `learning_rate_decay_fn` are supplied, but no `global_step` is available. diff --git a/tensorflow/g3doc/api_docs/python/contrib.metrics.md b/tensorflow/g3doc/api_docs/python/contrib.metrics.md index 2e159c475ce..326a90b2c40 100644 --- a/tensorflow/g3doc/api_docs/python/contrib.metrics.md +++ b/tensorflow/g3doc/api_docs/python/contrib.metrics.md @@ -86,11 +86,6 @@ Certain metrics, such as streaming_mean or streaming_accuracy, can be weighted via a `weights` argument.
The `weights` tensor must be the same size as the labels and predictions tensors and results in a weighted average of the metric. -Other metrics, such as streaming_recall, streaming_precision, and streaming_auc, -are not well defined with regard to weighted samples. However, a binary -`ignore_mask` argument can be used to ignore certain values at graph executation -time. - ## Metric `Ops` - - - @@ -191,104 +186,100 @@ If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. - - - -### `tf.contrib.metrics.streaming_recall(*args, **kwargs)` {#streaming_recall} +### `tf.contrib.metrics.streaming_recall(predictions, labels, weights=None, metrics_collections=None, updates_collections=None, name=None)` {#streaming_recall} -Computes the recall of the predictions with respect to the labels. (deprecated arguments) +Computes the recall of the predictions with respect to the labels. -SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-10-19. -Instructions for updating: -`ignore_mask` is being deprecated. Instead use `weights` with values 0.0 and 1.0 to mask values. For example, `weights=tf.logical_not(mask)`. +The `streaming_recall` function creates two local variables, `true_positives` +and `false_negatives`, that are used to compute the recall. This value is +ultimately returned as `recall`, an idempotent operation that simply divides +`true_positives` by the sum of `true_positives` and `false_negatives`. - The `streaming_recall` function creates two local variables, `true_positives` - and `false_negatives`, that are used to compute the recall. This value is - ultimately returned as `recall`, an idempotent operation that simply divides - `true_positives` by the sum of `true_positives` and `false_negatives`. +For estimation of the metric over a stream of data, the function creates an +`update_op` that updates these variables and returns the `recall`. `update_op` +weights each prediction by the corresponding value in `weights`. 
- For estimation of the metric over a stream of data, the function creates an - `update_op` that updates these variables and returns the `recall`. `update_op` - weights each prediction by the corresponding value in `weights`. +If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. - If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. - Alternatively, if `ignore_mask` is not `None`, then mask values where - `ignore_mask` is `True`. +##### Args: - Args: - predictions: The predicted values, a `bool` `Tensor` of arbitrary shape. - labels: The ground truth values, a `bool` `Tensor` whose dimensions must - match `predictions`. - ignore_mask: An optional, `bool` `Tensor` whose shape matches `predictions`. - weights: An optional `Tensor` whose shape is broadcastable to `predictions`. - metrics_collections: An optional list of collections that `recall` should - be added to. - updates_collections: An optional list of collections that `update_op` should - be added to. - name: An optional variable_scope name. - Returns: - recall: Scalar float `Tensor` with the value of `true_positives` divided - by the sum of `true_positives` and `false_negatives`. - update_op: `Operation` that increments `true_positives` and - `false_negatives` variables appropriately and whose value matches - `recall`. +* `predictions`: The predicted values, a `bool` `Tensor` of arbitrary shape. +* `labels`: The ground truth values, a `bool` `Tensor` whose dimensions must + match `predictions`. +* `weights`: An optional `Tensor` whose shape is broadcastable to `predictions`. +* `metrics_collections`: An optional list of collections that `recall` should + be added to. +* `updates_collections`: An optional list of collections that `update_op` should + be added to. +* `name`: An optional variable_scope name. 
- Raises: - ValueError: If `predictions` and `labels` have mismatched shapes, or if - `ignore_mask` is not `None` and its shape doesn't match `predictions`, or - if `weights` is not `None` and its shape doesn't match `predictions`, or - if either `metrics_collections` or `updates_collections` are not a list or - tuple. +##### Returns: + + +* `recall`: Scalar float `Tensor` with the value of `true_positives` divided + by the sum of `true_positives` and `false_negatives`. +* `update_op`: `Operation` that increments `true_positives` and + `false_negatives` variables appropriately and whose value matches + `recall`. + +##### Raises: + + +* `ValueError`: If `predictions` and `labels` have mismatched shapes, or if + `weights` is not `None` and its shape doesn't match `predictions`, or if + either `metrics_collections` or `updates_collections` are not a list or + tuple. - - - -### `tf.contrib.metrics.streaming_precision(*args, **kwargs)` {#streaming_precision} +### `tf.contrib.metrics.streaming_precision(predictions, labels, weights=None, metrics_collections=None, updates_collections=None, name=None)` {#streaming_precision} -Computes the precision of the predictions with respect to the labels. (deprecated arguments) +Computes the precision of the predictions with respect to the labels. -SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-10-19. -Instructions for updating: -`ignore_mask` is being deprecated. Instead use `weights` with values 0.0 and 1.0 to mask values. For example, `weights=tf.logical_not(mask)`. +The `streaming_precision` function creates two local variables, +`true_positives` and `false_positives`, that are used to compute the +precision. This value is ultimately returned as `precision`, an idempotent +operation that simply divides `true_positives` by the sum of `true_positives` +and `false_positives`. 
- The `streaming_precision` function creates two local variables, - `true_positives` and `false_positives`, that are used to compute the - precision. This value is ultimately returned as `precision`, an idempotent - operation that simply divides `true_positives` by the sum of `true_positives` - and `false_positives`. +For estimation of the metric over a stream of data, the function creates an +`update_op` operation that updates these variables and returns the +`precision`. `update_op` weights each prediction by the corresponding value in +`weights`. - For estimation of the metric over a stream of data, the function creates an - `update_op` operation that updates these variables and returns the - `precision`. `update_op` weights each prediction by the corresponding value in - `weights`. +If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. - If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. - Alternatively, if `ignore_mask` is not `None`, then mask values where - `ignore_mask` is `True`. +##### Args: - Args: - predictions: The predicted values, a `bool` `Tensor` of arbitrary shape. - labels: The ground truth values, a `bool` `Tensor` whose dimensions must - match `predictions`. - ignore_mask: An optional, `bool` `Tensor` whose shape matches `predictions`. - weights: An optional `Tensor` whose shape is broadcastable to `predictions`. - metrics_collections: An optional list of collections that `precision` should - be added to. - updates_collections: An optional list of collections that `update_op` should - be added to. - name: An optional variable_scope name. - Returns: - precision: Scalar float `Tensor` with the value of `true_positives` - divided by the sum of `true_positives` and `false_positives`. - update_op: `Operation` that increments `true_positives` and - `false_positives` variables appropriately and whose value matches - `precision`. 
+* `predictions`: The predicted values, a `bool` `Tensor` of arbitrary shape. +* `labels`: The ground truth values, a `bool` `Tensor` whose dimensions must + match `predictions`. +* `weights`: An optional `Tensor` whose shape is broadcastable to `predictions`. +* `metrics_collections`: An optional list of collections that `precision` should + be added to. +* `updates_collections`: An optional list of collections that `update_op` should + be added to. +* `name`: An optional variable_scope name. - Raises: - ValueError: If `predictions` and `labels` have mismatched shapes, or if - `ignore_mask` is not `None` and its shape doesn't match `predictions`, or - if `weights` is not `None` and its shape doesn't match `predictions`, or - if either `metrics_collections` or `updates_collections` are not a list or - tuple. +##### Returns: + + +* `precision`: Scalar float `Tensor` with the value of `true_positives` + divided by the sum of `true_positives` and `false_positives`. +* `update_op`: `Operation` that increments `true_positives` and + `false_positives` variables appropriately and whose value matches + `precision`. + +##### Raises: + + +* `ValueError`: If `predictions` and `labels` have mismatched shapes, or if + `weights` is not `None` and its shape doesn't match `predictions`, or if + either `metrics_collections` or `updates_collections` are not a list or + tuple. - - - @@ -355,16 +346,12 @@ If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. ### `tf.contrib.metrics.streaming_recall_at_k(*args, **kwargs)` {#streaming_recall_at_k} -Computes the recall@k of the predictions with respect to dense labels. (deprecated arguments) (deprecated) +Computes the recall@k of the predictions with respect to dense labels. (deprecated) THIS FUNCTION IS DEPRECATED. It will be removed after 2016-11-08. Instructions for updating: Please use `streaming_sparse_recall_at_k`, and reshape labels from [batch_size] to [batch_size, 1]. -SOME ARGUMENTS ARE DEPRECATED. 
They will be removed after 2016-10-19. -Instructions for updating: -`ignore_mask` is being deprecated. Instead use `weights` with values 0.0 and 1.0 to mask values. For example, `weights=tf.logical_not(mask)`. - The `streaming_recall_at_k` function creates two local variables, `total` and `count`, that are used to compute the recall@k frequency. This frequency is ultimately returned as `recall_at_`: an idempotent operation that simply @@ -379,15 +366,12 @@ Instructions for updating: increments `count` with the reduced sum of `weights`. If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. - Alternatively, if `ignore_mask` is not `None`, then mask values where - `ignore_mask` is `True`. Args: predictions: A floating point tensor of dimension [batch_size, num_classes] labels: A tensor of dimension [batch_size] whose type is in `int32`, `int64`. k: The number of top elements to look at for computing recall. - ignore_mask: An optional, `bool` `Tensor` whose shape matches `predictions`. weights: An optional `Tensor` whose shape is broadcastable to `predictions`. metrics_collections: An optional list of collections that `recall_at_k` should be added to. @@ -403,9 +387,8 @@ Instructions for updating: Raises: ValueError: If `predictions` and `labels` have mismatched shapes, or if - `ignore_mask` is not `None` and its shape doesn't match `predictions`, or - if `weights` is not `None` and its shape doesn't match `predictions`, or - if either `metrics_collections` or `updates_collections` are not a list or + `weights` is not `None` and its shape doesn't match `predictions`, or if + either `metrics_collections` or `updates_collections` are not a list or tuple. @@ -462,56 +445,56 @@ If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. 
- - - -### `tf.contrib.metrics.streaming_mean_iou(*args, **kwargs)` {#streaming_mean_iou} +### `tf.contrib.metrics.streaming_mean_iou(predictions, labels, num_classes, weights=None, metrics_collections=None, updates_collections=None, name=None)` {#streaming_mean_iou} -Calculate per-step mean Intersection-Over-Union (mIOU). (deprecated arguments) +Calculate per-step mean Intersection-Over-Union (mIOU). -SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-10-19. -Instructions for updating: -`ignore_mask` is being deprecated. Instead use `weights` with values 0.0 and 1.0 to mask values. For example, `weights=tf.logical_not(mask)`. +Mean Intersection-Over-Union is a common evaluation metric for +semantic image segmentation, which first computes the IOU for each +semantic class and then computes the average over classes. - Mean Intersection-Over-Union is a common evaluation metric for - semantic image segmentation, which first computes the IOU for each - semantic class and then computes the average over classes. - IOU is defined as follows: - IOU = true_positive / (true_positive + false_positive + false_negative). - The predictions are accumulated in a confusion matrix, weighted by `weights`, - and mIOU is then calculated from it. +##### IOU is defined as follows: - For estimation of the metric over a stream of data, the function creates an - `update_op` operation that updates these variables and returns the `mean_iou`. + IOU = true_positive / (true_positive + false_positive + false_negative). +The predictions are accumulated in a confusion matrix, weighted by `weights`, +and mIOU is then calculated from it. - If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. - Alternatively, if `ignore_mask` is not `None`, then mask values where - `ignore_mask` is `True`. +For estimation of the metric over a stream of data, the function creates an +`update_op` operation that updates these variables and returns the `mean_iou`. 
- Args: - predictions: A tensor of prediction results for semantic labels, whose - shape is [batch size] and type `int32` or `int64`. The tensor will be - flattened, if its rank > 1. - labels: A tensor of ground truth labels with shape [batch size] and of - type `int32` or `int64`. The tensor will be flattened, if its rank > 1. - num_classes: The possible number of labels the prediction task can - have. This value must be provided, since a confusion matrix of - dimension = [num_classes, num_classes] will be allocated. - ignore_mask: An optional, `bool` `Tensor` whose shape matches `predictions`. - weights: An optional `Tensor` whose shape is broadcastable to `predictions`. - metrics_collections: An optional list of collections that `mean_iou` - should be added to. - updates_collections: An optional list of collections `update_op` should be - added to. - name: An optional variable_scope name. +If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. - Returns: - mean_iou: A tensor representing the mean intersection-over-union. - update_op: An operation that increments the confusion matrix. +##### Args: - Raises: - ValueError: If `predictions` and `labels` have mismatched shapes, or if - `ignore_mask` is not `None` and its shape doesn't match `predictions`, or - if `weights` is not `None` and its shape doesn't match `predictions`, or - if either `metrics_collections` or `updates_collections` are not a list or - tuple. + +* `predictions`: A tensor of prediction results for semantic labels, whose + shape is [batch size] and type `int32` or `int64`. The tensor will be + flattened, if its rank > 1. +* `labels`: A tensor of ground truth labels with shape [batch size] and of + type `int32` or `int64`. The tensor will be flattened, if its rank > 1. +* `num_classes`: The possible number of labels the prediction task can + have. This value must be provided, since a confusion matrix of + dimension = [num_classes, num_classes] will be allocated. 
+* `weights`: An optional `Tensor` whose shape is broadcastable to `predictions`. +* `metrics_collections`: An optional list of collections that `mean_iou` + should be added to. +* `updates_collections`: An optional list of collections `update_op` should be + added to. +* `name`: An optional variable_scope name. + +##### Returns: + + +* `mean_iou`: A tensor representing the mean intersection-over-union. +* `update_op`: An operation that increments the confusion matrix. + +##### Raises: + + +* `ValueError`: If `predictions` and `labels` have mismatched shapes, or if + `weights` is not `None` and its shape doesn't match `predictions`, or if + either `metrics_collections` or `updates_collections` are not a list or + tuple. - - - @@ -828,50 +811,48 @@ If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. - - - -### `tf.contrib.metrics.streaming_percentage_less(*args, **kwargs)` {#streaming_percentage_less} +### `tf.contrib.metrics.streaming_percentage_less(values, threshold, weights=None, metrics_collections=None, updates_collections=None, name=None)` {#streaming_percentage_less} -Computes the percentage of values less than the given threshold. (deprecated arguments) +Computes the percentage of values less than the given threshold. -SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-10-19. -Instructions for updating: -`ignore_mask` is being deprecated. Instead use `weights` with values 0.0 and 1.0 to mask values. For example, `weights=tf.logical_not(mask)`. +The `streaming_percentage_less` function creates two local variables, +`total` and `count` that are used to compute the percentage of `values` that +fall below `threshold`. This rate is weighted by `weights`, and it is +ultimately returned as `percentage` which is an idempotent operation that +simply divides `total` by `count`. 
- The `streaming_percentage_less` function creates two local variables, - `total` and `count` that are used to compute the percentage of `values` that - fall below `threshold`. This rate is weighted by `weights`, and it is - ultimately returned as `percentage` which is an idempotent operation that - simply divides `total` by `count`. +For estimation of the metric over a stream of data, the function creates an +`update_op` operation that updates these variables and returns the +`percentage`. - For estimation of the metric over a stream of data, the function creates an - `update_op` operation that updates these variables and returns the - `percentage`. +If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. - If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. - Alternatively, if `ignore_mask` is not `None`, then mask values where - `ignore_mask` is `True`. +##### Args: - Args: - values: A numeric `Tensor` of arbitrary size. - threshold: A scalar threshold. - ignore_mask: An optional, `bool` `Tensor` whose shape matches `values`. - weights: An optional `Tensor` whose shape is broadcastable to `values`. - metrics_collections: An optional list of collections that the metric - value variable should be added to. - updates_collections: An optional list of collections that the metric update - ops should be added to. - name: An optional variable_scope name. - Returns: - percentage: A tensor representing the current mean, the value of `total` - divided by `count`. - update_op: An operation that increments the `total` and `count` variables - appropriately. +* `values`: A numeric `Tensor` of arbitrary size. +* `threshold`: A scalar threshold. +* `weights`: An optional `Tensor` whose shape is broadcastable to `values`. +* `metrics_collections`: An optional list of collections that the metric + value variable should be added to. 
+* `updates_collections`: An optional list of collections that the metric update + ops should be added to. +* `name`: An optional variable_scope name. - Raises: - ValueError: If `ignore_mask` is not `None` and its shape doesn't match - `values`, or if `weights` is not `None` and its shape doesn't match - `values`, or if either `metrics_collections` or `updates_collections` are - not a list or tuple. +##### Returns: + + +* `percentage`: A tensor representing the current mean, the value of `total` + divided by `count`. +* `update_op`: An operation that increments the `total` and `count` variables + appropriately. + +##### Raises: + + +* `ValueError`: If `weights` is not `None` and its shape doesn't match `values`, + or if either `metrics_collections` or `updates_collections` are not a list + or tuple. - - - @@ -991,232 +972,223 @@ If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. - - - -### `tf.contrib.metrics.streaming_sparse_precision_at_k(*args, **kwargs)` {#streaming_sparse_precision_at_k} +### `tf.contrib.metrics.streaming_sparse_precision_at_k(predictions, labels, k, class_id=None, weights=None, metrics_collections=None, updates_collections=None, name=None)` {#streaming_sparse_precision_at_k} -Computes precision@k of the predictions with respect to sparse labels. (deprecated arguments) +Computes precision@k of the predictions with respect to sparse labels. -SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-10-19. -Instructions for updating: -`ignore_mask` is being deprecated. Instead use `weights` with values 0.0 and 1.0 to mask values. For example, `weights=tf.logical_not(mask)`. +If `class_id` is specified, we calculate precision by considering only the + entries in the batch for which `class_id` is in the top-k highest + `predictions`, and computing the fraction of them for which `class_id` is + indeed a correct label. 
+If `class_id` is not specified, we'll calculate precision as how often on + average a class among the top-k classes with the highest predicted values + of a batch entry is correct and can be found in the label for that entry. - If `class_id` is specified, we calculate precision by considering only the - entries in the batch for which `class_id` is in the top-k highest - `predictions`, and computing the fraction of them for which `class_id` is - indeed a correct label. - If `class_id` is not specified, we'll calculate precision as how often on - average a class among the top-k classes with the highest predicted values - of a batch entry is correct and can be found in the label for that entry. +`streaming_sparse_precision_at_k` creates two local variables, +`true_positive_at_` and `false_positive_at_`, that are used to compute +the precision@k frequency. This frequency is ultimately returned as +`precision_at_`: an idempotent operation that simply divides +`true_positive_at_` by total (`true_positive_at_` + +`false_positive_at_`). - `streaming_sparse_precision_at_k` creates two local variables, - `true_positive_at_` and `false_positive_at_`, that are used to compute - the precision@k frequency. This frequency is ultimately returned as - `precision_at_`: an idempotent operation that simply divides - `true_positive_at_` by total (`true_positive_at_` + - `false_positive_at_`). +For estimation of the metric over a stream of data, the function creates an +`update_op` operation that updates these variables and returns the +`precision_at_`. Internally, a `top_k` operation computes a `Tensor` +indicating the top `k` `predictions`. Set operations applied to `top_k` and +`labels` calculate the true positives and false positives weighted by +`weights`. Then `update_op` increments `true_positive_at_` and +`false_positive_at_` using these values. 
- For estimation of the metric over a stream of data, the function creates an - `update_op` operation that updates these variables and returns the - `precision_at_`. Internally, a `top_k` operation computes a `Tensor` - indicating the top `k` `predictions`. Set operations applied to `top_k` and - `labels` calculate the true positives and false positives weighted by - `weights`. Then `update_op` increments `true_positive_at_` and - `false_positive_at_` using these values. +If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. - If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. - Alternatively, if `ignore_mask` is not `None`, then mask values where - `ignore_mask` is `True`. +##### Args: - Args: - predictions: Float `Tensor` with shape [D1, ... DN, num_classes] where - N >= 1. Commonly, N=1 and predictions has shape [batch size, num_classes]. - The final dimension contains the logit values for each class. [D1, ... DN] - must match `labels`. - labels: `int64` `Tensor` or `SparseTensor` with shape - [D1, ... DN, num_labels], where N >= 1 and num_labels is the number of - target classes for the associated prediction. Commonly, N=1 and `labels` - has shape [batch_size, num_labels]. [D1, ... DN] must match - `predictions`. Values should be in range [0, num_classes), where - num_classes is the last dimension of `predictions`. Values outside this - range are ignored. - k: Integer, k for @k metric. - class_id: Integer class ID for which we want binary metrics. This should be - in range [0, num_classes], where num_classes is the last dimension of - `predictions`. If `class_id` is outside this range, the method returns - NAN. - ignore_mask: An optional, `bool` `Tensor` whose shape is broadcastable to - the the first [D1, ... DN] dimensions of `predictions` and `labels`. - weights: An optional `Tensor` whose shape is broadcastable to the the first - [D1, ... DN] dimensions of `predictions` and `labels`. 
- metrics_collections: An optional list of collections that values should - be added to. - updates_collections: An optional list of collections that updates should - be added to. - name: Name of new update operation, and namespace for other dependent ops. - Returns: - precision: Scalar `float64` `Tensor` with the value of `true_positives` - divided by the sum of `true_positives` and `false_positives`. - update_op: `Operation` that increments `true_positives` and - `false_positives` variables appropriately, and whose value matches - `precision`. +* `predictions`: Float `Tensor` with shape [D1, ... DN, num_classes] where + N >= 1. Commonly, N=1 and predictions has shape [batch size, num_classes]. + The final dimension contains the logit values for each class. [D1, ... DN] + must match `labels`. +* `labels`: `int64` `Tensor` or `SparseTensor` with shape + [D1, ... DN, num_labels], where N >= 1 and num_labels is the number of + target classes for the associated prediction. Commonly, N=1 and `labels` + has shape [batch_size, num_labels]. [D1, ... DN] must match + `predictions`. Values should be in range [0, num_classes), where + num_classes is the last dimension of `predictions`. Values outside this + range are ignored. +* `k`: Integer, k for @k metric. +* `class_id`: Integer class ID for which we want binary metrics. This should be + in range [0, num_classes], where num_classes is the last dimension of + `predictions`. If `class_id` is outside this range, the method returns + NAN. +* `weights`: An optional `Tensor` whose shape is broadcastable to the the first + [D1, ... DN] dimensions of `predictions` and `labels`. +* `metrics_collections`: An optional list of collections that values should + be added to. +* `updates_collections`: An optional list of collections that updates should + be added to. +* `name`: Name of new update operation, and namespace for other dependent ops. 
- Raises: - ValueError: If `ignore_mask` is not `None` and its shape doesn't match - `predictions`, or if `weights` is not `None` and its shape doesn't match - `predictions`, or if either `metrics_collections` or `updates_collections` - are not a list or tuple. +##### Returns: + + +* `precision`: Scalar `float64` `Tensor` with the value of `true_positives` + divided by the sum of `true_positives` and `false_positives`. +* `update_op`: `Operation` that increments `true_positives` and + `false_positives` variables appropriately, and whose value matches + `precision`. + +##### Raises: + + +* `ValueError`: If `weights` is not `None` and its shape doesn't match + `predictions`, or if either `metrics_collections` or `updates_collections` + are not a list or tuple. - - - -### `tf.contrib.metrics.streaming_sparse_precision_at_top_k(*args, **kwargs)` {#streaming_sparse_precision_at_top_k} +### `tf.contrib.metrics.streaming_sparse_precision_at_top_k(top_k_predictions, labels, class_id=None, weights=None, metrics_collections=None, updates_collections=None, name=None)` {#streaming_sparse_precision_at_top_k} -Computes precision@k of top-k predictions with respect to sparse labels. (deprecated arguments) +Computes precision@k of top-k predictions with respect to sparse labels. -SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-10-19. -Instructions for updating: -`ignore_mask` is being deprecated. Instead use `weights` with values 0.0 and 1.0 to mask values. For example, `weights=tf.logical_not(mask)`. +If `class_id` is specified, we calculate precision by considering only the + entries in the batch for which `class_id` is in the top-k highest + `predictions`, and computing the fraction of them for which `class_id` is + indeed a correct label. +If `class_id` is not specified, we'll calculate precision as how often on + average a class among the top-k classes with the highest predicted values + of a batch entry is correct and can be found in the label for that entry. 
- If `class_id` is specified, we calculate precision by considering only the - entries in the batch for which `class_id` is in the top-k highest - `predictions`, and computing the fraction of them for which `class_id` is - indeed a correct label. - If `class_id` is not specified, we'll calculate precision as how often on - average a class among the top-k classes with the highest predicted values - of a batch entry is correct and can be found in the label for that entry. +`streaming_sparse_precision_at_top_k` creates two local variables, +`true_positive_at_k` and `false_positive_at_k`, that are used to compute +the precision@k frequency. This frequency is ultimately returned as +`precision_at_k`: an idempotent operation that simply divides +`true_positive_at_k` by total (`true_positive_at_k` + `false_positive_at_k`). - `streaming_sparse_precision_at_top_k` creates two local variables, - `true_positive_at_k` and `false_positive_at_k`, that are used to compute - the precision@k frequency. This frequency is ultimately returned as - `precision_at_k`: an idempotent operation that simply divides - `true_positive_at_k` by total (`true_positive_at_k` + `false_positive_at_k`). +For estimation of the metric over a stream of data, the function creates an +`update_op` operation that updates these variables and returns the +`precision_at_k`. Internally, set operations applied to `top_k_predictions` +and `labels` calculate the true positives and false positives weighted by +`weights`. Then `update_op` increments `true_positive_at_k` and +`false_positive_at_k` using these values. - For estimation of the metric over a stream of data, the function creates an - `update_op` operation that updates these variables and returns the - `precision_at_k`. Internally, set operations applied to `top_k_predictions` - and `labels` calculate the true positives and false positives weighted by - `weights`. 
Then `update_op` increments `true_positive_at_k` and - `false_positive_at_k` using these values. +If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. - If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. - Alternatively, if `ignore_mask` is not `None`, then mask values where - `ignore_mask` is `True`. +##### Args: - Args: - top_k_predictions: Integer `Tensor` with shape [D1, ... DN, k] where - N >= 1. Commonly, N=1 and top_k_predictions has shape [batch size, k]. - The final dimension contains the indices of top-k labels. [D1, ... DN] - must match `labels`. - labels: `int64` `Tensor` or `SparseTensor` with shape - [D1, ... DN, num_labels], where N >= 1 and num_labels is the number of - target classes for the associated prediction. Commonly, N=1 and `labels` - has shape [batch_size, num_labels]. [D1, ... DN] must match - `top_k_predictions`. Values should be in range [0, num_classes), where - num_classes is the last dimension of `predictions`. Values outside this - range are ignored. - class_id: Integer class ID for which we want binary metrics. This should be - in range [0, num_classes), where num_classes is the last dimension of - `predictions`. If `class_id` is outside this range, the method returns - NAN. - ignore_mask: An optional, `bool` `Tensor` whose shape is broadcastable to - the the first [D1, ... DN] dimensions of `predictions` and `labels`. - weights: An optional `Tensor` whose shape is broadcastable to the the first - [D1, ... DN] dimensions of `predictions` and `labels`. - metrics_collections: An optional list of collections that values should - be added to. - updates_collections: An optional list of collections that updates should - be added to. - name: Name of new update operation, and namespace for other dependent ops. - Returns: - precision: Scalar `float64` `Tensor` with the value of `true_positives` - divided by the sum of `true_positives` and `false_positives`. 
- update_op: `Operation` that increments `true_positives` and - `false_positives` variables appropriately, and whose value matches - `precision`. +* `top_k_predictions`: Integer `Tensor` with shape [D1, ... DN, k] where + N >= 1. Commonly, N=1 and top_k_predictions has shape [batch size, k]. + The final dimension contains the indices of top-k labels. [D1, ... DN] + must match `labels`. +* `labels`: `int64` `Tensor` or `SparseTensor` with shape + [D1, ... DN, num_labels], where N >= 1 and num_labels is the number of + target classes for the associated prediction. Commonly, N=1 and `labels` + has shape [batch_size, num_labels]. [D1, ... DN] must match + `top_k_predictions`. Values should be in range [0, num_classes), where + num_classes is the last dimension of `predictions`. Values outside this + range are ignored. +* `class_id`: Integer class ID for which we want binary metrics. This should be + in range [0, num_classes), where num_classes is the last dimension of + `predictions`. If `class_id` is outside this range, the method returns + NAN. +* `weights`: An optional `Tensor` whose shape is broadcastable to the the first + [D1, ... DN] dimensions of `predictions` and `labels`. +* `metrics_collections`: An optional list of collections that values should + be added to. +* `updates_collections`: An optional list of collections that updates should + be added to. +* `name`: Name of new update operation, and namespace for other dependent ops. - Raises: - ValueError: If `ignore_mask` is not `None` and its shape doesn't match - `predictions`, or if `weights` is not `None` and its shape doesn't match - `predictions`, or if either `metrics_collections` or `updates_collections` - are not a list or tuple. - ValueError: If `top_k_predictions` has rank < 2. +##### Returns: + + +* `precision`: Scalar `float64` `Tensor` with the value of `true_positives` + divided by the sum of `true_positives` and `false_positives`. 
+* `update_op`: `Operation` that increments `true_positives` and + `false_positives` variables appropriately, and whose value matches + `precision`. + +##### Raises: + + +* `ValueError`: If `weights` is not `None` and its shape doesn't match + `predictions`, or if either `metrics_collections` or `updates_collections` + are not a list or tuple. +* `ValueError`: If `top_k_predictions` has rank < 2. - - - -### `tf.contrib.metrics.streaming_sparse_recall_at_k(*args, **kwargs)` {#streaming_sparse_recall_at_k} +### `tf.contrib.metrics.streaming_sparse_recall_at_k(predictions, labels, k, class_id=None, weights=None, metrics_collections=None, updates_collections=None, name=None)` {#streaming_sparse_recall_at_k} -Computes recall@k of the predictions with respect to sparse labels. (deprecated arguments) +Computes recall@k of the predictions with respect to sparse labels. -SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-10-19. -Instructions for updating: -`ignore_mask` is being deprecated. Instead use `weights` with values 0.0 and 1.0 to mask values. For example, `weights=tf.logical_not(mask)`. +If `class_id` is specified, we calculate recall by considering only the + entries in the batch for which `class_id` is in the label, and computing + the fraction of them for which `class_id` is in the top-k `predictions`. +If `class_id` is not specified, we'll calculate recall as how often on + average a class among the labels of a batch entry is in the top-k + `predictions`. - If `class_id` is specified, we calculate recall by considering only the - entries in the batch for which `class_id` is in the label, and computing - the fraction of them for which `class_id` is in the top-k `predictions`. - If `class_id` is not specified, we'll calculate recall as how often on - average a class among the labels of a batch entry is in the top-k - `predictions`. 
+`streaming_sparse_recall_at_k` creates two local variables, +`true_positive_at_` and `false_negative_at_`, that are used to compute +the recall_at_k frequency. This frequency is ultimately returned as +`recall_at_`: an idempotent operation that simply divides +`true_positive_at_` by total (`true_positive_at_` + +`false_negative_at_`). - `streaming_sparse_recall_at_k` creates two local variables, - `true_positive_at_` and `false_negative_at_`, that are used to compute - the recall_at_k frequency. This frequency is ultimately returned as - `recall_at_`: an idempotent operation that simply divides - `true_positive_at_` by total (`true_positive_at_` + - `false_negative_at_`). +For estimation of the metric over a stream of data, the function creates an +`update_op` operation that updates these variables and returns the +`recall_at_`. Internally, a `top_k` operation computes a `Tensor` +indicating the top `k` `predictions`. Set operations applied to `top_k` and +`labels` calculate the true positives and false negatives weighted by +`weights`. Then `update_op` increments `true_positive_at_` and +`false_negative_at_` using these values. - For estimation of the metric over a stream of data, the function creates an - `update_op` operation that updates these variables and returns the - `recall_at_`. Internally, a `top_k` operation computes a `Tensor` - indicating the top `k` `predictions`. Set operations applied to `top_k` and - `labels` calculate the true positives and false negatives weighted by - `weights`. Then `update_op` increments `true_positive_at_` and - `false_negative_at_` using these values. +If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. - If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. - Alternatively, if `ignore_mask` is not `None`, then mask values where - `ignore_mask` is `True`. +##### Args: - Args: - predictions: Float `Tensor` with shape [D1, ... DN, num_classes] where - N >= 1. 
Commonly, N=1 and predictions has shape [batch size, num_classes]. - The final dimension contains the logit values for each class. [D1, ... DN] - must match `labels`. - labels: `int64` `Tensor` or `SparseTensor` with shape - [D1, ... DN, num_labels], where N >= 1 and num_labels is the number of - target classes for the associated prediction. Commonly, N=1 and `labels` - has shape [batch_size, num_labels]. [D1, ... DN] must match `predictions`. - Values should be in range [0, num_classes), where num_classes is the last - dimension of `predictions`. Values outside this range always count - towards `false_negative_at_`. - k: Integer, k for @k metric. - class_id: Integer class ID for which we want binary metrics. This should be - in range [0, num_classes), where num_classes is the last dimension of - `predictions`. If class_id is outside this range, the method returns NAN. - ignore_mask: An optional, `bool` `Tensor` whose shape is broadcastable to - the the first [D1, ... DN] dimensions of `predictions` and `labels`. - weights: An optional `Tensor` whose shape is broadcastable to the the first - [D1, ... DN] dimensions of `predictions` and `labels`. - metrics_collections: An optional list of collections that values should - be added to. - updates_collections: An optional list of collections that updates should - be added to. - name: Name of new update operation, and namespace for other dependent ops. - Returns: - recall: Scalar `float64` `Tensor` with the value of `true_positives` divided - by the sum of `true_positives` and `false_negatives`. - update_op: `Operation` that increments `true_positives` and - `false_negatives` variables appropriately, and whose value matches - `recall`. +* `predictions`: Float `Tensor` with shape [D1, ... DN, num_classes] where + N >= 1. Commonly, N=1 and predictions has shape [batch size, num_classes]. + The final dimension contains the logit values for each class. [D1, ... DN] + must match `labels`. 
+* `labels`: `int64` `Tensor` or `SparseTensor` with shape + [D1, ... DN, num_labels], where N >= 1 and num_labels is the number of + target classes for the associated prediction. Commonly, N=1 and `labels` + has shape [batch_size, num_labels]. [D1, ... DN] must match `predictions`. + Values should be in range [0, num_classes), where num_classes is the last + dimension of `predictions`. Values outside this range always count + towards `false_negative_at_`. +* `k`: Integer, k for @k metric. +* `class_id`: Integer class ID for which we want binary metrics. This should be + in range [0, num_classes), where num_classes is the last dimension of + `predictions`. If class_id is outside this range, the method returns NAN. +* `weights`: An optional `Tensor` whose shape is broadcastable to the the first + [D1, ... DN] dimensions of `predictions` and `labels`. +* `metrics_collections`: An optional list of collections that values should + be added to. +* `updates_collections`: An optional list of collections that updates should + be added to. +* `name`: Name of new update operation, and namespace for other dependent ops. - Raises: - ValueError: If `ignore_mask` is not `None` and its shape doesn't match - `predictions`, or if `weights` is not `None` and its shape doesn't match - `predictions`, or if either `metrics_collections` or `updates_collections` - are not a list or tuple. +##### Returns: + + +* `recall`: Scalar `float64` `Tensor` with the value of `true_positives` divided + by the sum of `true_positives` and `false_negatives`. +* `update_op`: `Operation` that increments `true_positives` and + `false_negatives` variables appropriately, and whose value matches + `recall`. + +##### Raises: + + +* `ValueError`: If `weights` is not `None` and its shape doesn't match + `predictions`, or if either `metrics_collections` or `updates_collections` + are not a list or tuple. 
- - - diff --git a/tensorflow/g3doc/api_docs/python/contrib.rnn.md b/tensorflow/g3doc/api_docs/python/contrib.rnn.md index f0d70436a5e..1d59c1c6304 100644 --- a/tensorflow/g3doc/api_docs/python/contrib.rnn.md +++ b/tensorflow/g3doc/api_docs/python/contrib.rnn.md @@ -744,7 +744,7 @@ the shapes `[batch_size x s]` for each s in `state_size`. Basic attention cell wrapper. -Implementation based on https://arxiv.org/pdf/1601.06733.pdf. +Implementation based on https://arxiv.org/abs/1409.0473. - - - #### `tf.contrib.rnn.AttentionCellWrapper.__call__(inputs, state, scope=None)` {#AttentionCellWrapper.__call__} diff --git a/tensorflow/g3doc/api_docs/python/contrib.training.md b/tensorflow/g3doc/api_docs/python/contrib.training.md index 8b22edf7c1a..935c163e060 100644 --- a/tensorflow/g3doc/api_docs/python/contrib.training.md +++ b/tensorflow/g3doc/api_docs/python/contrib.training.md @@ -726,8 +726,9 @@ It should be run in a separate thread via e.g. a `QueueRunner`. To resample data with replacement on a per-example basis, use ['rejection_sample'](#rejection_sample) or ['resample_at_rate'](#resample_at_rate). For `rejection_sample`, provide -a boolean Tensor describing whether to accept or reject. For `resample_at_rate`, -providing the desired rate for each example. If you wish to specify relative +a boolean Tensor describing whether to accept or reject. Resulting batch sizes +are always the same. For `resample_at_rate`, provide the desired rate for each +example. Resulting batch sizes may vary. If you wish to specify relative rates, rather than absolute ones, use ['weighted_resample'](#weighted_resample) (which also returns the actual resampling rate used for each output example). 
diff --git a/tensorflow/g3doc/api_docs/python/functional_ops.md b/tensorflow/g3doc/api_docs/python/functional_ops.md index 338f315b553..3102cad0e55 100644 --- a/tensorflow/g3doc/api_docs/python/functional_ops.md +++ b/tensorflow/g3doc/api_docs/python/functional_ops.md @@ -41,6 +41,22 @@ Furthermore, `fn` may emit a different structure than its input. For example, the `dtype` parameter is not optional: `dtype` must be a type or (possibly nested) tuple of types matching the output of `fn`. +To apply a functional operation to the nonzero elements of a SparseTensor +one of the following methods is recommended. First, if the function is +expressible as TensorFlow ops, use + +```python + result = SparseTensor(input.indices, fn(input.values), input.shape) +``` + +If, however, the function is not expressible as a TensorFlow op, then use + +```python +result = SparseTensor(input.indices, map_fn(fn, input.values), input.shape) +``` + +instead. + ##### Args: @@ -71,7 +87,7 @@ nested) tuple of types matching the output of `fn`. * `TypeError`: if `fn` is not callable or the structure of the output of - `fn` and `dtype` do not match. + `fn` and `dtype` do not match, or if elems is a SparseTensor. * `ValueError`: if the lengths of the output of `fn` and `dtype` do not match. ##### Examples: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.Bernoulli.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.Bernoulli.md index 7338070ba5e..bb563579927 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.Bernoulli.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.Bernoulli.md @@ -102,6 +102,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.Bernoulli.copy(**override_parameters_kwargs)` {#Bernoulli.copy} + +Creates a deep copy of the distribution. 
+ +Note: the copy distribution may continue to depend on the original +initialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` initialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.Bernoulli.dtype` {#Bernoulli.dtype} diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.Chi2WithAbsDf.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.Chi2WithAbsDf.md index 551713320e6..7b99144e983 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.Chi2WithAbsDf.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.Chi2WithAbsDf.md @@ -87,6 +87,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.Chi2WithAbsDf.copy(**override_parameters_kwargs)` {#Chi2WithAbsDf.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +initialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` initialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`.
+ + - - - #### `tf.contrib.distributions.Chi2WithAbsDf.df` {#Chi2WithAbsDf.df} diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.Dirichlet.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.Dirichlet.md index 0bc2ed75745..92ebb7b3a79 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.Dirichlet.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.Dirichlet.md @@ -174,6 +174,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.Dirichlet.copy(**override_parameters_kwargs)` {#Dirichlet.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +initialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` initialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.Dirichlet.dtype` {#Dirichlet.dtype} diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.Distribution.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.Distribution.md index edbf045d475..a85e6bed2b6 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.Distribution.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.Distribution.md @@ -213,6 +213,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.Distribution.copy(**override_parameters_kwargs)` {#Distribution.copy} + +Creates a deep copy of the distribution.
+ +Note: the copy distribution may continue to depend on the original +initialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` initialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.Distribution.dtype` {#Distribution.dtype} diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.MultivariateNormalCholesky.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.MultivariateNormalCholesky.md index ded3478b77b..d4b6c1c2180 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.MultivariateNormalCholesky.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.MultivariateNormalCholesky.md @@ -143,6 +143,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.MultivariateNormalCholesky.copy(**override_parameters_kwargs)` {#MultivariateNormalCholesky.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +initialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` initialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`.
+ + - - - #### `tf.contrib.distributions.MultivariateNormalCholesky.dtype` {#MultivariateNormalCholesky.dtype} diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.bijector.CholeskyOuterProduct.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.bijector.CholeskyOuterProduct.md new file mode 100644 index 00000000000..5805851802a --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.bijector.CholeskyOuterProduct.md @@ -0,0 +1,223 @@ +Bijector which computes Y = g(X) = X X^T where X is a lower-triangular, positive-diagonal matrix. + +`event_ndims` must be 0 or 2, i.e., scalar or matrix. + +Note: the upper-triangular part of X is ignored (whether or not it's zero). + +Examples: + +```python +bijector.CholeskyOuterProduct(event_ndims=2).forward(x=[[1., 0], [2, 1]]) +# Result: [[1, 2], [2, 5]], i.e., x x^T + +bijector.CholeskyOuterProduct(event_ndims=2).inverse(y=[[1., 2], [2, 5]]) +# Result: [[1, 0], [2, 1]], i.e., chol(y). +``` +- - - + +#### `tf.contrib.distributions.bijector.CholeskyOuterProduct.__init__(event_ndims=2, validate_args=False, name='cholesky_outer_product')` {#CholeskyOuterProduct.__init__} + +Instantiates the `CholeskyOuterProduct` bijector. + +##### Args: + + +* `event_ndims`: `constant` `int32` scalar `Tensor` indicating the number of + dimensions associated with a particular draw from the distribution. Must + be 0 or 2. +* `validate_args`: `Boolean` indicating whether arguments should be checked + for correctness. +* `name`: `String` name given to ops managed by this object. + +##### Raises: + + +* `ValueError`: if event_ndims is neither 0 nor 2. + + +- - - + +#### `tf.contrib.distributions.bijector.CholeskyOuterProduct.dtype` {#CholeskyOuterProduct.dtype} + +dtype of `Tensor`s transformable by this distribution.
+ + +- - - + +#### `tf.contrib.distributions.bijector.CholeskyOuterProduct.forward(x, name='forward', **condition_kwargs)` {#CholeskyOuterProduct.forward} + +Returns the forward `Bijector` evaluation, i.e., X = g(Y). + +##### Args: + + +* `x`: `Tensor`. The input to the "forward" evaluation. +* `name`: The name to give this op. +* `**condition_kwargs`: Named arguments forwarded to subclass implementation. + +##### Returns: + + `Tensor`. + +##### Raises: + + +* `TypeError`: if `self.dtype` is specified and `x.dtype` is not + `self.dtype`. +* `NotImplementedError`: if `_forward` is not implemented. + + +- - - + +#### `tf.contrib.distributions.bijector.CholeskyOuterProduct.forward_log_det_jacobian(x, name='forward_log_det_jacobian', **condition_kwargs)` {#CholeskyOuterProduct.forward_log_det_jacobian} + +Returns both the forward_log_det_jacobian. + +##### Args: + + +* `x`: `Tensor`. The input to the "forward" Jacobian evaluation. +* `name`: The name to give this op. +* `**condition_kwargs`: Named arguments forwarded to subclass implementation. + +##### Returns: + + `Tensor`. + +##### Raises: + + +* `TypeError`: if `self.dtype` is specified and `y.dtype` is not + `self.dtype`. +* `NotImplementedError`: if neither `_forward_log_det_jacobian` + nor {`_inverse`, `_inverse_log_det_jacobian`} are implemented. + + +- - - + +#### `tf.contrib.distributions.bijector.CholeskyOuterProduct.inverse(y, name='inverse', **condition_kwargs)` {#CholeskyOuterProduct.inverse} + +Returns the inverse `Bijector` evaluation, i.e., X = g^{-1}(Y). + +##### Args: + + +* `y`: `Tensor`. The input to the "inverse" evaluation. +* `name`: The name to give this op. +* `**condition_kwargs`: Named arguments forwarded to subclass implementation. + +##### Returns: + + `Tensor`. + +##### Raises: + + +* `TypeError`: if `self.dtype` is specified and `y.dtype` is not + `self.dtype`. +* `NotImplementedError`: if neither `_inverse` nor + `_inverse_and_inverse_log_det_jacobian` are implemented. 
+ + +- - - + +#### `tf.contrib.distributions.bijector.CholeskyOuterProduct.inverse_and_inverse_log_det_jacobian(y, name='inverse_and_inverse_log_det_jacobian', **condition_kwargs)` {#CholeskyOuterProduct.inverse_and_inverse_log_det_jacobian} + +Returns both the inverse evaluation and inverse_log_det_jacobian. + +Enables possibly more efficient calculation when both inverse and +corresponding Jacobian are needed. + +See `inverse()`, `inverse_log_det_jacobian()` for more details. + +##### Args: + + +* `y`: `Tensor`. The input to the "inverse" Jacobian evaluation. +* `name`: The name to give this op. +* `**condition_kwargs`: Named arguments forwarded to subclass implementation. + +##### Returns: + + `Tensor`. + +##### Raises: + + +* `TypeError`: if `self.dtype` is specified and `y.dtype` is not + `self.dtype`. +* `NotImplementedError`: if neither `_inverse_and_inverse_log_det_jacobian` + nor {`_inverse`, `_inverse_log_det_jacobian`} are implemented. + + +- - - + +#### `tf.contrib.distributions.bijector.CholeskyOuterProduct.inverse_log_det_jacobian(y, name='inverse_log_det_jacobian', **condition_kwargs)` {#CholeskyOuterProduct.inverse_log_det_jacobian} + +Returns the (log o det o Jacobian o inverse)(y). + +Mathematically, returns: `log(det(dX/dY))(Y)`. (Recall that: `X=g^{-1}(Y)`.) + +Note that `forward_log_det_jacobian` is the negative of this function. + +##### Args: + + +* `y`: `Tensor`. The input to the "inverse" Jacobian evaluation. +* `name`: The name to give this op. +* `**condition_kwargs`: Named arguments forwarded to subclass implementation. + +##### Returns: + + `Tensor`. + +##### Raises: + + +* `TypeError`: if `self.dtype` is specified and `y.dtype` is not + `self.dtype`. +* `NotImplementedError`: if neither `_inverse_log_det_jacobian` nor + `_inverse_and_inverse_log_det_jacobian` are implemented. 
+ + +- - - + +#### `tf.contrib.distributions.bijector.CholeskyOuterProduct.is_constant_jacobian` {#CholeskyOuterProduct.is_constant_jacobian} + +Returns true iff the Jacobian is not a function of x. + +Note: Jacobian is either constant for both forward and inverse or neither. + +##### Returns: + + `Boolean`. + + +- - - + +#### `tf.contrib.distributions.bijector.CholeskyOuterProduct.name` {#CholeskyOuterProduct.name} + +Returns the string name of this `Bijector`. + + +- - - + +#### `tf.contrib.distributions.bijector.CholeskyOuterProduct.parameters` {#CholeskyOuterProduct.parameters} + +Returns this `Bijector`'s parameters as a name/value dictionary. + + +- - - + +#### `tf.contrib.distributions.bijector.CholeskyOuterProduct.shaper` {#CholeskyOuterProduct.shaper} + +Returns shape object used to manage shape constraints. + + +- - - + +#### `tf.contrib.distributions.bijector.CholeskyOuterProduct.validate_args` {#CholeskyOuterProduct.validate_args} + +Returns True if Tensor arguments will be validated. + + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.rnn.AttentionCellWrapper.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.rnn.AttentionCellWrapper.md index 3bad4deb66d..607aea1f1d6 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.rnn.AttentionCellWrapper.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.rnn.AttentionCellWrapper.md @@ -1,6 +1,6 @@ Basic attention cell wrapper. -Implementation based on https://arxiv.org/pdf/1601.06733.pdf. +Implementation based on https://arxiv.org/abs/1409.0473. 
- - - #### `tf.contrib.rnn.AttentionCellWrapper.__call__(inputs, state, scope=None)` {#AttentionCellWrapper.__call__} diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.distributions.MultivariateNormalDiag.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.distributions.MultivariateNormalDiag.md index 5d656d040d2..739fb106fd9 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.distributions.MultivariateNormalDiag.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.distributions.MultivariateNormalDiag.md @@ -142,6 +142,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiag.copy(**override_parameters_kwargs)` {#MultivariateNormalDiag.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +initialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` initialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`.
+ + - - - #### `tf.contrib.distributions.MultivariateNormalDiag.dtype` {#MultivariateNormalDiag.dtype} diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.distributions.QuantizedDistribution.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.distributions.QuantizedDistribution.md index 6cae002036b..4d16d13397b 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.distributions.QuantizedDistribution.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.distributions.QuantizedDistribution.md @@ -170,6 +170,29 @@ The base distribution's `cdf` method must be defined on `y - 1`. values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.QuantizedDistribution.copy(**override_parameters_kwargs)` {#QuantizedDistribution.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.QuantizedDistribution.distribution` {#QuantizedDistribution.distribution} diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.distributions.StudentT.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.distributions.StudentT.md index 59dd01bf4d5..ec6513731fb 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.distributions.StudentT.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.distributions.StudentT.md @@ -145,6 +145,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. 
+- - - + +#### `tf.contrib.distributions.StudentT.copy(**override_parameters_kwargs)` {#StudentT.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.StudentT.df` {#StudentT.df} diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.distributions.TransformedDistribution.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.distributions.TransformedDistribution.md index 2a5ff418470..4b4f4413b55 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.distributions.TransformedDistribution.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.distributions.TransformedDistribution.md @@ -185,8 +185,8 @@ Additional documentation from `TransformedDistribution`: ##### `condition_kwargs`: -* `distribution_kwargs`: Python dictionary of arg names/values forwarded to the distribution. * `bijector_kwargs`: Python dictionary of arg names/values forwarded to the bijector. +* `distribution_kwargs`: Python dictionary of arg names/values forwarded to the distribution. ##### Args: @@ -202,6 +202,29 @@ Additional documentation from `TransformedDistribution`: values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.TransformedDistribution.copy(**override_parameters_kwargs)` {#TransformedDistribution.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. 
+ +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.TransformedDistribution.distribution` {#TransformedDistribution.distribution} @@ -303,8 +326,8 @@ Additional documentation from `TransformedDistribution`: ##### `condition_kwargs`: -* `distribution_kwargs`: Python dictionary of arg names/values forwarded to the distribution. * `bijector_kwargs`: Python dictionary of arg names/values forwarded to the bijector. +* `distribution_kwargs`: Python dictionary of arg names/values forwarded to the distribution. ##### Args: @@ -387,8 +410,8 @@ Implements `(log o p o g^{-1})(y) + (log o det o J o g^{-1})(y)`, ##### `condition_kwargs`: -* `distribution_kwargs`: Python dictionary of arg names/values forwarded to the distribution. * `bijector_kwargs`: Python dictionary of arg names/values forwarded to the bijector. +* `distribution_kwargs`: Python dictionary of arg names/values forwarded to the distribution. ##### Args: @@ -426,8 +449,8 @@ Additional documentation from `TransformedDistribution`: ##### `condition_kwargs`: -* `distribution_kwargs`: Python dictionary of arg names/values forwarded to the distribution. * `bijector_kwargs`: Python dictionary of arg names/values forwarded to the bijector. +* `distribution_kwargs`: Python dictionary of arg names/values forwarded to the distribution. ##### Args: @@ -579,8 +602,8 @@ Implements `p(g^{-1}(y)) det|J(g^{-1}(y))|`, where `g^{-1}` is the ##### `condition_kwargs`: -* `distribution_kwargs`: Python dictionary of arg names/values forwarded to the distribution. * `bijector_kwargs`: Python dictionary of arg names/values forwarded to the bijector. 
+* `distribution_kwargs`: Python dictionary of arg names/values forwarded to the distribution. ##### Args: @@ -633,8 +656,8 @@ Samples from the base distribution and then passes through ##### `condition_kwargs`: -* `distribution_kwargs`: Python dictionary of arg names/values forwarded to the distribution. * `bijector_kwargs`: Python dictionary of arg names/values forwarded to the bijector. +* `distribution_kwargs`: Python dictionary of arg names/values forwarded to the distribution. ##### Args: @@ -682,8 +705,8 @@ Additional documentation from `TransformedDistribution`: ##### `condition_kwargs`: -* `distribution_kwargs`: Python dictionary of arg names/values forwarded to the distribution. * `bijector_kwargs`: Python dictionary of arg names/values forwarded to the bijector. +* `distribution_kwargs`: Python dictionary of arg names/values forwarded to the distribution. ##### Args: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.metrics.streaming_sparse_recall_at_k.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.metrics.streaming_sparse_recall_at_k.md index 7fd1d30790d..1a1086fac19 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.metrics.streaming_sparse_recall_at_k.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.metrics.streaming_sparse_recall_at_k.md @@ -1,73 +1,70 @@ -### `tf.contrib.metrics.streaming_sparse_recall_at_k(*args, **kwargs)` {#streaming_sparse_recall_at_k} +### `tf.contrib.metrics.streaming_sparse_recall_at_k(predictions, labels, k, class_id=None, weights=None, metrics_collections=None, updates_collections=None, name=None)` {#streaming_sparse_recall_at_k} -Computes recall@k of the predictions with respect to sparse labels. (deprecated arguments) +Computes recall@k of the predictions with respect to sparse labels. -SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-10-19. 
-Instructions for updating: -`ignore_mask` is being deprecated. Instead use `weights` with values 0.0 and 1.0 to mask values. For example, `weights=tf.logical_not(mask)`. +If `class_id` is specified, we calculate recall by considering only the + entries in the batch for which `class_id` is in the label, and computing + the fraction of them for which `class_id` is in the top-k `predictions`. +If `class_id` is not specified, we'll calculate recall as how often on + average a class among the labels of a batch entry is in the top-k + `predictions`. - If `class_id` is specified, we calculate recall by considering only the - entries in the batch for which `class_id` is in the label, and computing - the fraction of them for which `class_id` is in the top-k `predictions`. - If `class_id` is not specified, we'll calculate recall as how often on - average a class among the labels of a batch entry is in the top-k - `predictions`. +`streaming_sparse_recall_at_k` creates two local variables, +`true_positive_at_` and `false_negative_at_`, that are used to compute +the recall_at_k frequency. This frequency is ultimately returned as +`recall_at_`: an idempotent operation that simply divides +`true_positive_at_` by total (`true_positive_at_` + +`false_negative_at_`). - `streaming_sparse_recall_at_k` creates two local variables, - `true_positive_at_` and `false_negative_at_`, that are used to compute - the recall_at_k frequency. This frequency is ultimately returned as - `recall_at_`: an idempotent operation that simply divides - `true_positive_at_` by total (`true_positive_at_` + - `false_negative_at_`). +For estimation of the metric over a stream of data, the function creates an +`update_op` operation that updates these variables and returns the +`recall_at_`. Internally, a `top_k` operation computes a `Tensor` +indicating the top `k` `predictions`. Set operations applied to `top_k` and +`labels` calculate the true positives and false negatives weighted by +`weights`. 
Then `update_op` increments `true_positive_at_` and +`false_negative_at_` using these values. - For estimation of the metric over a stream of data, the function creates an - `update_op` operation that updates these variables and returns the - `recall_at_`. Internally, a `top_k` operation computes a `Tensor` - indicating the top `k` `predictions`. Set operations applied to `top_k` and - `labels` calculate the true positives and false negatives weighted by - `weights`. Then `update_op` increments `true_positive_at_` and - `false_negative_at_` using these values. +If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. - If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. - Alternatively, if `ignore_mask` is not `None`, then mask values where - `ignore_mask` is `True`. +##### Args: - Args: - predictions: Float `Tensor` with shape [D1, ... DN, num_classes] where - N >= 1. Commonly, N=1 and predictions has shape [batch size, num_classes]. - The final dimension contains the logit values for each class. [D1, ... DN] - must match `labels`. - labels: `int64` `Tensor` or `SparseTensor` with shape - [D1, ... DN, num_labels], where N >= 1 and num_labels is the number of - target classes for the associated prediction. Commonly, N=1 and `labels` - has shape [batch_size, num_labels]. [D1, ... DN] must match `predictions`. - Values should be in range [0, num_classes), where num_classes is the last - dimension of `predictions`. Values outside this range always count - towards `false_negative_at_`. - k: Integer, k for @k metric. - class_id: Integer class ID for which we want binary metrics. This should be - in range [0, num_classes), where num_classes is the last dimension of - `predictions`. If class_id is outside this range, the method returns NAN. - ignore_mask: An optional, `bool` `Tensor` whose shape is broadcastable to - the the first [D1, ... DN] dimensions of `predictions` and `labels`. 
- weights: An optional `Tensor` whose shape is broadcastable to the the first - [D1, ... DN] dimensions of `predictions` and `labels`. - metrics_collections: An optional list of collections that values should - be added to. - updates_collections: An optional list of collections that updates should - be added to. - name: Name of new update operation, and namespace for other dependent ops. - Returns: - recall: Scalar `float64` `Tensor` with the value of `true_positives` divided - by the sum of `true_positives` and `false_negatives`. - update_op: `Operation` that increments `true_positives` and - `false_negatives` variables appropriately, and whose value matches - `recall`. +* `predictions`: Float `Tensor` with shape [D1, ... DN, num_classes] where + N >= 1. Commonly, N=1 and predictions has shape [batch size, num_classes]. + The final dimension contains the logit values for each class. [D1, ... DN] + must match `labels`. +* `labels`: `int64` `Tensor` or `SparseTensor` with shape + [D1, ... DN, num_labels], where N >= 1 and num_labels is the number of + target classes for the associated prediction. Commonly, N=1 and `labels` + has shape [batch_size, num_labels]. [D1, ... DN] must match `predictions`. + Values should be in range [0, num_classes), where num_classes is the last + dimension of `predictions`. Values outside this range always count + towards `false_negative_at_`. +* `k`: Integer, k for @k metric. +* `class_id`: Integer class ID for which we want binary metrics. This should be + in range [0, num_classes), where num_classes is the last dimension of + `predictions`. If class_id is outside this range, the method returns NAN. +* `weights`: An optional `Tensor` whose shape is broadcastable to the the first + [D1, ... DN] dimensions of `predictions` and `labels`. +* `metrics_collections`: An optional list of collections that values should + be added to. +* `updates_collections`: An optional list of collections that updates should + be added to. 
+* `name`: Name of new update operation, and namespace for other dependent ops. - Raises: - ValueError: If `ignore_mask` is not `None` and its shape doesn't match - `predictions`, or if `weights` is not `None` and its shape doesn't match - `predictions`, or if either `metrics_collections` or `updates_collections` - are not a list or tuple. +##### Returns: + + +* `recall`: Scalar `float64` `Tensor` with the value of `true_positives` divided + by the sum of `true_positives` and `false_negatives`. +* `update_op`: `Operation` that increments `true_positives` and + `false_negatives` variables appropriately, and whose value matches + `recall`. + +##### Raises: + + +* `ValueError`: If `weights` is not `None` and its shape doesn't match + `predictions`, or if either `metrics_collections` or `updates_collections` + are not a list or tuple. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.map_fn.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.map_fn.md index dd98fd9dd8a..5e49278a182 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.map_fn.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.map_fn.md @@ -23,6 +23,22 @@ Furthermore, `fn` may emit a different structure than its input. For example, the `dtype` parameter is not optional: `dtype` must be a type or (possibly nested) tuple of types matching the output of `fn`. +To apply a functional operation to the nonzero elements of a SparseTensor +one of the following methods is recommended. First, if the function is +expressible as TensorFlow ops, use + +```python + result = SparseTensor(input.indices, fn(input.values), input.shape) +``` + +If, however, the function is not expressible as a TensorFlow op, then use + +```python +result = SparseTensor(input.indices, map_fn(fn, input.values), input.shape) +``` + +instead. + ##### Args: @@ -53,7 +69,7 @@ nested) tuple of types matching the output of `fn`. 
* `TypeError`: if `fn` is not callable or the structure of the output of - `fn` and `dtype` do not match. + `fn` and `dtype` do not match, or if elems is a SparseTensor. * `ValueError`: if the lengths of the output of `fn` and `dtype` do not match. ##### Examples: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.distributions.Categorical.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.distributions.Categorical.md index 87b72a52cdb..db1f68f83a9 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.distributions.Categorical.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.distributions.Categorical.md @@ -133,6 +133,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.Categorical.copy(**override_parameters_kwargs)` {#Categorical.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. 
+ + - - - #### `tf.contrib.distributions.Categorical.dtype` {#Categorical.dtype} diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.distributions.Chi2.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.distributions.Chi2.md index c0268e6b012..8ed0532a845 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.distributions.Chi2.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.distributions.Chi2.md @@ -109,6 +109,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.Chi2.copy(**override_parameters_kwargs)` {#Chi2.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.Chi2.df` {#Chi2.df} diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.distributions.Uniform.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.distributions.Uniform.md index a294d0b9c4f..0b4357976a6 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.distributions.Uniform.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.distributions.Uniform.md @@ -129,6 +129,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.Uniform.copy(**override_parameters_kwargs)` {#Uniform.copy} + +Creates a deep copy of the distribution. 
+ +Note: the copy distribution may continue to depend on the original +initialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` initialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.Uniform.dtype` {#Uniform.dtype} diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.distributions.WishartCholesky.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.distributions.WishartCholesky.md index 8aa83efb7b1..142c2b2c70c 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.distributions.WishartCholesky.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.distributions.WishartCholesky.md @@ -159,6 +159,29 @@ cdf(x) := P[X <= x] Boolean indicating if `Tensor` input/outputs are Cholesky factorized. +- - - + +#### `tf.contrib.distributions.WishartCholesky.copy(**override_parameters_kwargs)` {#WishartCholesky.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +initialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` initialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`.
+ + - - - #### `tf.contrib.distributions.WishartCholesky.df` {#WishartCholesky.df} diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.distributions.bijector.Bijector.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.distributions.bijector.Bijector.md index d994a57f457..b1f349e7592 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.distributions.bijector.Bijector.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.distributions.bijector.Bijector.md @@ -1,8 +1,10 @@ -Interface for transforming a `Distribution` via `TransformedDistribution`. +Interface for transforming a `Distribution` sample. -A `Bijector` implements a bijective, differentiable function by transforming -an input `Tensor`. The output `Tensor` shape is constrained by the input -`sample`, `batch`, and `event` shape. A `Bijector` is characterized by three +A `Bijector` implements a +[diffeomorphism](https://en.wikipedia.org/wiki/Diffeomorphism), i.e., a +bijective, differentiable function. A `Bijector` is used by +`TransformedDistribution` but can be generally used for transforming a +`Distribution` generated `Tensor`. A `Bijector` is characterized by three operations: 1. Forward Evaluation @@ -143,7 +145,8 @@ Tips for implementing `_inverse` and `_inverse_log_det_jacobian`: - The inverse `log o det o Jacobian` can be implemented as the negative of the forward `log o det o Jacobian`. This is useful if the `inverse` is implemented as a cache or the inverse Jacobian is computationally more - expensive. The following demonstrates the suggested implementation. + expensive (e.g., `CholeskyOuterProduct` `Bijector`). The following + demonstrates the suggested implementation. 
```python def _inverse_and_log_det_jacobian(self, y): diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.layers.optimize_loss.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.layers.optimize_loss.md index dbd0d465729..fc460e7cacc 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.layers.optimize_loss.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.layers.optimize_loss.md @@ -42,7 +42,11 @@ Various ways of passing optimizers, include: * `gradient_multipliers`: dict of variables or variable names to floats. If present, gradients for specified variables will be multiplied by given constant. -* `clip_gradients`: float or `None`, clips gradients by this value. +* `clip_gradients`: float, callable or `None`. If a float is provided, a global + clipping is applied to prevent the norm of the gradient from exceeding this + value. Alternatively, a callable can be provided, e.g., adaptive_clipping. + This callable takes a `list` of `(gradients, variables)` `tuple`s and + returns the same thing with the gradients modified. * `learning_rate_decay_fn`: function, takes `learning_rate` and `global_step` `Tensor`s, returns `Tensor`. Can be used to implement any learning rate decay @@ -73,6 +77,7 @@ Various ways of passing optimizers, include: * `global_step` is an invalid type or shape. * `learning_rate` is an invalid type or value. * `optimizer` is wrong type. + * `clip_gradients` is not float or callable. * `learning_rate` and `learning_rate_decay_fn` are supplied, but no `global_step` is available.
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.metrics.streaming_sparse_precision_at_k.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.metrics.streaming_sparse_precision_at_k.md index c2c025724dd..bb10bc85947 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.metrics.streaming_sparse_precision_at_k.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.metrics.streaming_sparse_precision_at_k.md @@ -1,75 +1,72 @@ -### `tf.contrib.metrics.streaming_sparse_precision_at_k(*args, **kwargs)` {#streaming_sparse_precision_at_k} +### `tf.contrib.metrics.streaming_sparse_precision_at_k(predictions, labels, k, class_id=None, weights=None, metrics_collections=None, updates_collections=None, name=None)` {#streaming_sparse_precision_at_k} -Computes precision@k of the predictions with respect to sparse labels. (deprecated arguments) +Computes precision@k of the predictions with respect to sparse labels. -SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-10-19. -Instructions for updating: -`ignore_mask` is being deprecated. Instead use `weights` with values 0.0 and 1.0 to mask values. For example, `weights=tf.logical_not(mask)`. +If `class_id` is specified, we calculate precision by considering only the + entries in the batch for which `class_id` is in the top-k highest + `predictions`, and computing the fraction of them for which `class_id` is + indeed a correct label. +If `class_id` is not specified, we'll calculate precision as how often on + average a class among the top-k classes with the highest predicted values + of a batch entry is correct and can be found in the label for that entry. - If `class_id` is specified, we calculate precision by considering only the - entries in the batch for which `class_id` is in the top-k highest - `predictions`, and computing the fraction of them for which `class_id` is - indeed a correct label. 
- If `class_id` is not specified, we'll calculate precision as how often on - average a class among the top-k classes with the highest predicted values - of a batch entry is correct and can be found in the label for that entry. +`streaming_sparse_precision_at_k` creates two local variables, +`true_positive_at_` and `false_positive_at_`, that are used to compute +the precision@k frequency. This frequency is ultimately returned as +`precision_at_`: an idempotent operation that simply divides +`true_positive_at_` by total (`true_positive_at_` + +`false_positive_at_`). - `streaming_sparse_precision_at_k` creates two local variables, - `true_positive_at_` and `false_positive_at_`, that are used to compute - the precision@k frequency. This frequency is ultimately returned as - `precision_at_`: an idempotent operation that simply divides - `true_positive_at_` by total (`true_positive_at_` + - `false_positive_at_`). +For estimation of the metric over a stream of data, the function creates an +`update_op` operation that updates these variables and returns the +`precision_at_`. Internally, a `top_k` operation computes a `Tensor` +indicating the top `k` `predictions`. Set operations applied to `top_k` and +`labels` calculate the true positives and false positives weighted by +`weights`. Then `update_op` increments `true_positive_at_` and +`false_positive_at_` using these values. - For estimation of the metric over a stream of data, the function creates an - `update_op` operation that updates these variables and returns the - `precision_at_`. Internally, a `top_k` operation computes a `Tensor` - indicating the top `k` `predictions`. Set operations applied to `top_k` and - `labels` calculate the true positives and false positives weighted by - `weights`. Then `update_op` increments `true_positive_at_` and - `false_positive_at_` using these values. +If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. - If `weights` is `None`, weights default to 1. 
Use weights of 0 to mask values. - Alternatively, if `ignore_mask` is not `None`, then mask values where - `ignore_mask` is `True`. +##### Args: - Args: - predictions: Float `Tensor` with shape [D1, ... DN, num_classes] where - N >= 1. Commonly, N=1 and predictions has shape [batch size, num_classes]. - The final dimension contains the logit values for each class. [D1, ... DN] - must match `labels`. - labels: `int64` `Tensor` or `SparseTensor` with shape - [D1, ... DN, num_labels], where N >= 1 and num_labels is the number of - target classes for the associated prediction. Commonly, N=1 and `labels` - has shape [batch_size, num_labels]. [D1, ... DN] must match - `predictions`. Values should be in range [0, num_classes), where - num_classes is the last dimension of `predictions`. Values outside this - range are ignored. - k: Integer, k for @k metric. - class_id: Integer class ID for which we want binary metrics. This should be - in range [0, num_classes], where num_classes is the last dimension of - `predictions`. If `class_id` is outside this range, the method returns - NAN. - ignore_mask: An optional, `bool` `Tensor` whose shape is broadcastable to - the the first [D1, ... DN] dimensions of `predictions` and `labels`. - weights: An optional `Tensor` whose shape is broadcastable to the the first - [D1, ... DN] dimensions of `predictions` and `labels`. - metrics_collections: An optional list of collections that values should - be added to. - updates_collections: An optional list of collections that updates should - be added to. - name: Name of new update operation, and namespace for other dependent ops. - Returns: - precision: Scalar `float64` `Tensor` with the value of `true_positives` - divided by the sum of `true_positives` and `false_positives`. - update_op: `Operation` that increments `true_positives` and - `false_positives` variables appropriately, and whose value matches - `precision`. +* `predictions`: Float `Tensor` with shape [D1, ... 
DN, num_classes] where + N >= 1. Commonly, N=1 and predictions has shape [batch size, num_classes]. + The final dimension contains the logit values for each class. [D1, ... DN] + must match `labels`. +* `labels`: `int64` `Tensor` or `SparseTensor` with shape + [D1, ... DN, num_labels], where N >= 1 and num_labels is the number of + target classes for the associated prediction. Commonly, N=1 and `labels` + has shape [batch_size, num_labels]. [D1, ... DN] must match + `predictions`. Values should be in range [0, num_classes), where + num_classes is the last dimension of `predictions`. Values outside this + range are ignored. +* `k`: Integer, k for @k metric. +* `class_id`: Integer class ID for which we want binary metrics. This should be + in range [0, num_classes], where num_classes is the last dimension of + `predictions`. If `class_id` is outside this range, the method returns + NAN. +* `weights`: An optional `Tensor` whose shape is broadcastable to the the first + [D1, ... DN] dimensions of `predictions` and `labels`. +* `metrics_collections`: An optional list of collections that values should + be added to. +* `updates_collections`: An optional list of collections that updates should + be added to. +* `name`: Name of new update operation, and namespace for other dependent ops. - Raises: - ValueError: If `ignore_mask` is not `None` and its shape doesn't match - `predictions`, or if `weights` is not `None` and its shape doesn't match - `predictions`, or if either `metrics_collections` or `updates_collections` - are not a list or tuple. +##### Returns: + + +* `precision`: Scalar `float64` `Tensor` with the value of `true_positives` + divided by the sum of `true_positives` and `false_positives`. +* `update_op`: `Operation` that increments `true_positives` and + `false_positives` variables appropriately, and whose value matches + `precision`. 
+ +##### Raises: + + +* `ValueError`: If `weights` is not `None` and its shape doesn't match + `predictions`, or if either `metrics_collections` or `updates_collections` + are not a list or tuple. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.image.per_image_whitening.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.image.per_image_whitening.md index 13797eeab84..dfad97e766e 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.image.per_image_whitening.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.image.per_image_whitening.md @@ -1,30 +1,4 @@ ### `tf.image.per_image_whitening(image)` {#per_image_whitening} -Linearly scales `image` to have zero mean and unit norm. - -This op computes `(x - mean) / adjusted_stddev`, where `mean` is the average -of all values in image, and -`adjusted_stddev = max(stddev, 1.0/sqrt(image.NumElements()))`. - -`stddev` is the standard deviation of all values in `image`. It is capped -away from zero to protect against division by 0 when handling uniform images. - -Note that this implementation is limited: - -* It only whitens based on the statistics of an individual image. -* It does not take into account the covariance structure. - -##### Args: -* `image`: 3-D tensor of shape `[height, width, channels]`. - -##### Returns: - - The whitened image with same shape as `image`. - -##### Raises: - - -* `ValueError`: if the shape of 'image' is incompatible with this function. 
- diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.train.natural_exp_decay.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.train.natural_exp_decay.md new file mode 100644 index 00000000000..5fbff8f9d4e --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.train.natural_exp_decay.md @@ -0,0 +1,56 @@ +### `tf.train.natural_exp_decay(learning_rate, global_step, decay_steps, decay_rate, staircase=False, name=None)` {#natural_exp_decay} + +Applies natural exponential decay to the initial learning rate. + +When training a model, it is often recommended to lower the learning rate as +the training progresses. This function applies an exponential decay function +to a provided initial learning rate. It requires an `global_step` value to +compute the decayed learning rate. You can just pass a TensorFlow variable +that you increment at each training step. + +The function returns the decayed learning rate. It is computed as: + +```python +decayed_learning_rate = learning_rate * exp(-decay_rate * global_step) +``` + +Example: decay exponentially with a base of 0.96: + +```python +... +global_step = tf.Variable(0, trainable=False) +learning_rate = 0.1 +k = 0.5 +learning_rate = tf.train.exponential_time_decay(learning_rate, global_step, k) + +# Passing global_step to minimize() will increment it at each step. +learning_step = ( + tf.train.GradientDescentOptimizer(learning_rate) + .minimize(...my loss..., global_step=global_step) +) +``` + +##### Args: + + +* `learning_rate`: A scalar `float32` or `float64` `Tensor` or a + Python number. The initial learning rate. +* `global_step`: A Python number. + Global step to use for the decay computation. Must not be negative. +* `decay_steps`: How often to apply decay. +* `decay_rate`: A Python number. The decay rate. +* `staircase`: Whether to apply decay in a discrete staircase, as opposed to + continuous, fashion. +* `name`: String. 
Optional name of the operation. Defaults to + 'ExponentialTimeDecay'. + +##### Returns: + + A scalar `Tensor` of the same type as `learning_rate`. The decayed + learning rate. + +##### Raises: + + +* `ValueError`: if `global_step` is not supplied. + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.BetaWithSoftplusAB.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.BetaWithSoftplusAB.md index 50ce4a3e6ee..a23bf3b5c53 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.BetaWithSoftplusAB.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.BetaWithSoftplusAB.md @@ -94,6 +94,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.BetaWithSoftplusAB.copy(**override_parameters_kwargs)` {#BetaWithSoftplusAB.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +initialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` initialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. 
+ + - - - #### `tf.contrib.distributions.BetaWithSoftplusAB.dtype` {#BetaWithSoftplusAB.dtype} diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.Binomial.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.Binomial.md index 36989a55033..19e3a20bc8f 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.Binomial.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.Binomial.md @@ -159,6 +159,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.Binomial.copy(**override_parameters_kwargs)` {#Binomial.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +initialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` initialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.Binomial.dtype` {#Binomial.dtype} diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.DirichletMultinomial.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.DirichletMultinomial.md index 6dcf35cd20b..76b7093595b 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.DirichletMultinomial.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.DirichletMultinomial.md @@ -186,6 +186,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. 
+- - - + +#### `tf.contrib.distributions.DirichletMultinomial.copy(**override_parameters_kwargs)` {#DirichletMultinomial.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.DirichletMultinomial.dtype` {#DirichletMultinomial.dtype} diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.Exponential.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.Exponential.md index c1c2fde90d8..fad44a07215 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.Exponential.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.Exponential.md @@ -109,6 +109,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.Exponential.copy(**override_parameters_kwargs)` {#Exponential.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. 
+ + - - - #### `tf.contrib.distributions.Exponential.dtype` {#Exponential.dtype} diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.Gamma.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.Gamma.md index 82f66d080e8..d990fcff3b2 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.Gamma.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.Gamma.md @@ -136,6 +136,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.Gamma.copy(**override_parameters_kwargs)` {#Gamma.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.Gamma.dtype` {#Gamma.dtype} diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.GammaWithSoftplusAlphaBeta.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.GammaWithSoftplusAlphaBeta.md index 5c9ca305fb1..dfe8d1fb547 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.GammaWithSoftplusAlphaBeta.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.GammaWithSoftplusAlphaBeta.md @@ -87,6 +87,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. 
+- - - + +#### `tf.contrib.distributions.GammaWithSoftplusAlphaBeta.copy(**override_parameters_kwargs)` {#GammaWithSoftplusAlphaBeta.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.GammaWithSoftplusAlphaBeta.dtype` {#GammaWithSoftplusAlphaBeta.dtype} diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.InverseGamma.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.InverseGamma.md index 077e2b5e2bc..01e3c77478e 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.InverseGamma.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.InverseGamma.md @@ -132,6 +132,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.InverseGamma.copy(**override_parameters_kwargs)` {#InverseGamma.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. 
+ + - - - #### `tf.contrib.distributions.InverseGamma.dtype` {#InverseGamma.dtype} diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.InverseGammaWithSoftplusAlphaBeta.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.InverseGammaWithSoftplusAlphaBeta.md index 430b0243e79..e960ace66d7 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.InverseGammaWithSoftplusAlphaBeta.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.InverseGammaWithSoftplusAlphaBeta.md @@ -87,6 +87,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.InverseGammaWithSoftplusAlphaBeta.copy(**override_parameters_kwargs)` {#InverseGammaWithSoftplusAlphaBeta.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. 
+ + - - - #### `tf.contrib.distributions.InverseGammaWithSoftplusAlphaBeta.dtype` {#InverseGammaWithSoftplusAlphaBeta.dtype} diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.Multinomial.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.Multinomial.md index 95fce3d5240..811f913be7a 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.Multinomial.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.Multinomial.md @@ -169,6 +169,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.Multinomial.copy(**override_parameters_kwargs)` {#Multinomial.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.Multinomial.dtype` {#Multinomial.dtype} diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.md index c1774a5a63a..9aa1a69a73a 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.md @@ -169,6 +169,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. 
+- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.copy(**override_parameters_kwargs)` {#MultivariateNormalDiagPlusVDVT.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.dtype` {#MultivariateNormalDiagPlusVDVT.dtype} diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.bijector.ScaleAndShift.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.bijector.ScaleAndShift.md index d8cd7de27c6..4c65892d755 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.bijector.ScaleAndShift.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.bijector.ScaleAndShift.md @@ -51,8 +51,8 @@ Instantiates the `Exp` bijector. * `scale`: `Tensor` used to scale input, i.e., `Y = g(X) = scale * X + shift`. * `event_ndims`: Scalar `int32` `Tensor` indicating the number of dimensions associated with a particular draw from the distribution. -* `validate_args`: `Boolean` indicated whether arguments should be checked for - correctness. +* `validate_args`: `Boolean` indicating whether arguments should be checked + for correctness. * `name`: `String` name given to ops managed by this object. 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.distributions.BernoulliWithSigmoidP.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.distributions.BernoulliWithSigmoidP.md index 83dc4f9c7e0..e9a7b10c687 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.distributions.BernoulliWithSigmoidP.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.distributions.BernoulliWithSigmoidP.md @@ -73,6 +73,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.BernoulliWithSigmoidP.copy(**override_parameters_kwargs)` {#BernoulliWithSigmoidP.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.BernoulliWithSigmoidP.dtype` {#BernoulliWithSigmoidP.dtype} diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.distributions.bijector.Invert.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.distributions.bijector.Invert.md index 80ba0266a88..41ced3f4755 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.distributions.bijector.Invert.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.distributions.bijector.Invert.md @@ -29,8 +29,8 @@ return -self.inverse_log_det_jacobian(y, **condition_kwargs) * `bijector`: Bijector instance. -* `validate_args`: `Boolean` indicated whether arguments should be checked for - correctness. 
+* `validate_args`: `Boolean` indicating whether arguments should be checked + for correctness. * `name`: `String`, name given to ops managed by this object. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.metrics.streaming_recall_at_k.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.metrics.streaming_recall_at_k.md index 24e2d3d8b5a..9ae2059a5dc 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.metrics.streaming_recall_at_k.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.metrics.streaming_recall_at_k.md @@ -1,15 +1,11 @@ ### `tf.contrib.metrics.streaming_recall_at_k(*args, **kwargs)` {#streaming_recall_at_k} -Computes the recall@k of the predictions with respect to dense labels. (deprecated arguments) (deprecated) +Computes the recall@k of the predictions with respect to dense labels. (deprecated) THIS FUNCTION IS DEPRECATED. It will be removed after 2016-11-08. Instructions for updating: Please use `streaming_sparse_recall_at_k`, and reshape labels from [batch_size] to [batch_size, 1]. -SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-10-19. -Instructions for updating: -`ignore_mask` is being deprecated. Instead use `weights` with values 0.0 and 1.0 to mask values. For example, `weights=tf.logical_not(mask)`. - The `streaming_recall_at_k` function creates two local variables, `total` and `count`, that are used to compute the recall@k frequency. This frequency is ultimately returned as `recall_at_`: an idempotent operation that simply @@ -24,15 +20,12 @@ Instructions for updating: increments `count` with the reduced sum of `weights`. If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. - Alternatively, if `ignore_mask` is not `None`, then mask values where - `ignore_mask` is `True`. 
Args: predictions: A floating point tensor of dimension [batch_size, num_classes] labels: A tensor of dimension [batch_size] whose type is in `int32`, `int64`. k: The number of top elements to look at for computing recall. - ignore_mask: An optional, `bool` `Tensor` whose shape matches `predictions`. weights: An optional `Tensor` whose shape is broadcastable to `predictions`. metrics_collections: An optional list of collections that `recall_at_k` should be added to. @@ -48,8 +41,7 @@ Instructions for updating: Raises: ValueError: If `predictions` and `labels` have mismatched shapes, or if - `ignore_mask` is not `None` and its shape doesn't match `predictions`, or - if `weights` is not `None` and its shape doesn't match `predictions`, or - if either `metrics_collections` or `updates_collections` are not a list or + `weights` is not `None` and its shape doesn't match `predictions`, or if + either `metrics_collections` or `updates_collections` are not a list or tuple. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.metrics.streaming_sparse_precision_at_top_k.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.metrics.streaming_sparse_precision_at_top_k.md index 53f6e786b23..d9d3f8ecec4 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.metrics.streaming_sparse_precision_at_top_k.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.metrics.streaming_sparse_precision_at_top_k.md @@ -1,73 +1,70 @@ -### `tf.contrib.metrics.streaming_sparse_precision_at_top_k(*args, **kwargs)` {#streaming_sparse_precision_at_top_k} +### `tf.contrib.metrics.streaming_sparse_precision_at_top_k(top_k_predictions, labels, class_id=None, weights=None, metrics_collections=None, updates_collections=None, name=None)` {#streaming_sparse_precision_at_top_k} -Computes precision@k of top-k predictions with respect to sparse labels. 
(deprecated arguments) +Computes precision@k of top-k predictions with respect to sparse labels. -SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-10-19. -Instructions for updating: -`ignore_mask` is being deprecated. Instead use `weights` with values 0.0 and 1.0 to mask values. For example, `weights=tf.logical_not(mask)`. +If `class_id` is specified, we calculate precision by considering only the + entries in the batch for which `class_id` is in the top-k highest + `predictions`, and computing the fraction of them for which `class_id` is + indeed a correct label. +If `class_id` is not specified, we'll calculate precision as how often on + average a class among the top-k classes with the highest predicted values + of a batch entry is correct and can be found in the label for that entry. - If `class_id` is specified, we calculate precision by considering only the - entries in the batch for which `class_id` is in the top-k highest - `predictions`, and computing the fraction of them for which `class_id` is - indeed a correct label. - If `class_id` is not specified, we'll calculate precision as how often on - average a class among the top-k classes with the highest predicted values - of a batch entry is correct and can be found in the label for that entry. +`streaming_sparse_precision_at_top_k` creates two local variables, +`true_positive_at_k` and `false_positive_at_k`, that are used to compute +the precision@k frequency. This frequency is ultimately returned as +`precision_at_k`: an idempotent operation that simply divides +`true_positive_at_k` by total (`true_positive_at_k` + `false_positive_at_k`). - `streaming_sparse_precision_at_top_k` creates two local variables, - `true_positive_at_k` and `false_positive_at_k`, that are used to compute - the precision@k frequency. This frequency is ultimately returned as - `precision_at_k`: an idempotent operation that simply divides - `true_positive_at_k` by total (`true_positive_at_k` + `false_positive_at_k`). 
+For estimation of the metric over a stream of data, the function creates an +`update_op` operation that updates these variables and returns the +`precision_at_k`. Internally, set operations applied to `top_k_predictions` +and `labels` calculate the true positives and false positives weighted by +`weights`. Then `update_op` increments `true_positive_at_k` and +`false_positive_at_k` using these values. - For estimation of the metric over a stream of data, the function creates an - `update_op` operation that updates these variables and returns the - `precision_at_k`. Internally, set operations applied to `top_k_predictions` - and `labels` calculate the true positives and false positives weighted by - `weights`. Then `update_op` increments `true_positive_at_k` and - `false_positive_at_k` using these values. +If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. - If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. - Alternatively, if `ignore_mask` is not `None`, then mask values where - `ignore_mask` is `True`. +##### Args: - Args: - top_k_predictions: Integer `Tensor` with shape [D1, ... DN, k] where - N >= 1. Commonly, N=1 and top_k_predictions has shape [batch size, k]. - The final dimension contains the indices of top-k labels. [D1, ... DN] - must match `labels`. - labels: `int64` `Tensor` or `SparseTensor` with shape - [D1, ... DN, num_labels], where N >= 1 and num_labels is the number of - target classes for the associated prediction. Commonly, N=1 and `labels` - has shape [batch_size, num_labels]. [D1, ... DN] must match - `top_k_predictions`. Values should be in range [0, num_classes), where - num_classes is the last dimension of `predictions`. Values outside this - range are ignored. - class_id: Integer class ID for which we want binary metrics. This should be - in range [0, num_classes), where num_classes is the last dimension of - `predictions`. 
If `class_id` is outside this range, the method returns - NAN. - ignore_mask: An optional, `bool` `Tensor` whose shape is broadcastable to - the the first [D1, ... DN] dimensions of `predictions` and `labels`. - weights: An optional `Tensor` whose shape is broadcastable to the the first - [D1, ... DN] dimensions of `predictions` and `labels`. - metrics_collections: An optional list of collections that values should - be added to. - updates_collections: An optional list of collections that updates should - be added to. - name: Name of new update operation, and namespace for other dependent ops. - Returns: - precision: Scalar `float64` `Tensor` with the value of `true_positives` - divided by the sum of `true_positives` and `false_positives`. - update_op: `Operation` that increments `true_positives` and - `false_positives` variables appropriately, and whose value matches - `precision`. +* `top_k_predictions`: Integer `Tensor` with shape [D1, ... DN, k] where + N >= 1. Commonly, N=1 and top_k_predictions has shape [batch size, k]. + The final dimension contains the indices of top-k labels. [D1, ... DN] + must match `labels`. +* `labels`: `int64` `Tensor` or `SparseTensor` with shape + [D1, ... DN, num_labels], where N >= 1 and num_labels is the number of + target classes for the associated prediction. Commonly, N=1 and `labels` + has shape [batch_size, num_labels]. [D1, ... DN] must match + `top_k_predictions`. Values should be in range [0, num_classes), where + num_classes is the last dimension of `predictions`. Values outside this + range are ignored. +* `class_id`: Integer class ID for which we want binary metrics. This should be + in range [0, num_classes), where num_classes is the last dimension of + `predictions`. If `class_id` is outside this range, the method returns + NAN. +* `weights`: An optional `Tensor` whose shape is broadcastable to the first + [D1, ... DN] dimensions of `predictions` and `labels`. 
+* `metrics_collections`: An optional list of collections that values should + be added to. +* `updates_collections`: An optional list of collections that updates should + be added to. +* `name`: Name of new update operation, and namespace for other dependent ops. - Raises: - ValueError: If `ignore_mask` is not `None` and its shape doesn't match - `predictions`, or if `weights` is not `None` and its shape doesn't match - `predictions`, or if either `metrics_collections` or `updates_collections` - are not a list or tuple. - ValueError: If `top_k_predictions` has rank < 2. +##### Returns: + + +* `precision`: Scalar `float64` `Tensor` with the value of `true_positives` + divided by the sum of `true_positives` and `false_positives`. +* `update_op`: `Operation` that increments `true_positives` and + `false_positives` variables appropriately, and whose value matches + `precision`. + +##### Raises: + + +* `ValueError`: If `weights` is not `None` and its shape doesn't match + `predictions`, or if either `metrics_collections` or `updates_collections` + are not a list or tuple. +* `ValueError`: If `top_k_predictions` has rank < 2. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.distributions.beta_bb.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.distributions.beta_bb.md index 8e16c312a83..d7fe415774c 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.distributions.beta_bb.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.distributions.beta_bb.md @@ -94,6 +94,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.beta_bb.copy(**override_parameters_kwargs)` {#beta_bb.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. 
+ +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.beta_bb.dtype` {#beta_bb.dtype} diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.distributions.bijector.Chain.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.distributions.bijector.Chain.md index f56cef2cb60..a129c3edef5 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.distributions.bijector.Chain.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.distributions.bijector.Chain.md @@ -40,8 +40,8 @@ Instantiates `Chain` bijector. * `bijectors`: Python list of bijector instances. An empty list makes this bijector equivalent to the `Identity` bijector. -* `validate_args`: `Boolean` indicated whether arguments should be checked for - correctness. +* `validate_args`: `Boolean` indicating whether arguments should be checked + for correctness. * `name`: `String`, name given to ops managed by this object. Default: E.g., `Chain([Exp(), Softplus()]).name == "chain_of_exp_of_softplus"`. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.distributions.bijector.Exp.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.distributions.bijector.Exp.md index 2a50fd0cfea..84eb7e41277 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.distributions.bijector.Exp.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.distributions.bijector.Exp.md @@ -27,8 +27,8 @@ Instantiates the `Exp` bijector. 
* `event_ndims`: Scalar `int32` `Tensor` indicating the number of dimensions associated with a particular draw from the distribution. -* `validate_args`: `Boolean` indicated whether arguments should be checked for - correctness. +* `validate_args`: `Boolean` indicating whether arguments should be checked + for correctness. * `name`: `String` name given to ops managed by this object. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.metrics.streaming_percentage_less.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.metrics.streaming_percentage_less.md index ccf6097f59e..c8c5c757076 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.metrics.streaming_percentage_less.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.metrics.streaming_percentage_less.md @@ -1,45 +1,43 @@ -### `tf.contrib.metrics.streaming_percentage_less(*args, **kwargs)` {#streaming_percentage_less} +### `tf.contrib.metrics.streaming_percentage_less(values, threshold, weights=None, metrics_collections=None, updates_collections=None, name=None)` {#streaming_percentage_less} -Computes the percentage of values less than the given threshold. (deprecated arguments) +Computes the percentage of values less than the given threshold. -SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-10-19. -Instructions for updating: -`ignore_mask` is being deprecated. Instead use `weights` with values 0.0 and 1.0 to mask values. For example, `weights=tf.logical_not(mask)`. +The `streaming_percentage_less` function creates two local variables, +`total` and `count` that are used to compute the percentage of `values` that +fall below `threshold`. This rate is weighted by `weights`, and it is +ultimately returned as `percentage` which is an idempotent operation that +simply divides `total` by `count`. 
- The `streaming_percentage_less` function creates two local variables, - `total` and `count` that are used to compute the percentage of `values` that - fall below `threshold`. This rate is weighted by `weights`, and it is - ultimately returned as `percentage` which is an idempotent operation that - simply divides `total` by `count`. +For estimation of the metric over a stream of data, the function creates an +`update_op` operation that updates these variables and returns the +`percentage`. - For estimation of the metric over a stream of data, the function creates an - `update_op` operation that updates these variables and returns the - `percentage`. +If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. - If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. - Alternatively, if `ignore_mask` is not `None`, then mask values where - `ignore_mask` is `True`. +##### Args: - Args: - values: A numeric `Tensor` of arbitrary size. - threshold: A scalar threshold. - ignore_mask: An optional, `bool` `Tensor` whose shape matches `values`. - weights: An optional `Tensor` whose shape is broadcastable to `values`. - metrics_collections: An optional list of collections that the metric - value variable should be added to. - updates_collections: An optional list of collections that the metric update - ops should be added to. - name: An optional variable_scope name. - Returns: - percentage: A tensor representing the current mean, the value of `total` - divided by `count`. - update_op: An operation that increments the `total` and `count` variables - appropriately. +* `values`: A numeric `Tensor` of arbitrary size. +* `threshold`: A scalar threshold. +* `weights`: An optional `Tensor` whose shape is broadcastable to `values`. +* `metrics_collections`: An optional list of collections that the metric + value variable should be added to. 
+* `updates_collections`: An optional list of collections that the metric update + ops should be added to. +* `name`: An optional variable_scope name. - Raises: - ValueError: If `ignore_mask` is not `None` and its shape doesn't match - `values`, or if `weights` is not `None` and its shape doesn't match - `values`, or if either `metrics_collections` or `updates_collections` are - not a list or tuple. +##### Returns: + + +* `percentage`: A tensor representing the current mean, the value of `total` + divided by `count`. +* `update_op`: An operation that increments the `total` and `count` variables + appropriately. + +##### Raises: + + +* `ValueError`: If `weights` is not `None` and its shape doesn't match `values`, + or if either `metrics_collections` or `updates_collections` are not a list + or tuple. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.train.piecewise_constant.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.train.piecewise_constant.md new file mode 100644 index 00000000000..b41f38eb494 --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.train.piecewise_constant.md @@ -0,0 +1,41 @@ +### `tf.train.piecewise_constant(x, boundaries, values, name=None)` {#piecewise_constant} + +Piecewise constant from boundaries and interval values. + +Example: use a learning rate that's 1.0 for the first 100000 steps, 0.5 + for steps 100001 to 110000, and 0.1 for any additional steps. + +```python +global_step = tf.Variable(0, trainable=False) +boundaries = [100000, 110000] +values = [1.0, 0.5, 0.1] +learning_rate = tf.train.piecewise_constant(global_step, boundaries, values) + +# Later, whenever we perform an optimization step, we increment global_step. +``` + +##### Args: + + +* `x`: A 0-D scalar `Tensor`. Must be one of the following types: `float32`, + `float64`, `uint8`, `int8`, `int16`, `int32`, `int64`. 
+* `boundaries`: A list of `Tensor`s or `int`s or `float`s with strictly + increasing entries, and with all elements having the same type as `x`. +* `values`: A list of `Tensor`s or `float`s or `int`s that specifies the values + for the intervals defined by `boundaries`. It should have one more element + than `boundaries`, and all elements should have the same type. +* `name`: A string. Optional name of the operation. Defaults to + 'PiecewiseConstant'. + +##### Returns: + + A 0-D Tensor. Its value is `values[0]` when `x <= boundaries[0]`, + `values[1]` when `x > boundaries[0]` and `x <= boundaries[1]`, ..., + and values[-1] when `x > boundaries[-1]`. + +##### Raises: + + +* `ValueError`: if types of `x` and `boundaries` do not match, or types of all + `values` do not match. + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.train.polynomial_decay.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.train.polynomial_decay.md new file mode 100644 index 00000000000..64a365fb08a --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.train.polynomial_decay.md @@ -0,0 +1,78 @@ +### `tf.train.polynomial_decay(learning_rate, global_step, decay_steps, end_learning_rate=0.0001, power=1.0, cycle=False, name=None)` {#polynomial_decay} + +Applies a polynomial decay to the learning rate. + +It is commonly observed that a monotonically decreasing learning rate, whose +degree of change is carefully chosen, results in a better performing model. +This function applies a polynomial decay function to a provided initial +`learning_rate` to reach an `end_learning_rate` in the given `decay_steps`. + +It requires a `global_step` value to compute the decayed learning rate. You +can just pass a TensorFlow variable that you increment at each training step. + +The function returns the decayed learning rate. 
It is computed as: + +```python +global_step = min(global_step, decay_steps) +decayed_learning_rate = (learning_rate - end_learning_rate) * + (1 - global_step / decay_steps) ^ (power) + + end_learning_rate + +``` + +If `cycle` is True then a multiple of `decay_steps` is used, the first one +that is bigger than `global_steps`. + +```python +decay_steps = decay_steps * ceil(global_step / decay_steps) +decayed_learning_rate = (learning_rate - end_learning_rate) * + (1 - global_step / decay_steps) ^ (power) + + end_learning_rate + +``` + +Example: decay from 0.1 to 0.01 in 10000 steps using sqrt (i.e. power=0.5): + +```python +... +global_step = tf.Variable(0, trainable=False) +starter_learning_rate = 0.1 +end_learning_rate = 0.01 +decay_steps = 10000 +learning_rate = tf.train.polynomial_decay(starter_learning_rate, global_step, + decay_steps, end_learning_rate, + power=0.5) +# Passing global_step to minimize() will increment it at each step. +learning_step = ( + tf.train.GradientDescentOptimizer(learning_rate) + .minimize(...my loss..., global_step=global_step) +) +``` + +##### Args: + + +* `learning_rate`: A scalar `float32` or `float64` `Tensor` or a + Python number. The initial learning rate. +* `global_step`: A scalar `int32` or `int64` `Tensor` or a Python number. + Global step to use for the decay computation. Must not be negative. +* `decay_steps`: A scalar `int32` or `int64` `Tensor` or a Python number. + Must be positive. See the decay computation above. +* `end_learning_rate`: A scalar `float32` or `float64` `Tensor` or a + Python number. The minimal end learning rate. +* `power`: A scalar `float32` or `float64` `Tensor` or a + Python number. The power of the polynomial. Defaults to sqrt, i.e. 0.5. +* `cycle`: A boolean, whether or not it should cycle beyond decay_steps. +* `name`: String. Optional name of the operation. Defaults to + 'PolynomialDecay'. + +##### Returns: + + A scalar `Tensor` of the same type as `learning_rate`. The decayed + learning rate. 
+ +##### Raises: + + +* `ValueError`: if `global_step` is not supplied. + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.distributions.Beta.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.distributions.Beta.md index cd70e98acfa..3a3a481a806 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.distributions.Beta.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.distributions.Beta.md @@ -183,6 +183,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.Beta.copy(**override_parameters_kwargs)` {#Beta.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.Beta.dtype` {#Beta.dtype} diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.distributions.Laplace.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.distributions.Laplace.md index ea5c3375029..2adbad22a3f 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.distributions.Laplace.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.distributions.Laplace.md @@ -106,6 +106,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.Laplace.copy(**override_parameters_kwargs)` {#Laplace.copy} + +Creates a deep copy of the distribution. 
+ +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.Laplace.dtype` {#Laplace.dtype} diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.distributions.LaplaceWithSoftplusScale.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.distributions.LaplaceWithSoftplusScale.md index 312dc02f8ca..6b4f3449841 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.distributions.LaplaceWithSoftplusScale.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.distributions.LaplaceWithSoftplusScale.md @@ -73,6 +73,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.LaplaceWithSoftplusScale.copy(**override_parameters_kwargs)` {#LaplaceWithSoftplusScale.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. 
+ + - - - #### `tf.contrib.distributions.LaplaceWithSoftplusScale.dtype` {#LaplaceWithSoftplusScale.dtype} diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.distributions.StudentTWithAbsDfSoftplusSigma.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.distributions.StudentTWithAbsDfSoftplusSigma.md index 5cd5b51c303..6e1d00686dd 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.distributions.StudentTWithAbsDfSoftplusSigma.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.distributions.StudentTWithAbsDfSoftplusSigma.md @@ -73,6 +73,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.StudentTWithAbsDfSoftplusSigma.copy(**override_parameters_kwargs)` {#StudentTWithAbsDfSoftplusSigma.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. 
+ + - - - #### `tf.contrib.distributions.StudentTWithAbsDfSoftplusSigma.df` {#StudentTWithAbsDfSoftplusSigma.df} diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.metrics.streaming_mean_iou.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.metrics.streaming_mean_iou.md index 45eaf48ba4e..bb5e60c2a8a 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.metrics.streaming_mean_iou.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.metrics.streaming_mean_iou.md @@ -1,51 +1,51 @@ -### `tf.contrib.metrics.streaming_mean_iou(*args, **kwargs)` {#streaming_mean_iou} +### `tf.contrib.metrics.streaming_mean_iou(predictions, labels, num_classes, weights=None, metrics_collections=None, updates_collections=None, name=None)` {#streaming_mean_iou} -Calculate per-step mean Intersection-Over-Union (mIOU). (deprecated arguments) +Calculate per-step mean Intersection-Over-Union (mIOU). -SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-10-19. -Instructions for updating: -`ignore_mask` is being deprecated. Instead use `weights` with values 0.0 and 1.0 to mask values. For example, `weights=tf.logical_not(mask)`. +Mean Intersection-Over-Union is a common evaluation metric for +semantic image segmentation, which first computes the IOU for each +semantic class and then computes the average over classes. - Mean Intersection-Over-Union is a common evaluation metric for - semantic image segmentation, which first computes the IOU for each - semantic class and then computes the average over classes. - IOU is defined as follows: - IOU = true_positive / (true_positive + false_positive + false_negative). - The predictions are accumulated in a confusion matrix, weighted by `weights`, - and mIOU is then calculated from it. 
+##### IOU is defined as follows: - For estimation of the metric over a stream of data, the function creates an - `update_op` operation that updates these variables and returns the `mean_iou`. + IOU = true_positive / (true_positive + false_positive + false_negative). +The predictions are accumulated in a confusion matrix, weighted by `weights`, +and mIOU is then calculated from it. - If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. - Alternatively, if `ignore_mask` is not `None`, then mask values where - `ignore_mask` is `True`. +For estimation of the metric over a stream of data, the function creates an +`update_op` operation that updates these variables and returns the `mean_iou`. - Args: - predictions: A tensor of prediction results for semantic labels, whose - shape is [batch size] and type `int32` or `int64`. The tensor will be - flattened, if its rank > 1. - labels: A tensor of ground truth labels with shape [batch size] and of - type `int32` or `int64`. The tensor will be flattened, if its rank > 1. - num_classes: The possible number of labels the prediction task can - have. This value must be provided, since a confusion matrix of - dimension = [num_classes, num_classes] will be allocated. - ignore_mask: An optional, `bool` `Tensor` whose shape matches `predictions`. - weights: An optional `Tensor` whose shape is broadcastable to `predictions`. - metrics_collections: An optional list of collections that `mean_iou` - should be added to. - updates_collections: An optional list of collections `update_op` should be - added to. - name: An optional variable_scope name. +If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. - Returns: - mean_iou: A tensor representing the mean intersection-over-union. - update_op: An operation that increments the confusion matrix. 
+##### Args: - Raises: - ValueError: If `predictions` and `labels` have mismatched shapes, or if - `ignore_mask` is not `None` and its shape doesn't match `predictions`, or - if `weights` is not `None` and its shape doesn't match `predictions`, or - if either `metrics_collections` or `updates_collections` are not a list or - tuple. + +* `predictions`: A tensor of prediction results for semantic labels, whose + shape is [batch size] and type `int32` or `int64`. The tensor will be + flattened, if its rank > 1. +* `labels`: A tensor of ground truth labels with shape [batch size] and of + type `int32` or `int64`. The tensor will be flattened, if its rank > 1. +* `num_classes`: The possible number of labels the prediction task can + have. This value must be provided, since a confusion matrix of + dimension = [num_classes, num_classes] will be allocated. +* `weights`: An optional `Tensor` whose shape is broadcastable to `predictions`. +* `metrics_collections`: An optional list of collections that `mean_iou` + should be added to. +* `updates_collections`: An optional list of collections `update_op` should be + added to. +* `name`: An optional variable_scope name. + +##### Returns: + + +* `mean_iou`: A tensor representing the mean intersection-over-union. +* `update_op`: An operation that increments the confusion matrix. + +##### Raises: + + +* `ValueError`: If `predictions` and `labels` have mismatched shapes, or if + `weights` is not `None` and its shape doesn't match `predictions`, or if + either `metrics_collections` or `updates_collections` are not a list or + tuple. 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.metrics.streaming_recall.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.metrics.streaming_recall.md index e93630f46c1..34e8bd291fd 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.metrics.streaming_recall.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.metrics.streaming_recall.md @@ -1,47 +1,45 @@ -### `tf.contrib.metrics.streaming_recall(*args, **kwargs)` {#streaming_recall} +### `tf.contrib.metrics.streaming_recall(predictions, labels, weights=None, metrics_collections=None, updates_collections=None, name=None)` {#streaming_recall} -Computes the recall of the predictions with respect to the labels. (deprecated arguments) +Computes the recall of the predictions with respect to the labels. -SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-10-19. -Instructions for updating: -`ignore_mask` is being deprecated. Instead use `weights` with values 0.0 and 1.0 to mask values. For example, `weights=tf.logical_not(mask)`. +The `streaming_recall` function creates two local variables, `true_positives` +and `false_negatives`, that are used to compute the recall. This value is +ultimately returned as `recall`, an idempotent operation that simply divides +`true_positives` by the sum of `true_positives` and `false_negatives`. - The `streaming_recall` function creates two local variables, `true_positives` - and `false_negatives`, that are used to compute the recall. This value is - ultimately returned as `recall`, an idempotent operation that simply divides - `true_positives` by the sum of `true_positives` and `false_negatives`. +For estimation of the metric over a stream of data, the function creates an +`update_op` that updates these variables and returns the `recall`. `update_op` +weights each prediction by the corresponding value in `weights`. 
- For estimation of the metric over a stream of data, the function creates an - `update_op` that updates these variables and returns the `recall`. `update_op` - weights each prediction by the corresponding value in `weights`. +If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. - If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. - Alternatively, if `ignore_mask` is not `None`, then mask values where - `ignore_mask` is `True`. +##### Args: - Args: - predictions: The predicted values, a `bool` `Tensor` of arbitrary shape. - labels: The ground truth values, a `bool` `Tensor` whose dimensions must - match `predictions`. - ignore_mask: An optional, `bool` `Tensor` whose shape matches `predictions`. - weights: An optional `Tensor` whose shape is broadcastable to `predictions`. - metrics_collections: An optional list of collections that `recall` should - be added to. - updates_collections: An optional list of collections that `update_op` should - be added to. - name: An optional variable_scope name. - Returns: - recall: Scalar float `Tensor` with the value of `true_positives` divided - by the sum of `true_positives` and `false_negatives`. - update_op: `Operation` that increments `true_positives` and - `false_negatives` variables appropriately and whose value matches - `recall`. +* `predictions`: The predicted values, a `bool` `Tensor` of arbitrary shape. +* `labels`: The ground truth values, a `bool` `Tensor` whose dimensions must + match `predictions`. +* `weights`: An optional `Tensor` whose shape is broadcastable to `predictions`. +* `metrics_collections`: An optional list of collections that `recall` should + be added to. +* `updates_collections`: An optional list of collections that `update_op` should + be added to. +* `name`: An optional variable_scope name. 
- Raises: - ValueError: If `predictions` and `labels` have mismatched shapes, or if - `ignore_mask` is not `None` and its shape doesn't match `predictions`, or - if `weights` is not `None` and its shape doesn't match `predictions`, or - if either `metrics_collections` or `updates_collections` are not a list or - tuple. +##### Returns: + + +* `recall`: Scalar float `Tensor` with the value of `true_positives` divided + by the sum of `true_positives` and `false_negatives`. +* `update_op`: `Operation` that increments `true_positives` and + `false_negatives` variables appropriately and whose value matches + `recall`. + +##### Raises: + + +* `ValueError`: If `predictions` and `labels` have mismatched shapes, or if + `weights` is not `None` and its shape doesn't match `predictions`, or if + either `metrics_collections` or `updates_collections` are not a list or + tuple. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.distributions.ExponentialWithSoftplusLam.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.distributions.ExponentialWithSoftplusLam.md index 6246dafbc56..7b1605162e6 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.distributions.ExponentialWithSoftplusLam.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.distributions.ExponentialWithSoftplusLam.md @@ -87,6 +87,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.ExponentialWithSoftplusLam.copy(**override_parameters_kwargs)` {#ExponentialWithSoftplusLam.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. 
+ +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.ExponentialWithSoftplusLam.dtype` {#ExponentialWithSoftplusLam.dtype} diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.distributions.MultivariateNormalFull.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.distributions.MultivariateNormalFull.md index 04fc0b64b28..47ba0396b4a 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.distributions.MultivariateNormalFull.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.distributions.MultivariateNormalFull.md @@ -134,6 +134,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.MultivariateNormalFull.copy(**override_parameters_kwargs)` {#MultivariateNormalFull.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. 
+ + - - - #### `tf.contrib.distributions.MultivariateNormalFull.dtype` {#MultivariateNormalFull.dtype} diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.distributions.Normal.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.distributions.Normal.md index 9d6ad275ca8..c61b240e020 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.distributions.Normal.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.distributions.Normal.md @@ -137,6 +137,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.Normal.copy(**override_parameters_kwargs)` {#Normal.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.Normal.dtype` {#Normal.dtype} diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.distributions.beta_aa.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.distributions.beta_aa.md index f064fb3f4d0..08032b9ac52 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.distributions.beta_aa.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.distributions.beta_aa.md @@ -94,6 +94,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.beta_aa.copy(**override_parameters_kwargs)` {#beta_aa.copy} + +Creates a deep copy of the distribution. 
+ +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.beta_aa.dtype` {#beta_aa.dtype} diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.distributions.bijector.Inline.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.distributions.bijector.Inline.md index 38143ede1e5..0e590264273 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.distributions.bijector.Inline.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.distributions.bijector.Inline.md @@ -8,7 +8,7 @@ exp = Inline( inverse_fn=tf.log, inverse_log_det_jacobian_fn=( lambda y: -tf.reduce_sum(tf.log(y), reduction_indices=-1)), - name="Exp") + name="exp") ``` The above example is equivalent to the `Bijector` `Exp(event_ndims=1)`. @@ -29,8 +29,8 @@ Creates a `Bijector` from callables. log o det o jacobian of the forward transformation. * `is_constant_jacobian`: `Boolean` indicating that the Jacobian is constant for all input arguments. -* `validate_args`: `Boolean` indicated whether arguments should be checked for - correctness. +* `validate_args`: `Boolean` indicating whether arguments should be checked + for correctness. * `name`: `String`, name given to ops managed by this object. 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.metrics.streaming_precision.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.metrics.streaming_precision.md index 0afe30d1899..61d1cfdcc05 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.metrics.streaming_precision.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.metrics.streaming_precision.md @@ -1,49 +1,47 @@ -### `tf.contrib.metrics.streaming_precision(*args, **kwargs)` {#streaming_precision} +### `tf.contrib.metrics.streaming_precision(predictions, labels, weights=None, metrics_collections=None, updates_collections=None, name=None)` {#streaming_precision} -Computes the precision of the predictions with respect to the labels. (deprecated arguments) +Computes the precision of the predictions with respect to the labels. -SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-10-19. -Instructions for updating: -`ignore_mask` is being deprecated. Instead use `weights` with values 0.0 and 1.0 to mask values. For example, `weights=tf.logical_not(mask)`. +The `streaming_precision` function creates two local variables, +`true_positives` and `false_positives`, that are used to compute the +precision. This value is ultimately returned as `precision`, an idempotent +operation that simply divides `true_positives` by the sum of `true_positives` +and `false_positives`. - The `streaming_precision` function creates two local variables, - `true_positives` and `false_positives`, that are used to compute the - precision. This value is ultimately returned as `precision`, an idempotent - operation that simply divides `true_positives` by the sum of `true_positives` - and `false_positives`. +For estimation of the metric over a stream of data, the function creates an +`update_op` operation that updates these variables and returns the +`precision`. 
`update_op` weights each prediction by the corresponding value in +`weights`. - For estimation of the metric over a stream of data, the function creates an - `update_op` operation that updates these variables and returns the - `precision`. `update_op` weights each prediction by the corresponding value in - `weights`. +If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. - If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. - Alternatively, if `ignore_mask` is not `None`, then mask values where - `ignore_mask` is `True`. +##### Args: - Args: - predictions: The predicted values, a `bool` `Tensor` of arbitrary shape. - labels: The ground truth values, a `bool` `Tensor` whose dimensions must - match `predictions`. - ignore_mask: An optional, `bool` `Tensor` whose shape matches `predictions`. - weights: An optional `Tensor` whose shape is broadcastable to `predictions`. - metrics_collections: An optional list of collections that `precision` should - be added to. - updates_collections: An optional list of collections that `update_op` should - be added to. - name: An optional variable_scope name. - Returns: - precision: Scalar float `Tensor` with the value of `true_positives` - divided by the sum of `true_positives` and `false_positives`. - update_op: `Operation` that increments `true_positives` and - `false_positives` variables appropriately and whose value matches - `precision`. +* `predictions`: The predicted values, a `bool` `Tensor` of arbitrary shape. +* `labels`: The ground truth values, a `bool` `Tensor` whose dimensions must + match `predictions`. +* `weights`: An optional `Tensor` whose shape is broadcastable to `predictions`. +* `metrics_collections`: An optional list of collections that `precision` should + be added to. +* `updates_collections`: An optional list of collections that `update_op` should + be added to. +* `name`: An optional variable_scope name. 
- Raises: - ValueError: If `predictions` and `labels` have mismatched shapes, or if - `ignore_mask` is not `None` and its shape doesn't match `predictions`, or - if `weights` is not `None` and its shape doesn't match `predictions`, or - if either `metrics_collections` or `updates_collections` are not a list or - tuple. +##### Returns: + + +* `precision`: Scalar float `Tensor` with the value of `true_positives` + divided by the sum of `true_positives` and `false_positives`. +* `update_op`: `Operation` that increments `true_positives` and + `false_positives` variables appropriately and whose value matches + `precision`. + +##### Raises: + + +* `ValueError`: If `predictions` and `labels` have mismatched shapes, or if + `weights` is not `None` and its shape doesn't match `predictions`, or if + either `metrics_collections` or `updates_collections` are not a list or + tuple. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.train.inverse_time_decay.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.train.inverse_time_decay.md new file mode 100644 index 00000000000..fe85cb1b128 --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.train.inverse_time_decay.md @@ -0,0 +1,56 @@ +### `tf.train.inverse_time_decay(learning_rate, global_step, decay_steps, decay_rate, staircase=False, name=None)` {#inverse_time_decay} + +Applies inverse time decay to the initial learning rate. + +When training a model, it is often recommended to lower the learning rate as +the training progresses. This function applies an inverse decay function +to a provided initial learning rate. It requires an `global_step` value to +compute the decayed learning rate. You can just pass a TensorFlow variable +that you increment at each training step. + +The function returns the decayed learning rate. 
It is computed as: + +```python +decayed_learning_rate = learning_rate / (1 + decay_rate * t) +``` + +Example: decay 1/t with a rate of 0.5: + +```python +... +global_step = tf.Variable(0, trainable=False) +learning_rate = 0.1 +k = 0.5 +learning_rate = tf.train.inverse_time_decay(learning_rate, global_step, k) + +# Passing global_step to minimize() will increment it at each step. +learning_step = ( + tf.train.GradientDescentOptimizer(learning_rate) + .minimize(...my loss..., global_step=global_step) +) +``` + +##### Args: + + +* `learning_rate`: A scalar `float32` or `float64` `Tensor` or a + Python number. The initial learning rate. +* `global_step`: A Python number. + Global step to use for the decay computation. Must not be negative. +* `decay_steps`: How often to apply decay. +* `decay_rate`: A Python number. The decay rate. +* `staircase`: Whether to apply decay in a discrete staircase, as opposed to + continuous, fashion. +* `name`: String. Optional name of the operation. Defaults to + 'InverseTimeDecay'. + +##### Returns: + + A scalar `Tensor` of the same type as `learning_rate`. The decayed + learning rate. + +##### Raises: + + +* `ValueError`: if `global_step` is not supplied. + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.distributions.Mixture.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.distributions.Mixture.md index 133686cef52..b47fca09fce 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.distributions.Mixture.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.distributions.Mixture.md @@ -134,6 +134,29 @@ cdf(x) := P[X <= x] +- - - + +#### `tf.contrib.distributions.Mixture.copy(**override_parameters_kwargs)` {#Mixture.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. 
+ +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.Mixture.dtype` {#Mixture.dtype} diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.distributions.NormalWithSoftplusSigma.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.distributions.NormalWithSoftplusSigma.md index e6a161f27a5..16e5bb2e9c9 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.distributions.NormalWithSoftplusSigma.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.distributions.NormalWithSoftplusSigma.md @@ -73,6 +73,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.NormalWithSoftplusSigma.copy(**override_parameters_kwargs)` {#NormalWithSoftplusSigma.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. 
+ + - - - #### `tf.contrib.distributions.NormalWithSoftplusSigma.dtype` {#NormalWithSoftplusSigma.dtype} diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.image.per_image_standardization.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.image.per_image_standardization.md new file mode 100644 index 00000000000..8b7b8484432 --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.image.per_image_standardization.md @@ -0,0 +1,25 @@ +### `tf.image.per_image_standardization(image)` {#per_image_standardization} + +Linearly scales `image` to have zero mean and unit norm. + +This op computes `(x - mean) / adjusted_stddev`, where `mean` is the average +of all values in image, and +`adjusted_stddev = max(stddev, 1.0/sqrt(image.NumElements()))`. + +`stddev` is the standard deviation of all values in `image`. It is capped +away from zero to protect against division by 0 when handling uniform images. + +##### Args: + + +* `image`: 3-D tensor of shape `[height, width, channels]`. + +##### Returns: + + The standardized image with same shape as `image`. + +##### Raises: + + +* `ValueError`: if the shape of 'image' is incompatible with this function. + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.train.import_meta_graph.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.train.import_meta_graph.md index 5f53eacdfcf..d0fa7f551eb 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.train.import_meta_graph.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.train.import_meta_graph.md @@ -1,4 +1,4 @@ -### `tf.train.import_meta_graph(meta_graph_or_file, import_scope=None, **kwargs)` {#import_meta_graph} +### `tf.train.import_meta_graph(meta_graph_or_file, clear_devices=False, import_scope=None, **kwargs)` {#import_meta_graph} Recreates a Graph saved in a `MetaGraphDef` proto. 
@@ -55,6 +55,8 @@ device assignments have not changed. * `meta_graph_or_file`: `MetaGraphDef` protocol buffer or filename (including the path) containing a `MetaGraphDef`. +* `clear_devices`: Whether or not to clear the device field for an `Operation` + or `Tensor` during import. * `import_scope`: Optional `string`. Name scope to add. Only used when initializing from protocol buffer. * `**kwargs`: Optional keyed arguments. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.distributions.MultivariateNormalDiagWithSoftplusStDev.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.distributions.MultivariateNormalDiagWithSoftplusStDev.md index 42f96581068..5eae3e7ff5b 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.distributions.MultivariateNormalDiagWithSoftplusStDev.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.distributions.MultivariateNormalDiagWithSoftplusStDev.md @@ -73,6 +73,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagWithSoftplusStDev.copy(**override_parameters_kwargs)` {#MultivariateNormalDiagWithSoftplusStDev.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. 
+ + - - - #### `tf.contrib.distributions.MultivariateNormalDiagWithSoftplusStDev.dtype` {#MultivariateNormalDiagWithSoftplusStDev.dtype} diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.distributions.Poisson.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.distributions.Poisson.md index 2a2cdeb7d7d..9763d6ba473 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.distributions.Poisson.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.distributions.Poisson.md @@ -97,6 +97,29 @@ cdf(x) := P[X <= x] values of type `self.dtype`. +- - - + +#### `tf.contrib.distributions.Poisson.copy(**override_parameters_kwargs)` {#Poisson.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.Poisson.dtype` {#Poisson.dtype} diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.distributions.WishartFull.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.distributions.WishartFull.md index 86f4f32cb4a..9781d8a33b4 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.distributions.WishartFull.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.distributions.WishartFull.md @@ -155,6 +155,29 @@ cdf(x) := P[X <= x] Boolean indicating if `Tensor` input/outputs are Cholesky factorized. 
+- - - + +#### `tf.contrib.distributions.WishartFull.copy(**override_parameters_kwargs)` {#WishartFull.copy} + +Creates a deep copy of the distribution. + +Note: the copy distribution may continue to depend on the original +intialization arguments. + +##### Args: + + +* `**override_parameters_kwargs`: String/value dictionary of initialization + arguments to override with new values. + +##### Returns: + + +* `distribution`: A new instance of `type(self)` intitialized from the union + of self.parameters and override_parameters_kwargs, i.e., + `dict(self.parameters, **override_parameters_kwargs)`. + + - - - #### `tf.contrib.distributions.WishartFull.df` {#WishartFull.df} diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.train.StepCounterHook.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.train.StepCounterHook.md index ac2c09b5bde..10c7d249043 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.train.StepCounterHook.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.train.StepCounterHook.md @@ -1,7 +1,7 @@ Steps per second monitor. - - - -#### `tf.train.StepCounterHook.__init__(every_n_steps=100, output_dir=None, summary_writer=None)` {#StepCounterHook.__init__} +#### `tf.train.StepCounterHook.__init__(every_n_steps=100, every_n_secs=None, output_dir=None, summary_writer=None)` {#StepCounterHook.__init__} diff --git a/tensorflow/g3doc/api_docs/python/image.md b/tensorflow/g3doc/api_docs/python/image.md index a5107d0eb09..08d027688ae 100644 --- a/tensorflow/g3doc/api_docs/python/image.md +++ b/tensorflow/g3doc/api_docs/python/image.md @@ -1210,7 +1210,7 @@ picked in the interval `[lower, upper]`. - - - -### `tf.image.per_image_whitening(image)` {#per_image_whitening} +### `tf.image.per_image_standardization(image)` {#per_image_standardization} Linearly scales `image` to have zero mean and unit norm. 
@@ -1221,11 +1221,6 @@ of all values in image, and `stddev` is the standard deviation of all values in `image`. It is capped away from zero to protect against division by 0 when handling uniform images. -Note that this implementation is limited: - -* It only whitens based on the statistics of an individual image. -* It does not take into account the covariance structure. - ##### Args: @@ -1233,7 +1228,7 @@ Note that this implementation is limited: ##### Returns: - The whitened image with same shape as `image`. + The standardized image with same shape as `image`. ##### Raises: @@ -1417,3 +1412,12 @@ false and no bounding boxes are supplied, an error is raised. Provide as input to `tf.image.draw_bounding_boxes`. + +## Other Functions and Classes +- - - + +### `tf.image.per_image_whitening(image)` {#per_image_whitening} + + + + diff --git a/tensorflow/g3doc/api_docs/python/index.md b/tensorflow/g3doc/api_docs/python/index.md index 3f6130ad449..3d4031699cf 100644 --- a/tensorflow/g3doc/api_docs/python/index.md +++ b/tensorflow/g3doc/api_docs/python/index.md @@ -361,6 +361,7 @@ * [`hsv_to_rgb`](../../api_docs/python/image.md#hsv_to_rgb) * [`non_max_suppression`](../../api_docs/python/image.md#non_max_suppression) * [`pad_to_bounding_box`](../../api_docs/python/image.md#pad_to_bounding_box) + * [`per_image_standardization`](../../api_docs/python/image.md#per_image_standardization) * [`per_image_whitening`](../../api_docs/python/image.md#per_image_whitening) * [`random_brightness`](../../api_docs/python/image.md#random_brightness) * [`random_contrast`](../../api_docs/python/image.md#random_contrast) @@ -586,6 +587,7 @@ * [`gradients`](../../api_docs/python/train.md#gradients) * [`histogram_summary`](../../api_docs/python/train.md#histogram_summary) * [`image_summary`](../../api_docs/python/train.md#image_summary) + * [`inverse_time_decay`](../../api_docs/python/train.md#inverse_time_decay) * [`LoggingTensorHook`](../../api_docs/python/train.md#LoggingTensorHook) * 
[`LooperThread`](../../api_docs/python/train.md#LooperThread) * [`merge_all_summaries`](../../api_docs/python/train.md#merge_all_summaries) @@ -595,8 +597,11 @@ * [`MonitoredTrainingSession`](../../api_docs/python/train.md#MonitoredTrainingSession) * [`NanLossDuringTrainingError`](../../api_docs/python/train.md#NanLossDuringTrainingError) * [`NanTensorHook`](../../api_docs/python/train.md#NanTensorHook) + * [`natural_exp_decay`](../../api_docs/python/train.md#natural_exp_decay) * [`NewCheckpointReader`](../../api_docs/python/train.md#NewCheckpointReader) * [`Optimizer`](../../api_docs/python/train.md#Optimizer) + * [`piecewise_constant`](../../api_docs/python/train.md#piecewise_constant) + * [`polynomial_decay`](../../api_docs/python/train.md#polynomial_decay) * [`ProximalAdagradOptimizer`](../../api_docs/python/train.md#ProximalAdagradOptimizer) * [`ProximalGradientDescentOptimizer`](../../api_docs/python/train.md#ProximalGradientDescentOptimizer) * [`QueueRunner`](../../api_docs/python/train.md#QueueRunner) @@ -775,6 +780,7 @@ * **[Random variable transformations (contrib)](../../api_docs/python/contrib.distributions.bijector.md)**: * [`Bijector`](../../api_docs/python/contrib.distributions.bijector.md#Bijector) * [`Chain`](../../api_docs/python/contrib.distributions.bijector.md#Chain) + * [`CholeskyOuterProduct`](../../api_docs/python/contrib.distributions.bijector.md#CholeskyOuterProduct) * [`Exp`](../../api_docs/python/contrib.distributions.bijector.md#Exp) * [`Identity`](../../api_docs/python/contrib.distributions.bijector.md#Identity) * [`Inline`](../../api_docs/python/contrib.distributions.bijector.md#Inline) diff --git a/tensorflow/g3doc/api_docs/python/state_ops.md b/tensorflow/g3doc/api_docs/python/state_ops.md index 71f3563a545..237f6541436 100644 --- a/tensorflow/g3doc/api_docs/python/state_ops.md +++ b/tensorflow/g3doc/api_docs/python/state_ops.md @@ -3158,7 +3158,7 @@ a subgraph. 
- - - -### `tf.train.import_meta_graph(meta_graph_or_file, import_scope=None, **kwargs)` {#import_meta_graph} +### `tf.train.import_meta_graph(meta_graph_or_file, clear_devices=False, import_scope=None, **kwargs)` {#import_meta_graph} Recreates a Graph saved in a `MetaGraphDef` proto. @@ -3215,6 +3215,8 @@ device assignments have not changed. * `meta_graph_or_file`: `MetaGraphDef` protocol buffer or filename (including the path) containing a `MetaGraphDef`. +* `clear_devices`: Whether or not to clear the device field for an `Operation` + or `Tensor` during import. * `import_scope`: Optional `string`. Name scope to add. Only used when initializing from protocol buffer. * `**kwargs`: Optional keyed arguments. diff --git a/tensorflow/g3doc/api_docs/python/train.md b/tensorflow/g3doc/api_docs/python/train.md index 6c4e08ab3cf..7b367cf77a4 100644 --- a/tensorflow/g3doc/api_docs/python/train.md +++ b/tensorflow/g3doc/api_docs/python/train.md @@ -995,6 +995,249 @@ learning_step = ( * `ValueError`: if `global_step` is not supplied. +- - - + +### `tf.train.inverse_time_decay(learning_rate, global_step, decay_steps, decay_rate, staircase=False, name=None)` {#inverse_time_decay} + +Applies inverse time decay to the initial learning rate. + +When training a model, it is often recommended to lower the learning rate as +the training progresses. This function applies an inverse decay function +to a provided initial learning rate. It requires an `global_step` value to +compute the decayed learning rate. You can just pass a TensorFlow variable +that you increment at each training step. + +The function returns the decayed learning rate. It is computed as: + +```python +decayed_learning_rate = learning_rate / (1 + decay_rate * t) +``` + +Example: decay 1/t with a rate of 0.5: + +```python +... 
+global_step = tf.Variable(0, trainable=False) +learning_rate = 0.1 +k = 0.5 +learning_rate = tf.train.inverse_time_decay(learning_rate, global_step, k) + +# Passing global_step to minimize() will increment it at each step. +learning_step = ( + tf.train.GradientDescentOptimizer(learning_rate) + .minimize(...my loss..., global_step=global_step) +) +``` + +##### Args: + + +* `learning_rate`: A scalar `float32` or `float64` `Tensor` or a + Python number. The initial learning rate. +* `global_step`: A Python number. + Global step to use for the decay computation. Must not be negative. +* `decay_steps`: How often to apply decay. +* `decay_rate`: A Python number. The decay rate. +* `staircase`: Whether to apply decay in a discrete staircase, as opposed to + continuous, fashion. +* `name`: String. Optional name of the operation. Defaults to + 'InverseTimeDecay'. + +##### Returns: + + A scalar `Tensor` of the same type as `learning_rate`. The decayed + learning rate. + +##### Raises: + + +* `ValueError`: if `global_step` is not supplied. + + +- - - + +### `tf.train.natural_exp_decay(learning_rate, global_step, decay_steps, decay_rate, staircase=False, name=None)` {#natural_exp_decay} + +Applies natural exponential decay to the initial learning rate. + +When training a model, it is often recommended to lower the learning rate as +the training progresses. This function applies an exponential decay function +to a provided initial learning rate. It requires a `global_step` value to +compute the decayed learning rate. You can just pass a TensorFlow variable +that you increment at each training step. + +The function returns the decayed learning rate. It is computed as: + +```python +decayed_learning_rate = learning_rate * exp(-decay_rate * global_step) +``` + +Example: decay exponentially with a base of 0.96: + +```python +...
+global_step = tf.Variable(0, trainable=False) +learning_rate = 0.1 +k = 0.5 +learning_rate = tf.train.natural_exp_decay(learning_rate, global_step, k) + +# Passing global_step to minimize() will increment it at each step. +learning_step = ( + tf.train.GradientDescentOptimizer(learning_rate) + .minimize(...my loss..., global_step=global_step) +) +``` + +##### Args: + + +* `learning_rate`: A scalar `float32` or `float64` `Tensor` or a + Python number. The initial learning rate. +* `global_step`: A Python number. + Global step to use for the decay computation. Must not be negative. +* `decay_steps`: How often to apply decay. +* `decay_rate`: A Python number. The decay rate. +* `staircase`: Whether to apply decay in a discrete staircase, as opposed to + continuous, fashion. +* `name`: String. Optional name of the operation. Defaults to + 'ExponentialTimeDecay'. + +##### Returns: + + A scalar `Tensor` of the same type as `learning_rate`. The decayed + learning rate. + +##### Raises: + + +* `ValueError`: if `global_step` is not supplied. + + +- - - + +### `tf.train.piecewise_constant(x, boundaries, values, name=None)` {#piecewise_constant} + +Piecewise constant from boundaries and interval values. + +Example: use a learning rate that's 1.0 for the first 100000 steps, 0.5 + for steps 100001 to 110000, and 0.1 for any additional steps. + +```python +global_step = tf.Variable(0, trainable=False) +boundaries = [100000, 110000] +values = [1.0, 0.5, 0.1] +learning_rate = tf.train.piecewise_constant(global_step, boundaries, values) + +# Later, whenever we perform an optimization step, we increment global_step. +``` + +##### Args: + + +* `x`: A 0-D scalar `Tensor`. Must be one of the following types: `float32`, + `float64`, `uint8`, `int8`, `int16`, `int32`, `int64`. +* `boundaries`: A list of `Tensor`s or `int`s or `float`s with strictly + increasing entries, and with all elements having the same type as `x`.
+* `values`: A list of `Tensor`s or `float`s or `int`s that specifies the values + for the intervals defined by `boundaries`. It should have one more element + than `boundaries`, and all elements should have the same type. +* `name`: A string. Optional name of the operation. Defaults to + 'PiecewiseConstant'. + +##### Returns: + + A 0-D Tensor. Its value is `values[0]` when `x <= boundaries[0]`, + `values[1]` when `x > boundaries[0]` and `x <= boundaries[1]`, ..., + and `values[-1]` when `x > boundaries[-1]`. + +##### Raises: + + +* `ValueError`: if types of `x` and `boundaries` do not match, or types of all + `values` do not match. + + +- - - + +### `tf.train.polynomial_decay(learning_rate, global_step, decay_steps, end_learning_rate=0.0001, power=1.0, cycle=False, name=None)` {#polynomial_decay} + +Applies a polynomial decay to the learning rate. + +It is commonly observed that a monotonically decreasing learning rate, whose +degree of change is carefully chosen, results in a better performing model. +This function applies a polynomial decay function to a provided initial +`learning_rate` to reach an `end_learning_rate` in the given `decay_steps`. + +It requires a `global_step` value to compute the decayed learning rate. You +can just pass a TensorFlow variable that you increment at each training step. + +The function returns the decayed learning rate. It is computed as: + +```python +global_step = min(global_step, decay_steps) +decayed_learning_rate = (learning_rate - end_learning_rate) * + (1 - global_step / decay_steps) ^ (power) + + end_learning_rate + +``` + +If `cycle` is True then a multiple of `decay_steps` is used, the first one +that is bigger than `global_step`. + +```python +decay_steps = decay_steps * ceil(global_step / decay_steps) +decayed_learning_rate = (learning_rate - end_learning_rate) * + (1 - global_step / decay_steps) ^ (power) + + end_learning_rate + +``` + +Example: decay from 0.1 to 0.01 in 10000 steps using sqrt (i.e.
power=0.5): + +```python +... +global_step = tf.Variable(0, trainable=False) +starter_learning_rate = 0.1 +end_learning_rate = 0.01 +decay_steps = 10000 +learning_rate = tf.train.polynomial_decay(starter_learning_rate, global_step, + decay_steps, end_learning_rate, + power=0.5) +# Passing global_step to minimize() will increment it at each step. +learning_step = ( + tf.train.GradientDescentOptimizer(learning_rate) + .minimize(...my loss..., global_step=global_step) +) +``` + +##### Args: + + +* `learning_rate`: A scalar `float32` or `float64` `Tensor` or a + Python number. The initial learning rate. +* `global_step`: A scalar `int32` or `int64` `Tensor` or a Python number. + Global step to use for the decay computation. Must not be negative. +* `decay_steps`: A scalar `int32` or `int64` `Tensor` or a Python number. + Must be positive. See the decay computation above. +* `end_learning_rate`: A scalar `float32` or `float64` `Tensor` or a + Python number. The minimal end learning rate. +* `power`: A scalar `float32` or `float64` `Tensor` or a + Python number. The power of the polynomial. Defaults to sqrt, i.e. 0.5. +* `cycle`: A boolean, whether or not it should cycle beyond decay_steps. +* `name`: String. Optional name of the operation. Defaults to + 'PolynomialDecay'. + +##### Returns: + + A scalar `Tensor` of the same type as `learning_rate`. The decayed + learning rate. + +##### Raises: + + +* `ValueError`: if `global_step` is not supplied. + + ## Moving Averages @@ -4417,7 +4660,7 @@ Initialize CheckpointSaverHook monitor. Steps per second monitor. 
- - - -#### `tf.train.StepCounterHook.__init__(every_n_steps=100, output_dir=None, summary_writer=None)` {#StepCounterHook.__init__} +#### `tf.train.StepCounterHook.__init__(every_n_steps=100, every_n_secs=None, output_dir=None, summary_writer=None)` {#StepCounterHook.__init__} diff --git a/tensorflow/g3doc/how_tos/meta_graph/index.md b/tensorflow/g3doc/how_tos/meta_graph/index.md index a7bce5101cd..7ff89972756 100644 --- a/tensorflow/g3doc/how_tos/meta_graph/index.md +++ b/tensorflow/g3doc/how_tos/meta_graph/index.md @@ -32,24 +32,37 @@ to and from `MetaGraphDef`, the Python class must implement `to_proto()` and For example, ```Python - def to_proto(self): + def to_proto(self, export_scope=None): + """Converts a `Variable` to a `VariableDef` protocol buffer. + Args: + export_scope: Optional `string`. Name scope to remove. + Returns: - A `VariableDef` protocol buffer. + A `VariableDef` protocol buffer, or `None` if the `Variable` is not + in the specified name scope. """ - var_def = variable_pb2.VariableDef() - var_def.variable_name = self._variable.name - var_def.initializer_name = self.initializer.name - var_def.snapshot_name = self._snapshot.name - if self._save_slice_info: - var_def.save_slice_info_def.MergeFrom(self._save_slice_info.to_proto()) - return var_def + if (export_scope is None or + self._variable.name.startswith(export_scope)): + var_def = variable_pb2.VariableDef() + var_def.variable_name = ops.strip_name_scope( + self._variable.name, export_scope) + var_def.initializer_name = ops.strip_name_scope( + self.initializer.name, export_scope) + var_def.snapshot_name = ops.strip_name_scope( + self._snapshot.name, export_scope) + if self._save_slice_info: + var_def.save_slice_info_def.MergeFrom(self._save_slice_info.to_proto( + export_scope=export_scope)) + return var_def + else: + return None @staticmethod - def from_proto(variable_def): + def from_proto(variable_def, import_scope=None): """Returns a `Variable` object created from `variable_def`.""" - 
return Variable(variable_def=variable_def) + return Variable(variable_def=variable_def, import_scope=import_scope) ops.register_proto_function(ops.GraphKeys.VARIABLES, proto_type=variable_pb2.VariableDef, @@ -228,6 +241,40 @@ Here are some of the typical usage models: sess.run(train_op) ``` +* Import a graph with preset devices. + + Sometimes an exported meta graph is from a training environment that the + importer doesn't have. For example, the model might have been trained + on GPUs, or in a distributed environment with replicas. When importing + such models, it's useful to be able to clear the device settings in + the graph so that we can run it on locally available devices. This can + be achieved by calling `import_meta_graph` with the `clear_devices` + option set to `True`. + + ```Python + with tf.Session() as sess: + new_saver = tf.train.import_meta_graph('my-save-dir/my-model-10000.meta', + clear_devices=True) + new_saver.restore(sess, 'my-save-dir/my-model-10000') + ... + ``` + +* Import within the default graph. + + Sometimes you might want to run `export_meta_graph` and `import_meta_graph` + in codelab using the default graph. In that case, you need to reset + the default graph by calling `tf.reset_default_graph()` first before + running import. + + ```Python + meta_graph_def = tf.train.export_meta_graph() + ... + tf.reset_default_graph() + ... + tf.train.import_meta_graph(meta_graph_def) + ... + ``` + * Retrieve Hyper Parameters ```Python diff --git a/tensorflow/g3doc/tutorials/deep_cnn/index.md b/tensorflow/g3doc/tutorials/deep_cnn/index.md index 89ba53ac6fc..a5302df9147 100644 --- a/tensorflow/g3doc/tutorials/deep_cnn/index.md +++ b/tensorflow/g3doc/tutorials/deep_cnn/index.md @@ -122,7 +122,7 @@ The images are processed as follows: * They are cropped to 24 x 24 pixels, centrally for evaluation or [randomly](../../api_docs/python/constant_op.md#random_crop) for training. 
-* They are [approximately whitened](../../api_docs/python/image.md#per_image_whitening) +* They are [approximately whitened](../../api_docs/python/image.md#per_image_standardization) to make the model insensitive to dynamic range. For training, we additionally apply a series of random distortions to diff --git a/tensorflow/g3doc/tutorials/mnist/pros/index.md b/tensorflow/g3doc/tutorials/mnist/pros/index.md index 72792c6fbe0..6237d7e048e 100644 --- a/tensorflow/g3doc/tutorials/mnist/pros/index.md +++ b/tensorflow/g3doc/tutorials/mnist/pros/index.md @@ -292,7 +292,7 @@ def max_pool_2x2(x): ### First Convolutional Layer We can now implement our first layer. It will consist of convolution, followed -by max pooling. The convolutional will compute 32 features for each 5x5 patch. +by max pooling. The convolution will compute 32 features for each 5x5 patch. Its weight tensor will have a shape of `[5, 5, 1, 32]`. The first two dimensions are the patch size, the next is the number of input channels, and the last is the number of output channels. We will also have a bias vector with @@ -312,7 +312,8 @@ x_image = tf.reshape(x, [-1,28,28,1]) ``` We then convolve `x_image` with the weight tensor, add the -bias, apply the ReLU function, and finally max pool. +bias, apply the ReLU function, and finally max pool. The `max_pool_2x2` method will +reduce the image size to 14x14. 
```python h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1) diff --git a/tensorflow/go/example_inception_inference_test.go b/tensorflow/go/example_inception_inference_test.go index b58942aefb3..09c70044688 100644 --- a/tensorflow/go/example_inception_inference_test.go +++ b/tensorflow/go/example_inception_inference_test.go @@ -28,6 +28,7 @@ import ( "os" "path/filepath" + "github.com/tensorflow/tensorflow/tensorflow/go/op" tf "github.com/tensorflow/tensorflow/tensorflow/go" ) @@ -53,8 +54,14 @@ func Example() { // This example: // - Loads the serialized representation of the pre-trained model into a Graph // - Creates a Session to execute operations on the Graph - // - Converts an image file to a Tensor to provide as input for Graph execution - // - Exectues the graph and prints out the label with the highest probability + // - Converts an image file to a Tensor to provide as input to a Session run + // - Executes the Session and prints out the label with the highest probability + // + // To convert an image file to a Tensor suitable for input to the Inception model, + // this example: + // - Constructs another TensorFlow graph to normalize the image into a + // form suitable for the model (for example, resizing the image) + // - Creates and executes a Session to obtain a Tensor in this normalized form. modeldir := flag.String("dir", "", "Directory containing the trained model files. The directory will be created and the model downloaded into it if necessary") imagefile := flag.String("image", "", "Path of the image to extract labels for") flag.Parse() @@ -89,7 +96,7 @@ func Example() { // For multiple images, session.Run() can be called in a loop (and // concurrently). Furthermore, images can be batched together since the // model accepts batches of image data as input.
- tensor, err := makeTensorFromImageForInception(*imagefile) + tensor, err := makeTensorFromImage(*imagefile) if err != nil { log.Fatal(err) } @@ -136,54 +143,102 @@ func printBestLabel(probabilities []float32, labelsFile string) { fmt.Printf("BEST MATCH: (%2.0f%% likely) %s\n", probabilities[bestIdx]*100.0, labels[bestIdx]) } -// Given an image stored in filename, returns a Tensor which is suitable for -// providing the image data to the pre-defined model. -func makeTensorFromImageForInception(filename string) (*tf.Tensor, error) { - const ( - // Some constants specific to the pre-trained model at: - // https://storage.googleapis.com/download.tensorflow.org/models/inception5h.zip - // - // - The model was trained after with images scaled to 224x224 pixels. - // - The colors, represented as R, G, B in 1-byte each were converted to - // float using (value - Mean)/Std. - // - // If using a different pre-trained model, the values will have to be adjusted. - H, W = 224, 224 - Mean = 117 - Std = float32(1) - ) +// Convert the image in filename to a Tensor suitable as input to the Inception model.
+func makeTensorFromImage(filename string) (*tf.Tensor, error) { + // Load the pixels from the file file, err := os.Open(filename) if err != nil { return nil, err } - defer file.Close() img, _, err := image.Decode(file) + file.Close() if err != nil { return nil, err } - sz := img.Bounds().Size() - if sz.X != W || sz.Y != H { - return nil, fmt.Errorf("input image is required to be %dx%d pixels, was %dx%d", W, H, sz.X, sz.Y) - } - // 4-dimensional input: - // - 1st dimension: Batch size (the model takes a batch of images as - // input, here the "batch size" is 1) - // - 2nd dimension: Rows of the image - // - 3rd dimension: Columns of the row - // - 4th dimension: Colors of the pixel as (B, G, R) - // Thus, the shape is [1, 224, 224, 3] - var ret [1][H][W][3]float32 - for y := 0; y < H; y++ { - for x := 0; x < W; x++ { + // Represent the image as [H][W][B,G,R]byte + contents := make([][][3]byte, img.Bounds().Size().Y) + for y := 0; y < len(contents); y++ { + contents[y] = make([][3]byte, img.Bounds().Size().X) + for x := 0; x < len(contents[y]); x++ { px := x + img.Bounds().Min.X py := y + img.Bounds().Min.Y r, g, b, _ := img.At(px, py).RGBA() - ret[0][y][x][0] = float32((int(b>>8) - Mean)) / Std - ret[0][y][x][1] = float32((int(g>>8) - Mean)) / Std - ret[0][y][x][2] = float32((int(r>>8) - Mean)) / Std + // image.Image uses 16-bits for each color. + // We want 8-bits. 
+ contents[y][x][0] = byte(b >> 8) + contents[y][x][1] = byte(g >> 8) + contents[y][x][2] = byte(r >> 8) } } - return tf.NewTensor(ret) + tensor, err := tf.NewTensor(contents) + if err != nil { + return nil, err + } + // Construct a graph to normalize the image + graph, input, output, err := constructGraphToNormalizeImage() + if err != nil { + return nil, err + } + // Execute that graph to normalize this one image + session, err := tf.NewSession(graph, nil) + if err != nil { + return nil, err + } + defer session.Close() + normalized, err := session.Run( + map[tf.Output]*tf.Tensor{input: tensor}, + []tf.Output{output}, + nil) + if err != nil { + return nil, err + } + return normalized[0], nil +} + +// The inception model takes as input the image described by a Tensor in a very +// specific normalized format (a particular image size, shape of the input tensor, +// normalized pixel values etc.). +// +// This function constructs a graph of TensorFlow operations which takes as input +// the raw pixel values of an image in the form of a Tensor of shape [Height, Width, 3] +// and returns a tensor suitable for input to the inception model. +// +// T[y][x] is the (Blue, Green, Red) values of the pixel at position (x, y) in the image, +// with each color value represented as a single byte. +func constructGraphToNormalizeImage() (graph *tf.Graph, input, output tf.Output, err error) { + // Some constants specific to the pre-trained model at: + // https://storage.googleapis.com/download.tensorflow.org/models/inception5h.zip + // + // - The model was trained with images scaled to 224x224 pixels. + // - The colors, represented as R, G, B in 1-byte each were converted to + // float using (value - Mean)/Scale. + // + // If using a different pre-trained model, the values will have to be adjusted.
+ const ( + H, W = 224, 224 + Mean = float32(117) + Scale = float32(1) + ) + // - input is a 3D tensor of shape [Height, Width, Colors=3], where + // each pixel is represented as a triplet of 1-byte colors + // - ResizeBilinear (and the inception model) takes a 4D tensor of shape + // [BatchSize, Height, Width, Colors=3], where each pixel is + // represented as a triplet of floats + // - Apply normalization on each pixel and use ExpandDims to make + // this single image be a "batch" of size 1 for ResizeBilinear. + s := op.NewScope() + input = op.Placeholder(s, tf.Uint8) + output = op.Div(s, + op.Sub(s, + op.ResizeBilinear(s, + op.ExpandDims(s, + op.Cast(s, input, tf.Float), + op.Const(s.SubScope("make_batch"), int32(0))), + op.Const(s.SubScope("size"), []int32{H, W})), + op.Const(s.SubScope("mean"), Mean)), + op.Const(s.SubScope("scale"), Scale)) + graph, err = s.Finalize() + return graph, input, output, err } func modelFiles(dir string) (modelfile, labelsfile string, err error) { diff --git a/tensorflow/go/genop/internal/genop.go b/tensorflow/go/genop/internal/genop.go index fdc55f5ebce..5d5aa269929 100644 --- a/tensorflow/go/genop/internal/genop.go +++ b/tensorflow/go/genop/internal/genop.go @@ -244,10 +244,14 @@ func {{.Op.Name}} {{if .OptionalAttrs}}, optional ...{{.Op.Name}}Attr{{end -}} ) -{{- /* Construct outputs: len(OpDef.OutputArg) + 1 (for error) */ -}} +{{- /* Construct outputs: len(OpDef.OutputArg) */ -}} -({{range $i,$a := .Op.OutputArg}}{{if $i}}, {{end}}{{Identifier $a.Name}} {{if IsListArg $a}}[]{{end}}tf.Output{{end -}} -{{if .Op.OutputArg}}, {{end}}err error) { +{{if .Op.OutputArg -}} +({{range $i,$a := .Op.OutputArg}}{{if $i}}, {{end}}{{Identifier $a.Name}} {{if IsListArg $a}}[]{{end}}tf.Output{{end -}}) +{{- end }} { + if scope.Err() != nil { + return + } {{if .HasAttrs -}} attrs := map[string]interface{}{ {{- range .RequiredAttrs}}{{printf "%q" .Name}}: {{Identifier .Name}},{{end}}} {{if .OptionalAttrs -}} @@ -262,25 +266,37 @@ func 
{{.Op.Name}} Input: []tf.Input{ {{range .Op.InputArg}}{{if IsListArg .}}tf.OutputList({{Identifier .Name}}){{else}}{{Identifier .Name}}{{end}}, {{end}} }, - {{end}} - {{- if .HasAttrs}}Attrs: attrs,{{end}} + {{- end}} + {{- if .HasAttrs}} + Attrs: attrs, + {{- end}} } - {{if .Op.OutputArg}}op, err :={{else}}_, err ={{end}} scope.Graph().AddOperation(opspec) + {{- if .Op.OutputArg}} {{- if .HasListOutput}} + op := scope.AddOperation(opspec) + if scope.Err() != nil { + return + } var idx int + var err error {{- range $i, $a := .Op.OutputArg}} {{- if IsListArg $a}} if {{Identifier .Name}}, idx, err = makeOutputList(op, idx, {{printf "%q" .Name}}); err != nil { - return {{range $.Op.OutputArg}}{{Identifier .Name}}, {{end}}err + scope.UpdateErr({{printf "%q" $.Op.Name}}, err) + return } {{- else }} {{Identifier .Name}} = op.Output(idx) - {{- end }} - {{- end }} - return {{range .Op.OutputArg}}{{Identifier .Name}}, {{end}}err + {{- end }}{{- /* if IsListArg */}} + {{- end }}{{- /* range .Op.OutputArg */}} + return {{range $i, $a := .Op.OutputArg}}{{if $i}}, {{end}}{{Identifier .Name}}{{end}} {{- else }} - return {{range $i, $a := .Op.OutputArg}}op.Output({{$i}}), {{end}}err - {{- end }} + op := scope.AddOperation(opspec) + return {{range $i, $a := .Op.OutputArg}}{{if $i}}, {{end}}op.Output({{$i}}){{end}} + {{- end }}{{- /* if .HasListOutput */}} + {{- else }} + scope.AddOperation(opspec) + {{- end }}{{- /* if .Op.OutputArg */}} } `)) ) diff --git a/tensorflow/go/genop/internal/genop_test.go b/tensorflow/go/genop/internal/genop_test.go index dade7ce48f7..b3bcd9db052 100644 --- a/tensorflow/go/genop/internal/genop_test.go +++ b/tensorflow/go/genop/internal/genop_test.go @@ -39,12 +39,14 @@ summary: "No. Op." `, wanted: ` // No. Op. 
-func NoOp(scope *Scope) (err error) { +func NoOp(scope *Scope) { + if scope.Err() != nil { + return + } opspec := tf.OpSpec{ Type: "NoOp", } - _, err = scope.Graph().AddOperation(opspec) - return err + scope.AddOperation(opspec) } `, }, @@ -81,15 +83,18 @@ description: "Blah blah", // Returns x + y element-wise. // // Blah blah -func Add(scope *Scope, x tf.Output, y tf.Output) (z tf.Output, err error) { +func Add(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { + if scope.Err() != nil { + return + } opspec := tf.OpSpec{ Type: "Add", Input: []tf.Input{ x, y, }, } - op, err := scope.Graph().AddOperation(opspec) - return op.Output(0), err + op := scope.AddOperation(opspec) + return op.Output(0) } `, }, @@ -117,7 +122,10 @@ summary: "Cast x of type SrcT to y of DstT." `, wanted: ` // Cast x of type SrcT to y of DstT. -func Cast(scope *Scope, x tf.Output, DstT tf.DataType) (y tf.Output, err error) { +func Cast(scope *Scope, x tf.Output, DstT tf.DataType) (y tf.Output) { + if scope.Err() != nil { + return + } attrs := map[string]interface{}{"DstT": DstT} opspec := tf.OpSpec{ Type: "Cast", @@ -126,8 +134,8 @@ func Cast(scope *Scope, x tf.Output, DstT tf.DataType) (y tf.Output, err error) }, Attrs: attrs, } - op, err := scope.Graph().AddOperation(opspec) - return op.Output(0), err + op := scope.AddOperation(opspec) + return op.Output(0) } `, }, @@ -218,7 +226,10 @@ func DecodeJpegAcceptableFraction(value float32) DecodeJpegAttr { // contents: 0-D. The JPEG-encoded image. 
// // Returns 3-D with shape [height, width, channels] -func DecodeJpeg(scope *Scope, contents tf.Output, optional ...DecodeJpegAttr) (image tf.Output, err error) { +func DecodeJpeg(scope *Scope, contents tf.Output, optional ...DecodeJpegAttr) (image tf.Output) { + if scope.Err() != nil { + return + } attrs := map[string]interface{}{} for _, a := range optional { a(attrs) @@ -230,8 +241,47 @@ func DecodeJpeg(scope *Scope, contents tf.Output, optional ...DecodeJpegAttr) (i }, Attrs: attrs, } - op, err := scope.Graph().AddOperation(opspec) - return op.Output(0), err + op := scope.AddOperation(opspec) + return op.Output(0) +} +`, + }, + { + tag: "MultipleOutputs", + opdef: ` +name: "TwoOutputs" +input_arg: < + name: "input" + type_attr: "T" +> +output_arg < + name: "x" + type_attr: "T" +> +output_arg < + name: "y" + type_attr: "T" +> +attr: < + name: "T" + type: "type" +> +summary: "Op that produces multiple outputs" +`, + wanted: ` +// Op that produces multiple outputs +func TwoOutputs(scope *Scope, input tf.Output) (x tf.Output, y tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "TwoOutputs", + Input: []tf.Input{ + input, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1) } `, }, @@ -290,7 +340,10 @@ func ShapeNOutType(value tf.DataType) ShapeNAttr { // Returns shape of tensors. // // Some description here. 
-func ShapeN(scope *Scope, input []tf.Output, optional ...ShapeNAttr) (output []tf.Output, err error) { +func ShapeN(scope *Scope, input []tf.Output, optional ...ShapeNAttr) (output []tf.Output) { + if scope.Err() != nil { + return + } attrs := map[string]interface{}{} for _, a := range optional { a(attrs) @@ -302,12 +355,17 @@ func ShapeN(scope *Scope, input []tf.Output, optional ...ShapeNAttr) (output []t }, Attrs: attrs, } - op, err := scope.Graph().AddOperation(opspec) - var idx int - if output, idx, err = makeOutputList(op, idx, "output"); err != nil { - return output, err + op := scope.AddOperation(opspec) + if scope.Err() != nil { + return } - return output, err + var idx int + var err error + if output, idx, err = makeOutputList(op, idx, "output"); err != nil { + scope.UpdateErr("ShapeN", err) + return + } + return output } `, }, @@ -325,11 +383,11 @@ func ShapeN(scope *Scope, input []tf.Output, optional ...ShapeNAttr) (output []t } got, err := format.Source(buf.Bytes()) if err != nil { - t.Fatal(err) + t.Fatalf("Unable to format: %v\n%s", err, buf.Bytes()) } want, err := format.Source([]byte(test.wanted)) if err != nil { - t.Fatal(err) + t.Fatalf("Unable to format: %v\n%s", err, test.wanted) } if !bytes.Equal(got, want) { t.Fatalf("Got:\n%s\nWant:\n%s\n", got, want) diff --git a/tensorflow/go/op/op.go b/tensorflow/go/op/op.go index dd79c2076ac..3d820a60e69 100644 --- a/tensorflow/go/op/op.go +++ b/tensorflow/go/op/op.go @@ -28,24 +28,23 @@ import ( ) // Const adds an operation to graph that produces value as output. 
-func Const(scope *Scope, value interface{}) (tf.Output, error) { - if t, ok := value.(*tf.Tensor); ok { - return makeConst(scope, t) +func Const(scope *Scope, value interface{}) (output tf.Output) { + if scope.Err() != nil { + return } - t, err := tf.NewTensor(value) - if err != nil { - return tf.Output{}, err + t, ok := value.(*tf.Tensor) + if !ok { + var err error + if t, err = tf.NewTensor(value); err != nil { + scope.UpdateErr("Const", err) + return + } } - return makeConst(scope, t) -} - -func makeConst(scope *Scope, t *tf.Tensor) (tf.Output, error) { - op, err := scope.Graph().AddOperation(tf.OpSpec{ + return scope.AddOperation(tf.OpSpec{ Name: scope.opName("Const"), Type: "Const", Attrs: map[string]interface{}{ "dtype": t.DataType(), "value": t, - }}) - return op.Output(0), err + }}).Output(0) } diff --git a/tensorflow/go/op/scope.go b/tensorflow/go/op/scope.go index 25ebbae70f6..346c756f563 100644 --- a/tensorflow/go/op/scope.go +++ b/tensorflow/go/op/scope.go @@ -16,33 +16,60 @@ package op import ( "fmt" + "runtime/debug" tf "github.com/tensorflow/tensorflow/tensorflow/go" ) -// Scope encapsulates common properties of operations being added to a Graph. +// Scope encapsulates common operation properties when building a Graph. // -// Scopes allow common properties (such as a name prefix) to be specified -// once for multiple operations being added to a graph. The With* methods -// create derivative scopes that encapsulate the same set of properties -// as the parent Scope, except for the one being changed by the specific -// With* method. +// A Scope object (and its derivates, e.g., obtained from Scope.SubScope) +// act as a builder for graphs. They allow common properties (such as +// a name prefix) to be specified for multiple operations being added +// to the graph. // -// Scopes are NOT safe for concurrent use by multiple goroutines. 
+// A Scope object and all its derivates (e.g., obtained from Scope.SubScope) +// are not safe for concurrent use by multiple goroutines. type Scope struct { graph *tf.Graph namemap map[string]int namespace string + err *scopeErr +} + +// scopeErr is used to share errors between all derivatives of a root scope. +type scopeErr struct { + err error } // NewScope creates a Scope initialized with an empty Graph. func NewScope() *Scope { - return &Scope{graph: tf.NewGraph(), namemap: make(map[string]int)} + return &Scope{graph: tf.NewGraph(), namemap: make(map[string]int), err: new(scopeErr)} } -// Graph returns the Graph which this Scope and its children are -func (s *Scope) Graph() *tf.Graph { - return s.graph +// Finalize returns the Graph on which this scope operates on and renders s +// unusable. If there was an error during graph construction, that error is +// returned instead. +func (s *Scope) Finalize() (*tf.Graph, error) { + if err := s.Err(); err != nil { + return nil, err + } + s.err.err = fmt.Errorf("Scope has been finalized and is no longer usable") + return s.graph, nil +} + +// AddOperation adds the operation to the Graph managed by s. +// +// See Graph.AddOperation. +func (s *Scope) AddOperation(args tf.OpSpec) *tf.Operation { + if s.Err() != nil { + return nil + } + op, err := s.graph.AddOperation(args) + if err != nil { + s.UpdateErr(args.Type, err) + } + return op } // SubScope returns a new Scope which will cause all operations added to the @@ -57,6 +84,25 @@ func (s *Scope) SubScope(namespace string) *Scope { graph: s.graph, namemap: make(map[string]int), namespace: namespace, + err: s.err, + } +} + +// Err returns the error, if any, encountered during the construction +// of the Graph managed by s. +// +// Once Err returns a non-nil error, all future calls will do the same, +// indicating that the scope should be discarded as the graph could not +// be constructed. 
+func (s *Scope) Err() error { + return s.err.err +} + +// UpdateErr is used to notify Scope of any graph construction errors +// while creating the operation op. +func (s *Scope) UpdateErr(op string, err error) { + if s.err.err == nil { + s.err.err = fmt.Errorf("failed to add operation %q: %v (Stacktrace: %s)", op, err, debug.Stack()) } } diff --git a/tensorflow/go/op/scope_test.go b/tensorflow/go/op/scope_test.go index ba0a183bb9c..4fcb1a56d56 100644 --- a/tensorflow/go/op/scope_test.go +++ b/tensorflow/go/op/scope_test.go @@ -22,13 +22,6 @@ import ( ) func TestScopeSubScope(t *testing.T) { - constant := func(s *Scope) string { - c, err := Const(s, int64(1)) - if err != nil { - t.Fatal(err) - } - return c.Op.Name() - } var ( root = NewScope() sub1 = root.SubScope("x") @@ -37,54 +30,89 @@ func TestScopeSubScope(t *testing.T) { sub2a = sub2.SubScope("y") ) testdata := []struct { - got, want string + scope *Scope + name string }{ - {constant(root), "Const"}, - {constant(sub1), "x/Const"}, - {constant(sub1a), "x/y/Const"}, - {constant(sub2), "x_1/Const"}, - {constant(sub2a), "x_1/y/Const"}, + {root, "Const"}, + {sub1, "x/Const"}, + {sub1a, "x/y/Const"}, + {sub2, "x_1/Const"}, + {sub2a, "x_1/y/Const"}, } - for idx, test := range testdata { - if test.got != test.want { - t.Errorf("#%d: Got %q, want %q", idx, test.got, test.want) + for _, test := range testdata { + c := Const(test.scope, int64(1)) + if err := test.scope.Err(); err != nil { + t.Fatalf("%q: %v", test.name, err) + } + if got := c.Op.Name(); got != test.name { + t.Errorf("%q: Got %q", test.name, got) } } +} +func TestScopeSubScopeErrors(t *testing.T) { + var ( + root = NewScope() + sub = root.SubScope("x") + ) + // Error on the root, even after sub has been created should be propagated. + // Force an error by creating a Const which has a type that does not + // translate to the TensorFlow type system. 
+ Const(root, int(1)) + if err := root.Err(); err == nil { + t.Fatal("Expected error") + } + if err := sub.Err(); err == nil { + t.Errorf("Root scope had error [%v], but sub-scope did not", root.Err()) + } +} + +func TestScopeFinalize(t *testing.T) { + var ( + root = NewScope() + sub1 = root.SubScope("x") + sub2 = sub1.SubScope("y") + ) + if _, err := sub1.Finalize(); err != nil { + t.Fatal(err) + } + if err := root.Err(); err == nil { + t.Error("Root scope's Err() should be non-nil once Finalize has been called") + } + if err := sub2.Err(); err == nil { + t.Error("Sub scope's Err() should be non-nil once Finalize has been called") + } } func Example() { // This example creates a Graph that multiplies a constant matrix with // a matrix to be provided during graph execution (via // tensorflow.Session). - scope := NewScope() - var m1, m2, product tf.Output - var err error - // A constant 2x1 matrix - if m1, err = Const(scope, [][]float32{{10}, {20}}); err != nil { - panic(err) - } - // A placeholder for another matrix - if m2, err = Placeholder(scope, tf.Float); err != nil { - panic(err) - } - // product = m1 x transpose(m2) - if product, err = MatMul(scope, m1, m2, MatMulTransposeB(true)); err != nil {// m1 x transpose(m2) - panic(err) + s := NewScope() + input := Placeholder(s, tf.Float) // Matrix to be provided to Session.Run + output := MatMul(s, + Const(s, [][]float32{{10}, {20}}), // Constant 2x1 matrix + input, + MatMulTransposeB(true)) + if s.Err() != nil { + panic(s.Err()) } // Shape of the product: The number of rows is fixed by m1, but the // number of columns will depend on m2, which is unknown. 
- shape, _ := product.Shape() + shape, _ := output.Shape() fmt.Println(shape) // Output: [2 -1] } func ExampleScope_SubScope() { var ( - s = NewScope() - c1, _ = Const(s.SubScope("x"), int64(1)) - c2, _ = Const(s.SubScope("x"), int64(1)) + s = NewScope() + c1 = Const(s.SubScope("x"), int64(1)) + c2 = Const(s.SubScope("x"), int64(1)) ) + if s.Err() != nil { + panic(s.Err()) + } fmt.Println(c1.Op.Name(), c2.Op.Name()) // Output: x/Const x_1/Const } diff --git a/tensorflow/models/image/alexnet/alexnet_benchmark.py b/tensorflow/models/image/alexnet/alexnet_benchmark.py index 18ac4e13292..af13a075b55 100644 --- a/tensorflow/models/image/alexnet/alexnet_benchmark.py +++ b/tensorflow/models/image/alexnet/alexnet_benchmark.py @@ -36,6 +36,7 @@ from __future__ import print_function import argparse from datetime import datetime import math +import sys import time from six.moves import xrange # pylint: disable=redefined-builtin @@ -241,6 +242,5 @@ if __name__ == '__main__': default=100, help='Number of batches to run.' ) - FLAGS = parser.parse_args() - - tf.app.run() + FLAGS, unparsed = parser.parse_known_args() + tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) diff --git a/tensorflow/models/image/imagenet/classify_image.py b/tensorflow/models/image/imagenet/classify_image.py index 3759e88b791..9014ced0267 100644 --- a/tensorflow/models/image/imagenet/classify_image.py +++ b/tensorflow/models/image/imagenet/classify_image.py @@ -223,6 +223,5 @@ if __name__ == '__main__': default=5, help='Display this many predictions.' 
) - FLAGS = parser.parse_args() - - tf.app.run() + FLAGS, unparsed = parser.parse_known_args() + tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) diff --git a/tensorflow/models/image/mnist/convolutional.py b/tensorflow/models/image/mnist/convolutional.py index 7630c59c99b..6108139d1dd 100644 --- a/tensorflow/models/image/mnist/convolutional.py +++ b/tensorflow/models/image/mnist/convolutional.py @@ -118,7 +118,7 @@ def error_rate(predictions, labels): predictions.shape[0]) -def main(argv=None): # pylint: disable=unused-argument +def main(_): if FLAGS.self_test: print('Running self-test.') train_data, train_labels = fake_data(256) @@ -326,14 +326,12 @@ if __name__ == '__main__': '--use_fp16', default=False, help='Use half floats instead of full floats if True.', - action='store_true' - ) + action='store_true') parser.add_argument( '--self_test', default=False, action='store_true', - help='True if running a self test.' - ) - FLAGS = parser.parse_args() + help='True if running a self test.') - tf.app.run() + FLAGS, unparsed = parser.parse_known_args() + tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index e3313b43352..6bb86a552a8 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -1861,6 +1861,7 @@ tf_py_wrap_cc( "//tensorflow/c:tf_status_helper", "//tensorflow/core:lib", "//tensorflow/core/distributed_runtime:server_lib", + "//tensorflow/tools/tfprof/internal:print_model_analysis", "//util/python:python_headers", ] + tf_additional_lib_deps(), ) @@ -1930,7 +1931,6 @@ py_library( # Just used by tests. 
tf_cuda_library( name = "construction_fails_op", - testonly = 1, srcs = ["client/test_construction_fails_op.cc"], deps = [ "//tensorflow/core", @@ -2130,7 +2130,6 @@ py_tests( "summary/event_multiplexer_test.py", "summary/impl/directory_watcher_test.py", "summary/impl/event_file_loader_test.py", - "summary/impl/gcs_file_loader_test.py", "summary/impl/reservoir_test.py", "summary/summary_test.py", "summary/writer/writer_test.py", diff --git a/tensorflow/python/debug/session_debug_test.py b/tensorflow/python/debug/session_debug_test.py index 56f1fcdc6a3..48d7e944844 100644 --- a/tensorflow/python/debug/session_debug_test.py +++ b/tensorflow/python/debug/session_debug_test.py @@ -131,6 +131,72 @@ class SessionDebugTest(test_util.TensorFlowTestCase): self.assertGreaterEqual( dump.get_rel_timestamps("%s/read" % v_name, 0, "DebugIdentity")[0], 0) + def testDifferentWatchesOnDifferentRuns(self): + """Test watching different tensors on different runs of the same graph.""" + + with session.Session() as sess: + u_init_val = np.array([[5.0, 3.0], [-1.0, 0.0]]) + v_init_val = np.array([[2.0], [-1.0]]) + + # Use node names with overlapping namespace (i.e., parent directory) to + # test concurrent, non-racing directory creation. + u_name = "diff_Watch/u" + v_name = "diff_Watch/v" + + u_init = constant_op.constant(u_init_val, shape=[2, 2]) + u = variables.Variable(u_init, name=u_name) + v_init = constant_op.constant(v_init_val, shape=[2, 1]) + v = variables.Variable(v_init, name=v_name) + + w = math_ops.matmul(u, v, name="diff_Watch/matmul") + + u.initializer.run() + v.initializer.run() + + for i in xrange(2): + run_options = config_pb2.RunOptions(output_partition_graphs=True) + + run_dump_root = os.path.join(self._dump_root, "run_%d" % i) + debug_url = "file://%s" % run_dump_root + + if i == 0: + # First debug run: Add debug tensor watch for u. 
+ self._addDebugTensorWatch( + run_options, "%s/read" % u_name, 0, debug_urls=[debug_url]) + else: + # Second debug run: Add debug tensor watch for v. + self._addDebugTensorWatch( + run_options, "%s/read" % v_name, 0, debug_urls=[debug_url]) + + run_metadata = config_pb2.RunMetadata() + + # Invoke Session.run(). + sess.run(w, options=run_options, run_metadata=run_metadata) + + self.assertEqual(self._expected_partition_graph_count, + len(run_metadata.partition_graphs)) + + dump = debug_data.DebugDumpDir( + run_dump_root, partition_graphs=run_metadata.partition_graphs) + + # Each run should have generated only one dumped tensor, not two. + self.assertEqual(1, dump.size) + + if i == 0: + self.assertAllClose([u_init_val], + dump.get_tensors("%s/read" % u_name, 0, + "DebugIdentity")) + self.assertGreaterEqual( + dump.get_rel_timestamps("%s/read" % u_name, 0, + "DebugIdentity")[0], 0) + else: + self.assertAllClose([v_init_val], + dump.get_tensors("%s/read" % v_name, 0, + "DebugIdentity")) + self.assertGreaterEqual( + dump.get_rel_timestamps("%s/read" % v_name, 0, + "DebugIdentity")[0], 0) + def testDumpStringTensorsToFileSystem(self): with session.Session() as sess: str1_init_val = np.array(b"abc") diff --git a/tensorflow/python/framework/cpp_shape_inference.cc b/tensorflow/python/framework/cpp_shape_inference.cc index 0d8703fe8fe..bb5a57e617c 100644 --- a/tensorflow/python/framework/cpp_shape_inference.cc +++ b/tensorflow/python/framework/cpp_shape_inference.cc @@ -73,8 +73,10 @@ Status RunCppShapeInferenceImpl( } // Run shape inference. - tensorflow::shape_inference::InferenceContext c(&node, op_reg_data->op_def, - input_shapes, input_tensors); + // TODO(cwhipkey): pass a value for input_tensors_as_shapes. 
+ tensorflow::shape_inference::InferenceContext c( + &node, op_reg_data->op_def, input_shapes, input_tensors, + {} /* input_tensors_as_shapes */); TF_RETURN_IF_ERROR(c.construction_status()); TF_RETURN_IF_ERROR(c.Run(op_reg_data->shape_inference_fn)); diff --git a/tensorflow/python/framework/importer.py b/tensorflow/python/framework/importer.py index 13021d885ba..e7f63d3e80a 100644 --- a/tensorflow/python/framework/importer.py +++ b/tensorflow/python/framework/importer.py @@ -400,18 +400,26 @@ def import_graph_def(graph_def, input_map=None, return_elements=None, # would cause graphs to fail if imported after correcting. # # This can be removed after 2017/03/08. - if op.type not in ['RandomShuffleQueue', 'PaddingFIFOQueue', - 'FIFOQueue', 'PriorityQueue', 'QueueSize', - 'Stack', 'Barrier', 'BarrierReadySize', - 'BarrierIncompleteSize', 'HashTable', - 'MutableHashTable', - 'MutableHashTableOfTensors', 'Mutex', - 'CuckooTable', 'IndexTable', - 'WholeFileReader', 'TextLineReader', - 'FixedLengthRecordReader', - 'TFRecordReader', 'IdentityReader', - 'RefSwitch', 'RefEnter', 'RefNextIteration', - 'RefMerge', 'RefIdentity']: + if op.type in ['RandomShuffleQueue', 'PaddingFIFOQueue', + 'FIFOQueue', 'PriorityQueue', 'QueueSize', + 'Stack', 'Barrier', 'BarrierReadySize', + 'BarrierIncompleteSize', 'HashTable', + 'MutableHashTable', + 'MutableHashTableOfTensors', 'Mutex', + 'CuckooTable', 'IndexTable', + 'WholeFileReader', 'TextLineReader', + 'FixedLengthRecordReader', + 'TFRecordReader', 'IdentityReader', + 'RefSwitch', 'RefEnter', 'RefNextIteration', + 'RefMerge', 'RefIdentity']: + pass + elif op.type in [ + 'ConditionalAccumulator', 'SparseConditionalAccumulator', + 'Table' + ]: + # This can be removed after 2017/04/24. 
+ pass + else: raise e del op.node_def.attr['_output_shapes'] diff --git a/tensorflow/python/framework/meta_graph_test.py b/tensorflow/python/framework/meta_graph_test.py index f6c1db6f2af..e654331271d 100644 --- a/tensorflow/python/framework/meta_graph_test.py +++ b/tensorflow/python/framework/meta_graph_test.py @@ -384,6 +384,32 @@ class ScopedMetaGraphTest(tf.test.TestCase): orig_meta_graph, import_scope="new_hidden1", input_map={"$unbound_inputs_MatMul": tf.constant(4.0, shape=[2, 2])}) + def testClearDevices(self): + graph1 = tf.Graph() + with graph1.as_default(): + with tf.device("/device:CPU:0"): + a = tf.Variable(tf.constant(1.0, shape=[2, 2]), name="a") + with tf.device("/job:ps/replica:0/task:0/gpu:0"): + b = tf.Variable(tf.constant(2.0, shape=[2, 2]), name="b") + with tf.device("/job:localhost/replica:0/task:0/cpu:0"): + tf.matmul(a, b, name="matmul") + + self.assertEqual("/device:CPU:0", str(graph1.as_graph_element("a").device)) + self.assertEqual("/job:ps/replica:0/task:0/device:GPU:0", + str(graph1.as_graph_element("b").device)) + self.assertEqual("/job:localhost/replica:0/task:0/device:CPU:0", + str(graph1.as_graph_element("matmul").device)) + + orig_meta_graph, _ = meta_graph.export_scoped_meta_graph(graph=graph1) + + graph2 = tf.Graph() + with graph2.as_default(): + meta_graph.import_scoped_meta_graph(orig_meta_graph, clear_devices=True) + + self.assertEqual("", str(graph2.as_graph_element("a").device)) + self.assertEqual("", str(graph2.as_graph_element("b").device)) + self.assertEqual("", str(graph2.as_graph_element("matmul").device)) + if __name__ == "__main__": tf.test.main() diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py index 0b028c28390..635d592912f 100644 --- a/tensorflow/python/framework/ops.py +++ b/tensorflow/python/framework/ops.py @@ -4053,6 +4053,7 @@ class GraphKeys(object): READY_FOR_LOCAL_INIT_OP = "ready_for_local_init_op" SUMMARY_OP = "summary_op" GLOBAL_STEP = "global_step" + TRAIN_OP = 
"train_op" # Key for control flow context. COND_CONTEXT = "cond_context" diff --git a/tensorflow/python/kernel_tests/array_ops_test.py b/tensorflow/python/kernel_tests/array_ops_test.py index 8c0021c3cb0..7e56bb5843b 100644 --- a/tensorflow/python/kernel_tests/array_ops_test.py +++ b/tensorflow/python/kernel_tests/array_ops_test.py @@ -775,22 +775,23 @@ class StridedSliceBenchmark(tf.test.Benchmark): class StridedSliceAssignChecker(object): - def __init__(self, test, x, tensor_type=tf.int32): + def __init__(self, test, x, tensor_type=tf.float32): self.tensor_type = tensor_type self.test = test self.x = tf.cast(tf.constant(x, dtype=tf.float32), dtype=tensor_type) self.x_np = np.array(x) def __setitem__(self, index, value): - with self.test.test_session() as sess: - var = tf.Variable(self.x) - sess.run(tf.initialize_variables([var])) - val = sess.run(var[index].assign( - tf.constant( - value, dtype=self.tensor_type))) - valnp = np.copy(self.x_np) - valnp[index] = np.array(value) - self.test.assertAllEqual(val, valnp) + for use_gpu in [False, True]: + with self.test.test_session(use_gpu=use_gpu) as sess: + var = tf.Variable(self.x) + sess.run(tf.initialize_variables([var])) + val = sess.run(var[index].assign( + tf.constant( + value, dtype=self.tensor_type))) + valnp = np.copy(self.x_np) + valnp[index] = np.array(value) + self.test.assertAllEqual(val, valnp) class SliceAssignTest(test_util.TensorFlowTestCase): diff --git a/tensorflow/python/kernel_tests/constant_op_test.py b/tensorflow/python/kernel_tests/constant_op_test.py index 14fe95dea66..0ba17208e77 100644 --- a/tensorflow/python/kernel_tests/constant_op_test.py +++ b/tensorflow/python/kernel_tests/constant_op_test.py @@ -322,7 +322,7 @@ class ZerosTest(tf.test.TestCase): class ZerosLikeTest(tf.test.TestCase): def _compareZeros(self, dtype, use_gpu): - with self.test_session(use_gpu=False): + with self.test_session(use_gpu=use_gpu): # Creates a tensor of non-zero values with shape 2 x 3. 
numpy_dtype = dtype.as_numpy_dtype d = tf.constant(np.ones((2, 3), dtype=numpy_dtype), dtype=dtype) @@ -342,7 +342,7 @@ class ZerosLikeTest(tf.test.TestCase): self._compareZeros(dtype, False) def testZerosLikeGPU(self): - for dtype in [tf.float32, tf.float64, tf.int32]: + for dtype in [tf.float32, tf.float64, tf.int32, tf.bool]: self._compareZeros(dtype, True) def testZerosLikePartialShape(self): diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py index 47e8029a9b7..da5d51b0e19 100644 --- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py +++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py @@ -252,6 +252,15 @@ class ControlFlowTest(tf.test.TestCase): result = exit_i.eval() self.assertAllEqual(10, result) + def testDifferentFrame(self): + with self.test_session(): + data = tf.placeholder(tf.float32, shape=[]) + enter_1 = control_flow_ops.enter(data, "foo_1", False) + enter_2 = control_flow_ops.enter(data, "foo_2", False) + res = tf.add(enter_1, enter_2) + with self.assertRaisesOpError("has inputs from different frames"): + res.eval(feed_dict={data: 1.0}) + def testCondBool(self): values = tf.constant(10) fn1 = lambda: tf.add(values, 1) @@ -507,7 +516,7 @@ class ControlFlowTest(tf.test.TestCase): ] self.assertAllEqual(dense_gv, [0.0, 2.0]) - # Microbenchmark: 250,000 iterations/s. + # Microbenchmark: 256,000 iterations/s. 
def testWhile_1(self): with self.test_session(): n = tf.constant(0) diff --git a/tensorflow/python/kernel_tests/decode_raw_op_test.py b/tensorflow/python/kernel_tests/decode_raw_op_test.py index f3cf0643fa0..bb707b32f7e 100644 --- a/tensorflow/python/kernel_tests/decode_raw_op_test.py +++ b/tensorflow/python/kernel_tests/decode_raw_op_test.py @@ -65,10 +65,7 @@ class DecodeRawOpTest(tf.test.TestCase): self.assertEqual([None, None], decode.get_shape().as_list()) expected_result = np.matrix([[1, -2, -3, 4]], dtype=np.float16) - result = decode.eval( - feed_dict={ - in_bytes: [expected_result.tobytes()] - }) + result = decode.eval(feed_dict={in_bytes: [expected_result.tostring()]}) self.assertAllEqual(expected_result, result) diff --git a/tensorflow/python/kernel_tests/functional_ops_test.py b/tensorflow/python/kernel_tests/functional_ops_test.py index e73d61d2617..fe20ec7ebc0 100644 --- a/tensorflow/python/kernel_tests/functional_ops_test.py +++ b/tensorflow/python/kernel_tests/functional_ops_test.py @@ -114,6 +114,13 @@ class FunctionalOpsTest(tf.test.TestCase): r = tf.map_fn(lambda x: tf.mul(tf.add(x, 3), 2), elems) self.assertAllEqual(np.array([(x + 3) * 2 for x in nums]), r.eval()) + def testMapSparseTensor(self): + with self.test_session(): + with self.assertRaises(TypeError): + tf.map_fn(lambda x: x, tf.SparseTensor(indices=[[0, 0], [0, 1], [1, 0]], + values=tf.constant([0, 1, 2]), + shape=[2, 2])) + def testMap_Scoped(self): with self.test_session() as sess: diff --git a/tensorflow/python/kernel_tests/softmax_op_test.py b/tensorflow/python/kernel_tests/softmax_op_test.py index 42201f7ae19..7c591707e99 100644 --- a/tensorflow/python/kernel_tests/softmax_op_test.py +++ b/tensorflow/python/kernel_tests/softmax_op_test.py @@ -120,9 +120,8 @@ class SoftmaxTest(tf.test.TestCase): def testDouble(self): self._testSoftmax( - np.array([[1., 1., 1., 1.], [1., 2., 3., 4.]]).astype(np.float64), - use_gpu=False) - self._testOverflow(use_gpu=False) + np.array([[1., 1., 1., 
1.], [1., 2., 3., 4.]]).astype(np.float64)) + self._testOverflow() def test1DTesnorAsInput(self): self._testSoftmax( diff --git a/tensorflow/python/kernel_tests/sparse_matmul_op_test.py b/tensorflow/python/kernel_tests/sparse_matmul_op_test.py index c6a11ee4cc9..9f789798b0c 100644 --- a/tensorflow/python/kernel_tests/sparse_matmul_op_test.py +++ b/tensorflow/python/kernel_tests/sparse_matmul_op_test.py @@ -64,6 +64,20 @@ class SparseMatMulTest(tf.test.TestCase): for y_dtype in (tf.float32, tf.bfloat16): self._testCpuMatmul(x, y, x_dtype=x_dtype, y_dtype=y_dtype) + def testZeroDim(self): + x = np.ones((4, 0)).astype(np.float32) + y = np.ones((0, 3)).astype(np.float32) + for x_dtype in (tf.float32, tf.bfloat16): + for y_dtype in (tf.float32, tf.bfloat16): + self._testCpuMatmul(x, y, x_dtype=x_dtype, y_dtype=y_dtype) + + def testEmpty(self): + x = np.ones((0, 0)).astype(np.float32) + y = np.ones((0, 0)).astype(np.float32) + for x_dtype in (tf.float32, tf.bfloat16): + for y_dtype in (tf.float32, tf.bfloat16): + self._testCpuMatmul(x, y, x_dtype=x_dtype, y_dtype=y_dtype) + # Tests setting one dimension to be a high value. 
def testLarge(self): r1 = np.random.randint(6000, 20000) diff --git a/tensorflow/python/kernel_tests/variable_scope_test.py b/tensorflow/python/kernel_tests/variable_scope_test.py index 1e2db3e565e..5010b79a6a7 100644 --- a/tensorflow/python/kernel_tests/variable_scope_test.py +++ b/tensorflow/python/kernel_tests/variable_scope_test.py @@ -637,19 +637,19 @@ class VariableScopeTest(tf.test.TestCase): def testGetVarWithDevice(self): g = tf.Graph() - varname_shape = [] + varname_type = [] def device_func(op): if op.type == "Variable": - varname_shape.append((op.name, tf.TensorShape(op.get_attr("shape")))) + varname_type.append((op.name, op.get_attr("dtype"))) return "/gpu:0" with g.as_default(): with tf.device(device_func): - _ = tf.get_variable("x", (100, 200)) # init fn - _ = tf.get_variable("y", initializer=numpy.arange(73)) # init constant - self.assertEqual(varname_shape[0], ("x", tf.TensorShape([100, 200]))) - self.assertEqual(varname_shape[1], ("y", tf.TensorShape([73]))) + _ = tf.get_variable("x", (100, 200)) + _ = tf.get_variable("y", dtype=tf.int64, initializer=numpy.arange(73)) + self.assertEqual(varname_type[0], ("x", tf.float32)) + self.assertEqual(varname_type[1], ("y", tf.int64)) def axis0_into1_partitioner(shape=None, **unused_kwargs): diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index 6abce62ecc2..dcb57d7e0c3 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -1905,7 +1905,6 @@ def _EditDistanceShape(op): return common_shapes.call_cpp_shape_fn(op, input_tensors_needed=[2, 5]) -# The remaining ops do not change the shape of their inputs. 
@ops.RegisterShape("Quantize") @ops.RegisterShape("Dequantize") def _QuantizeDequantizeShape(op): @@ -1914,6 +1913,45 @@ def _QuantizeDequantizeShape(op): return common_shapes.unchanged_shape(op) +@ops.RegisterShape("FakeQuantWithMinMaxArgs") +def _FakeQuantWithMinMaxArgsShape(op): + """Shape function for FakeQuantWithMinMaxArgs op: preserve the input shape.""" + return [op.inputs[0].get_shape()] + + +@ops.RegisterGradient("FakeQuantWithMinMaxArgs") +def _FakeQuantWithMinMaxArgsGradient(op, grad): + """Gradient for FakeQuantWithMinMaxArgs op.""" + return fake_quant_with_min_max_args_gradient(grad, op.inputs[0]) + + +@ops.RegisterShape("FakeQuantWithMinMaxVars") +def _FakeQuantWithMinMaxVarsShape(op): + """Shape function for FakeQuantWithMinMaxVars op: preserve the input shape.""" + return [op.inputs[0].get_shape()] + + +@ops.RegisterGradient("FakeQuantWithMinMaxVars") +def _FakeQuantWithMinMaxVarsGradient(op, grad): + """Gradient for FakeQuantWithMinMaxVars op.""" + return fake_quant_with_min_max_vars_gradient(grad, op.inputs[0], op.inputs[1], + op.inputs[2]) + + +@ops.RegisterShape("FakeQuantWithMinMaxVarsPerChannel") +def _FakeQuantWithMinMaxVarsPerChannelShape(op): + """Shape function for FakeQuantWithMinMaxVarsPerChannel op: input shape.""" + return [op.inputs[0].get_shape()] + + +@ops.RegisterGradient("FakeQuantWithMinMaxVarsPerChannel") +def _FakeQuantWithMinMaxVarsPerChannelGradient(op, grad): + """Gradient for FakeQuantWithMinMaxVarsPerChannel op.""" + return fake_quant_with_min_max_vars_per_channel_gradient(grad, op.inputs[0], + op.inputs[1], + op.inputs[2]) + + ops.RegisterShape("ExtractImagePatches")(common_shapes.call_cpp_shape_fn) diff --git a/tensorflow/python/ops/data_flow_ops.py b/tensorflow/python/ops/data_flow_ops.py index 06d7308b384..d2de88a9ca9 100644 --- a/tensorflow/python/ops/data_flow_ops.py +++ b/tensorflow/python/ops/data_flow_ops.py @@ -1455,14 +1455,14 @@ class SparseConditionalAccumulator(ConditionalAccumulatorBase): 
dense_shape=return_val.shape) -ops.RegisterShape("AccumulatorNumAccumulated")(common_shapes.scalar_shape) -ops.RegisterShape("AccumulatorSetGlobalStep")(common_shapes.no_outputs) - -ops.RegisterShape("ConditionalAccumulator")(common_shapes.scalar_shape) - -ops.RegisterShape("AccumulatorApplyGradient")(common_shapes.no_outputs) -ops.RegisterShape("AccumulatorTakeGradient")(common_shapes.unknown_shape) - -ops.RegisterShape("SparseConditionalAccumulator")(common_shapes.scalar_shape) -ops.RegisterShape("SparseAccumulatorApplyGradient")(common_shapes.no_outputs) -ops.RegisterShape("SparseAccumulatorTakeGradient")(common_shapes.unknown_shape) +ops.RegisterShape("AccumulatorNumAccumulated")(common_shapes.call_cpp_shape_fn) +ops.RegisterShape("AccumulatorSetGlobalStep")(common_shapes.call_cpp_shape_fn) +ops.RegisterShape("ConditionalAccumulator")(common_shapes.call_cpp_shape_fn) +ops.RegisterShape("AccumulatorApplyGradient")(common_shapes.call_cpp_shape_fn) +ops.RegisterShape("AccumulatorTakeGradient")(common_shapes.call_cpp_shape_fn) +ops.RegisterShape("SparseConditionalAccumulator")( + common_shapes.call_cpp_shape_fn) +ops.RegisterShape("SparseAccumulatorApplyGradient")( + common_shapes.call_cpp_shape_fn) +ops.RegisterShape("SparseAccumulatorTakeGradient")( + common_shapes.call_cpp_shape_fn) diff --git a/tensorflow/python/ops/functional_ops.py b/tensorflow/python/ops/functional_ops.py index d765989f497..8ef05b03344 100644 --- a/tensorflow/python/ops/functional_ops.py +++ b/tensorflow/python/ops/functional_ops.py @@ -234,6 +234,22 @@ def map_fn(fn, elems, dtype=None, parallel_iterations=10, back_prop=True, the `dtype` parameter is not optional: `dtype` must be a type or (possibly nested) tuple of types matching the output of `fn`. + To apply a functional operation to the nonzero elements of a SparseTensor + one of the following methods is recommended. 
First, if the function is + expressible as TensorFlow ops, use + + ```python + result = SparseTensor(input.indices, fn(input.values), input.shape) + ``` + + If, however, the function is not expressible as a TensorFlow op, then use + + ```python + result = SparseTensor(input.indices, map_fn(fn, input.values), input.shape) + ``` + + instead. + Args: fn: The callable to be performed. It accepts one argument, which will have the same (possibly nested) structure as `elems`. Its output @@ -259,7 +275,7 @@ def map_fn(fn, elems, dtype=None, parallel_iterations=10, back_prop=True, Raises: TypeError: if `fn` is not callable or the structure of the output of - `fn` and `dtype` do not match. + `fn` and `dtype` do not match, or if elems is a SparseTensor. ValueError: if the lengths of the output of `fn` and `dtype` do not match. Examples: @@ -285,6 +301,12 @@ def map_fn(fn, elems, dtype=None, parallel_iterations=10, back_prop=True, if not callable(fn): raise TypeError("fn must be callable.") + if isinstance(elems, ops.SparseTensor): + raise TypeError( + "To perform a map on the values of a sparse tensor use either " + " SparseTensor(input.indices, fn(input.values), input.shape) or " + " SparseTensor(input.indices, map_fn(fn, input.values), input.shape)") + input_is_sequence = nest.is_sequence(elems) input_flatten = lambda x: nest.flatten(x) if input_is_sequence else [x] def input_pack(x): diff --git a/tensorflow/python/ops/image_ops.py b/tensorflow/python/ops/image_ops.py index 451b3e5bf09..2836fbabdc0 100644 --- a/tensorflow/python/ops/image_ops.py +++ b/tensorflow/python/ops/image_ops.py @@ -152,7 +152,7 @@ type and representation (RGB or HSV). @@adjust_saturation @@random_saturation -@@per_image_whitening +@@per_image_standardization ## Working with Bounding Boxes @@ -827,7 +827,7 @@ def resize_images(images, return images -def per_image_whitening(image): +def per_image_standardization(image): """Linearly scales `image` to have zero mean and unit norm. 
This op computes `(x - mean) / adjusted_stddev`, where `mean` is the average @@ -837,16 +837,11 @@ def per_image_whitening(image): `stddev` is the standard deviation of all values in `image`. It is capped away from zero to protect against division by 0 when handling uniform images. - Note that this implementation is limited: - - * It only whitens based on the statistics of an individual image. - * It does not take into account the covariance structure. - Args: image: 3-D tensor of shape `[height, width, channels]`. Returns: - The whitened image with same shape as `image`. + The standardized image with same shape as `image`. Raises: ValueError: if the shape of 'image' is incompatible with this function. @@ -873,6 +868,11 @@ def per_image_whitening(image): return image +# TODO(skye): remove once users switch to per_image_standardization() +def per_image_whitening(image): + return per_image_standardization(image) + + def random_brightness(image, max_delta, seed=None): """Adjust the brightness of images by a random factor. @@ -1380,3 +1380,6 @@ ops.RegisterShape('NonMaxSuppression')(common_shapes.call_cpp_shape_fn) __all__ = make_all(__name__) # ResizeMethod is not documented, but is documented in functions that use it. 
__all__.append('ResizeMethod') +# TODO(skye): per_image_whitening() will be removed once all callers switch to +# per_image_standardization() +__all__.append('per_image_whitening') diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index 215731a5cb8..1a34634cf28 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -2021,3 +2021,5 @@ def reduced_shape(input_shape, axes): ops.RegisterShape("QuantizedMatMul")(common_shapes.call_cpp_shape_fn) +ops.RegisterShape("Requantize")(common_shapes.call_cpp_shape_fn) +ops.RegisterShape("RequantizationRange")(common_shapes.call_cpp_shape_fn) diff --git a/tensorflow/python/ops/state_ops.py b/tensorflow/python/ops/state_ops.py index f869301873f..636acc3e2ad 100644 --- a/tensorflow/python/ops/state_ops.py +++ b/tensorflow/python/ops/state_ops.py @@ -116,6 +116,7 @@ from __future__ import print_function from tensorflow.python.framework import common_shapes from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import gen_state_ops # go/tf-wildcard-import # pylint: disable=wildcard-import @@ -146,6 +147,8 @@ def variable_op(shape, dtype, name="Variable", set_shape=True, container="", Returns: A variable tensor. 
""" + if not set_shape: + shape = tensor_shape.unknown_shape() ret = gen_state_ops._variable(shape=shape, dtype=dtype, name=name, container=container, shared_name=shared_name) # TODO(mrry): Move this to where it is used, so we can get rid of this op diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py index b03a49988c4..05f780ccaac 100644 --- a/tensorflow/python/ops/variables.py +++ b/tensorflow/python/ops/variables.py @@ -20,6 +20,7 @@ from __future__ import print_function from tensorflow.core.framework import variable_pb2 from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops @@ -280,6 +281,7 @@ class Variable(object): "or set. Got %s of type %s" % (collections, type(collections))) if trainable and ops.GraphKeys.TRAINABLE_VARIABLES not in collections: collections = list(collections) + [ops.GraphKeys.TRAINABLE_VARIABLES] + expected_shape = tensor_shape.as_shape(expected_shape) with ops.control_dependencies(None): with ops.name_scope(name, "Variable", [] if init_from_fn else [initial_value]) as name: @@ -287,6 +289,13 @@ class Variable(object): # Get the initial value from a callable function. The real shape of the # variable will be set later, since under the init_from_fn case, the # shape won't be known until after the function is invoked. + # + # NOTE: The current Variable OpKernel does not support + # partially defined shapes, so we only set the shape if it is + # fully defined. For historical reasons, we use the scalar + # shape (`[]`) to represent an unknown or partially known + # shape. A future version of the Variable ops will remove this + # limitation. 
def full_shape_to_list(shape): """Returns shape as a list if shape is fully defined.""" if shape and shape.is_fully_defined(): @@ -302,8 +311,10 @@ class Variable(object): if init_from_fn: expected_shape_list = full_shape_to_list(expected_shape) + set_shape = validate_shape and expected_shape.is_fully_defined() self._variable = state_ops.variable_op( - expected_shape_list, dtype.base_dtype, set_shape=False, name=name) + expected_shape_list, dtype.base_dtype, set_shape=set_shape, + name=name) with ops.colocate_with(self._variable.op): with ops.name_scope("Initializer"): # Colocate the tensors created by the initial_value() function @@ -317,12 +328,14 @@ class Variable(object): self._initial_value = ops.convert_to_tensor( initial_value, name="initial_value", dtype=dtype) assert_expected_shape() + set_shape = (validate_shape + and self._initial_value.get_shape().is_fully_defined()) # In this case, the variable op can't be created until after the # initial_value has been converted to a Tensor with a known type. self._variable = state_ops.variable_op( full_shape_to_list(self._initial_value.get_shape()), self._initial_value.dtype.base_dtype, - set_shape=False, + set_shape=set_shape, name=name) # Manually overrides the variable's shape with the initial value's. @@ -976,13 +989,8 @@ class PartitionedVariable(object): Returns: `Tensor` containing the concatenated value. """ - if self._as_tensor is None: - # Be sure to cache the concatenated tensor to not do extraneous - # computations. 
- with ops.control_dependencies(None): - self._as_tensor = self._concat() - - return self._as_tensor + with ops.control_dependencies(None): + return self._concat() @staticmethod def _TensorConversionFunction(v, dtype=None, name=None, as_ref=False): diff --git a/tensorflow/python/platform/app.py b/tensorflow/python/platform/app.py index b82a6987eca..bd58db7b45d 100644 --- a/tensorflow/python/platform/app.py +++ b/tensorflow/python/platform/app.py @@ -23,10 +23,21 @@ import sys from tensorflow.python.platform import flags -def run(main=None): +def run(main=None, argv=None): + """Runs the program with an optional 'main' function and 'argv' list.""" f = flags.FLAGS + + # Extract the args from the optional `argv` list. + args = argv[1:] if argv else None + + # Parse the known flags from that list, or from the command + # line otherwise. # pylint: disable=protected-access - flags_passthrough = f._parse_flags() + flags_passthrough = f._parse_flags(args=args) # pylint: enable=protected-access + main = main or sys.modules['__main__'].main + + # Call the main function, passing through any arguments + # to the final program. 
sys.exit(main(sys.argv[:1] + flags_passthrough)) diff --git a/tensorflow/python/platform/flags.py b/tensorflow/python/platform/flags.py index 0522f76b9c3..3e417ab3213 100644 --- a/tensorflow/python/platform/flags.py +++ b/tensorflow/python/platform/flags.py @@ -31,8 +31,8 @@ class _FlagValues(object): self.__dict__['__flags'] = {} self.__dict__['__parsed'] = False - def _parse_flags(self): - result, unparsed = _global_parser.parse_known_args() + def _parse_flags(self, args=None): + result, unparsed = _global_parser.parse_known_args(args=args) for flag_name, val in vars(result).items(): self.__dict__['__flags'][flag_name] = val self.__dict__['__parsed'] = True diff --git a/tensorflow/python/platform/flags_test.py b/tensorflow/python/platform/flags_test.py index d2b7da7ad25..0dbaafd1fab 100644 --- a/tensorflow/python/platform/flags_test.py +++ b/tensorflow/python/platform/flags_test.py @@ -12,20 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== - """Tests for our flags implementation.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function -import argparse import sys import unittest from tensorflow.python.platform import app from tensorflow.python.platform import flags - flags.DEFINE_string("string_foo", "default_val", "HelpString") flags.DEFINE_integer("int_foo", 42, "HelpString") flags.DEFINE_float("float_foo", 42.0, "HelpString") @@ -40,6 +37,7 @@ flags.DEFINE_bool("bool_e", True, "HelpString") FLAGS = flags.FLAGS + class FlagsTest(unittest.TestCase): def testString(self): @@ -82,17 +80,7 @@ class FlagsTest(unittest.TestCase): self.assertEqual(-1.0, FLAGS.float_foo) -def main(argv): - # Test that argparse can parse flags that aren't registered - # with tf.flags. 
- parser = argparse.ArgumentParser() - parser.add_argument("--argparse_val", type=int, default=1000, - help="Test flag") - argparse_flags, _ = parser.parse_known_args(argv) - if argparse_flags.argparse_val != 10: - raise ValueError("argparse flag was not parsed: got %d", - argparse_flags.argparse_val) - +def main(_): # unittest.main() tries to interpret the unknown flags, so use the # direct functions instead. runner = unittest.TextTestRunner() @@ -102,9 +90,9 @@ def main(argv): if __name__ == "__main__": # Test command lines - sys.argv.extend(["--bool_a", "--nobool_negation", - "--bool_c=True", "--bool_d=False", - "--unknown_flag", "--argparse_val=10", - "and_argument"]) + sys.argv.extend([ + "--bool_a", "--nobool_negation", "--bool_c=True", "--bool_d=False", + "and_argument" + ]) app.run() diff --git a/tensorflow/python/saved_model/builder.py b/tensorflow/python/saved_model/builder.py index fcca5aa5e85..43b97cf70c6 100644 --- a/tensorflow/python/saved_model/builder.py +++ b/tensorflow/python/saved_model/builder.py @@ -86,8 +86,12 @@ class SavedModelBuilder(object): constants.SAVED_MODEL_SCHEMA_VERSION) self._export_dir = export_dir - if not file_io.file_exists(export_dir): - file_io.recursive_create_dir(self._export_dir) + if file_io.file_exists(export_dir): + raise AssertionError( + "Export directory already exists. Please specify a different export " + "directory.") + + file_io.recursive_create_dir(self._export_dir) # Boolean to track whether variables and assets corresponding to the # SavedModel have been saved. Specifically, the first meta graph to be added @@ -163,8 +167,12 @@ class SavedModelBuilder(object): asset_destination_filepath = os.path.join( compat.as_bytes(assets_destination_dir), compat.as_bytes(asset_source_filename)) - file_io.copy( - asset_source_filepath, asset_destination_filepath, overwrite=True) + + # Only copy the asset file to the destination if it does not already + # exist. 
This is to ensure that an asset with the same name defined as + # part of multiple graphs is only copied the first time. + if not file_io.file_exists(asset_destination_filepath): + file_io.copy(asset_source_filepath, asset_destination_filepath) tf_logging.info("Assets written to: %s", assets_destination_dir) @@ -271,8 +279,8 @@ class SavedModelBuilder(object): "Variables and assets have not been saved yet. " "Please invoke `add_meta_graph_and_variables()` first.") - # Save asset files, if any. - self._maybe_save_assets(assets_collection) + # Save asset files and write them to disk, if any. + self._save_and_write_assets(assets_collection) # Add legacy init op to the SavedModel. self._maybe_add_legacy_init_op(legacy_init_op) diff --git a/tensorflow/python/saved_model/example/saved_model_half_plus_two.py b/tensorflow/python/saved_model/example/saved_model_half_plus_two.py index 9ba37e42fae..7c25a7ec1ef 100644 --- a/tensorflow/python/saved_model/example/saved_model_half_plus_two.py +++ b/tensorflow/python/saved_model/example/saved_model_half_plus_two.py @@ -97,6 +97,12 @@ def _generate_saved_model_for_half_plus_two(export_dir, as_text=False): # Set up the assets collection. assets_filepath = tf.constant(original_assets_filepath) tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, assets_filepath) + filename_tensor = tf.Variable( + original_assets_filename, + name="filename_tensor", + trainable=False, + collections=[]) + assign_filename_op = filename_tensor.assign(original_assets_filename) # Set up the signature for regression with input and output tensor # specification. 
@@ -118,7 +124,8 @@ def _generate_saved_model_for_half_plus_two(export_dir, as_text=False): signature_def_map={ signature_constants.REGRESS_METHOD_NAME: signature_def }, - assets_collection=tf.get_collection(tf.GraphKeys.ASSET_FILEPATHS)) + assets_collection=tf.get_collection(tf.GraphKeys.ASSET_FILEPATHS), + legacy_init_op=tf.group(assign_filename_op)) builder.save(as_text) diff --git a/tensorflow/python/saved_model/saved_model_test.py b/tensorflow/python/saved_model/saved_model_test.py index ff6e86a2092..a50620e113c 100644 --- a/tensorflow/python/saved_model/saved_model_test.py +++ b/tensorflow/python/saved_model/saved_model_test.py @@ -38,6 +38,39 @@ def tearDownModule(): class SavedModelTest(tf.test.TestCase): + def _init_and_validate_variable(self, sess, variable_name, variable_value): + v = tf.Variable(variable_value, name=variable_name) + sess.run(tf.initialize_all_variables()) + self.assertEqual(variable_value, v.eval()) + + def _build_asset_collection(self, asset_file_name, asset_file_contents, + asset_file_tensor_name): + asset_filepath = os.path.join( + compat.as_bytes(tf.test.get_temp_dir()), + compat.as_bytes(asset_file_name)) + file_io.write_string_to_file(asset_filepath, asset_file_contents) + asset_file_tensor = tf.constant(asset_filepath, name=asset_file_tensor_name) + tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, asset_file_tensor) + asset_collection = tf.get_collection(tf.GraphKeys.ASSET_FILEPATHS) + return asset_collection + + def _validate_asset_collection(self, export_dir, graph_collection_def, + expected_asset_file_name, + expected_asset_file_contents, + expected_asset_tensor_name): + assets_any = graph_collection_def[constants.ASSETS_KEY].any_list.value + asset = meta_graph_pb2.AssetFileDef() + assets_any[0].Unpack(asset) + assets_path = os.path.join( + compat.as_bytes(export_dir), + compat.as_bytes(constants.ASSETS_DIRECTORY), + compat.as_bytes(expected_asset_file_name)) + actual_asset_contents = file_io.read_file_to_string(assets_path) 
+ self.assertEqual(expected_asset_file_contents, + compat.as_text(actual_asset_contents)) + self.assertEqual(expected_asset_file_name, asset.filename) + self.assertEqual(expected_asset_tensor_name, asset.tensor_info.name) + def testSequence(self): export_dir = os.path.join(tf.test.get_temp_dir(), "test_sequence") builder = saved_model_builder.SavedModelBuilder(export_dir) @@ -50,9 +83,7 @@ class SavedModelTest(tf.test.TestCase): # Expect an assertion error for multiple calls of # add_meta_graph_and_variables() since weights should be saved exactly once. with self.test_session(graph=tf.Graph()) as sess: - v = tf.Variable(42, name="v") - sess.run(tf.initialize_all_variables()) - self.assertEqual(42, v.eval()) + self._init_and_validate_variable(sess, "v", 42) builder.add_meta_graph_and_variables(sess, ["bar"]) self.assertRaises(AssertionError, builder.add_meta_graph_and_variables, sess, ["baz"]) @@ -65,27 +96,21 @@ class SavedModelTest(tf.test.TestCase): # - add with weights. # - a single tag (from predefined constants). with self.test_session(graph=tf.Graph()) as sess: - v = tf.Variable(42, name="v") - sess.run(tf.initialize_all_variables()) - self.assertEqual(42, v.eval()) + self._init_and_validate_variable(sess, "v", 42) builder.add_meta_graph_and_variables(sess, [tag_constants.TRAINING]) # Graph that updates the single variable. SavedModel invoked to: # - simply add the model (weights are not updated). # - a single tag (from predefined constants). with self.test_session(graph=tf.Graph()) as sess: - v = tf.Variable(43, name="v") - sess.run(tf.initialize_all_variables()) - self.assertEqual(43, v.eval()) + self._init_and_validate_variable(sess, "v", 43) builder.add_meta_graph([tag_constants.SERVING]) # Graph that updates the single variable. SavedModel is invoked: # - to add the model (weights are not updated). # - multiple custom tags. 
with self.test_session(graph=tf.Graph()) as sess: - v = tf.Variable(44, name="v") - sess.run(tf.initialize_all_variables()) - self.assertEqual(44, v.eval()) + self._init_and_validate_variable(sess, "v", 44) builder.add_meta_graph(["foo", "bar"]) # Save the SavedModel to disk. @@ -128,29 +153,22 @@ class SavedModelTest(tf.test.TestCase): # Graph with two variables. SavedModel invoked to: # - add with weights. with self.test_session(graph=tf.Graph()) as sess: - v1 = tf.Variable(1, name="v1") - v2 = tf.Variable(2, name="v2") - sess.run(tf.initialize_all_variables()) - self.assertEqual(1, v1.eval()) - self.assertEqual(2, v2.eval()) + self._init_and_validate_variable(sess, "v1", 1) + self._init_and_validate_variable(sess, "v2", 2) builder.add_meta_graph_and_variables(sess, ["foo"]) # Graph with a single variable (subset of the variables from the previous # graph whose weights were saved). SavedModel invoked to: # - simply add the model (weights are not updated). with self.test_session(graph=tf.Graph()) as sess: - v2 = tf.Variable(3, name="v2") - sess.run(tf.initialize_all_variables()) - self.assertEqual(3, v2.eval()) + self._init_and_validate_variable(sess, "v2", 3) builder.add_meta_graph(["bar"]) # Graph with a single variable (disjoint set of variables from the previous # graph whose weights were saved). SavedModel invoked to: # - simply add the model (weights are not updated). with self.test_session(graph=tf.Graph()) as sess: - v3 = tf.Variable(4, name="v3") - sess.run(tf.initialize_all_variables()) - self.assertEqual(4, v3.eval()) + self._init_and_validate_variable(sess, "v3", 4) builder.add_meta_graph(["baz"]) # Save the SavedModel to disk. 
@@ -180,6 +198,29 @@ class SavedModelTest(tf.test.TestCase): self.assertRaises(errors.NotFoundError, loader.load, sess, ["baz"], export_dir) + def testNoOverwrite(self): + export_dir = os.path.join(tf.test.get_temp_dir(), "test_no_overwrite") + builder = saved_model_builder.SavedModelBuilder(export_dir) + + # Graph with a single variable. SavedModel invoked to: + # - add with weights. + with self.test_session(graph=tf.Graph()) as sess: + self._init_and_validate_variable(sess, "v", 42) + builder.add_meta_graph_and_variables(sess, ["foo"]) + + # Save the SavedModel to disk in text format. + builder.save(as_text=True) + + # Restore the graph with tag "foo", whose variables were saved. + with self.test_session(graph=tf.Graph()) as sess: + loader.load(sess, ["foo"], export_dir) + self.assertEqual(42, tf.get_collection(tf.GraphKeys.VARIABLES)[0].eval()) + + # An attempt to create another builder with the same export directory should + # result in an assertion error. + self.assertRaises(AssertionError, saved_model_builder.SavedModelBuilder, + export_dir) + def testSaveAsText(self): export_dir = os.path.join(tf.test.get_temp_dir(), "test_astext") builder = saved_model_builder.SavedModelBuilder(export_dir) @@ -187,17 +228,13 @@ class SavedModelTest(tf.test.TestCase): # Graph with a single variable. SavedModel invoked to: # - add with weights. with self.test_session(graph=tf.Graph()) as sess: - v = tf.Variable(42, name="v") - sess.run(tf.initialize_all_variables()) - self.assertEqual(42, v.eval()) + self._init_and_validate_variable(sess, "v", 42) builder.add_meta_graph_and_variables(sess, ["foo"]) # Graph with the same single variable. SavedModel invoked to: # - simply add the model (weights are not updated). 
with self.test_session(graph=tf.Graph()) as sess: - v = tf.Variable(43, name="v") - sess.run(tf.initialize_all_variables()) - self.assertEqual(43, v.eval()) + self._init_and_validate_variable(sess, "v", 43) builder.add_meta_graph(["bar"]) # Save the SavedModel to disk in text format. @@ -270,9 +307,7 @@ class SavedModelTest(tf.test.TestCase): # Graph with a single variable and a single entry in the signature def map. # SavedModel is invoked to add with weights. with self.test_session(graph=tf.Graph()) as sess: - v = tf.Variable(42, name="v") - sess.run(tf.initialize_all_variables()) - self.assertEqual(42, v.eval()) + self._init_and_validate_variable(sess, "v", 42) # Build and populate an empty SignatureDef for testing. foo_signature = utils.build_signature_def(dict(), dict(), "foo") builder.add_meta_graph_and_variables( @@ -281,10 +316,7 @@ class SavedModelTest(tf.test.TestCase): # Graph with the same single variable and multiple entries in the signature # def map. No weights are saved by SavedModel. with self.test_session(graph=tf.Graph()) as sess: - v = tf.Variable(43, name="v") - sess.run(tf.initialize_all_variables()) - self.assertEqual(43, v.eval()) - + self._init_and_validate_variable(sess, "v", 43) # Build and populate a different SignatureDef for testing. bar_signature = utils.build_signature_def(dict(), dict(), "bar") # Also, build a different SignatureDef corresponding to "foo_key" defined @@ -325,24 +357,17 @@ class SavedModelTest(tf.test.TestCase): builder = saved_model_builder.SavedModelBuilder(export_dir) with self.test_session(graph=tf.Graph()) as sess: - v = tf.Variable(42, name="v") - sess.run(tf.initialize_all_variables()) - self.assertEqual(42, v.eval()) + self._init_and_validate_variable(sess, "v", 42) # Build an asset collection. 
- asset_filepath = os.path.join( - compat.as_bytes(tf.test.get_temp_dir()), - compat.as_bytes("hello42.txt")) - file_io.write_string_to_file(asset_filepath, "foo bar baz") - asset_file_tensor = tf.constant(asset_filepath, name="asset_file_tensor") - tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, asset_file_tensor) - ignored_filepath = os.path.join( compat.as_bytes(tf.test.get_temp_dir()), compat.as_bytes("ignored.txt")) file_io.write_string_to_file(ignored_filepath, "will be ignored") - asset_collection = tf.get_collection(tf.GraphKeys.ASSET_FILEPATHS) + asset_collection = self._build_asset_collection("hello42.txt", + "foo bar baz", + "asset_file_tensor") builder.add_meta_graph_and_variables( sess, ["foo"], assets_collection=asset_collection) @@ -352,21 +377,9 @@ class SavedModelTest(tf.test.TestCase): with self.test_session(graph=tf.Graph()) as sess: foo_graph = loader.load(sess, ["foo"], export_dir) - - # Validate the assets. - collection_def = foo_graph.collection_def - assets_any = collection_def[constants.ASSETS_KEY].any_list.value - self.assertEqual(len(assets_any), 1) - asset = meta_graph_pb2.AssetFileDef() - assets_any[0].Unpack(asset) - assets_path = os.path.join( - compat.as_bytes(export_dir), - compat.as_bytes(constants.ASSETS_DIRECTORY), - compat.as_bytes("hello42.txt")) - asset_contents = file_io.read_file_to_string(assets_path) - self.assertEqual("foo bar baz", compat.as_text(asset_contents)) - self.assertEqual("hello42.txt", asset.filename) - self.assertEqual("asset_file_tensor:0", asset.tensor_info.name) + self._validate_asset_collection(export_dir, foo_graph.collection_def, + "hello42.txt", "foo bar baz", + "asset_file_tensor:0") ignored_asset_path = os.path.join( compat.as_bytes(export_dir), compat.as_bytes(constants.ASSETS_DIRECTORY), @@ -407,6 +420,96 @@ class SavedModelTest(tf.test.TestCase): # the legacy_init_op, following a restore. 
self.assertEqual(3, tf.get_collection("v")[2].eval()) + def testMultipleAssets(self): + export_dir = os.path.join(tf.test.get_temp_dir(), "test_multiple_assets") + builder = saved_model_builder.SavedModelBuilder(export_dir) + + with self.test_session(graph=tf.Graph()) as sess: + self._init_and_validate_variable(sess, "v", 42) + + # Build an asset collection specific to `foo` graph. + asset_collection = self._build_asset_collection("foo.txt", "content_foo", + "asset_file_tensor") + + # Add the asset collection as part of the graph with tag "foo". + builder.add_meta_graph_and_variables( + sess, ["foo"], assets_collection=asset_collection) + + with self.test_session(graph=tf.Graph()) as sess: + self._init_and_validate_variable(sess, "v", 42) + + # Build an asset collection specific to `bar` graph. + asset_collection = self._build_asset_collection("bar.txt", "content_bar", + "asset_file_tensor") + + # Add the asset collection as part of the graph with tag "bar". + builder.add_meta_graph(["bar"], assets_collection=asset_collection) + + # Save the SavedModel to disk. + builder.save() + + # Check assets restored for graph with tag "foo". + with self.test_session(graph=tf.Graph()) as sess: + foo_graph = loader.load(sess, ["foo"], export_dir) + self._validate_asset_collection(export_dir, foo_graph.collection_def, + "foo.txt", "content_foo", + "asset_file_tensor:0") + + # Check assets restored for graph with tag "bar". 
+ with self.test_session(graph=tf.Graph()) as sess: + bar_graph = loader.load(sess, ["bar"], export_dir) + self._validate_asset_collection(export_dir, bar_graph.collection_def, + "bar.txt", "content_bar", + "asset_file_tensor:0") + + def testDuplicateAssets(self): + export_dir = os.path.join(tf.test.get_temp_dir(), "test_duplicate_assets") + builder = saved_model_builder.SavedModelBuilder(export_dir) + + with self.test_session(graph=tf.Graph()) as sess: + self._init_and_validate_variable(sess, "v", 42) + + # Build an asset collection with `foo.txt` that has `foo` specific + # content. + asset_collection = self._build_asset_collection("foo.txt", "content_foo", + "asset_file_tensor") + + # Add the asset collection as part of the graph with tag "foo". + builder.add_meta_graph_and_variables( + sess, ["foo"], assets_collection=asset_collection) + + with self.test_session(graph=tf.Graph()) as sess: + self._init_and_validate_variable(sess, "v", 42) + + # Build an asset collection with `foo.txt` that has `bar` specific + # content. + asset_collection = self._build_asset_collection("foo.txt", "content_bar", + "asset_file_tensor") + + # Add the asset collection as part of the graph with tag "bar". + builder.add_meta_graph(["bar"], assets_collection=asset_collection) + + # Save the SavedModel to disk. + builder.save() + + # Check assets restored for graph with tag "foo". + with self.test_session(graph=tf.Graph()) as sess: + foo_graph = loader.load(sess, ["foo"], export_dir) + self._validate_asset_collection(export_dir, foo_graph.collection_def, + "foo.txt", "content_foo", + "asset_file_tensor:0") + + # Check assets restored for graph with tag "bar". + with self.test_session(graph=tf.Graph()) as sess: + bar_graph = loader.load(sess, ["bar"], export_dir) + + # Validate the assets for `bar` graph. 
`foo.txt` should contain the + # original contents corresponding to `foo` graph since an asset with the + # same name across multiple graphs is only stored the first time + self._validate_asset_collection(export_dir, bar_graph.collection_def, + "foo.txt", "content_foo", + "asset_file_tensor:0") + def testOp(self): export_dir = os.path.join(tf.test.get_temp_dir(), "test_op") builder = saved_model_builder.SavedModelBuilder(export_dir) diff --git a/tensorflow/python/summary/event_accumulator.py b/tensorflow/python/summary/event_accumulator.py index a4bc93344cd..063f100b94f 100644 --- a/tensorflow/python/summary/event_accumulator.py +++ b/tensorflow/python/summary/event_accumulator.py @@ -31,7 +31,7 @@ from tensorflow.python.framework import tensor_util from tensorflow.python.platform import tf_logging as logging from tensorflow.python.summary import summary from tensorflow.python.summary.impl import directory_watcher -from tensorflow.python.summary.impl import io_wrapper +from tensorflow.python.summary.impl import event_file_loader from tensorflow.python.summary.impl import reservoir from tensorflow.python.util import compat @@ -664,10 +664,10 @@ def _GetPurgeMessage(most_recent_step, most_recent_wall_time, event_step, def _GeneratorFromPath(path): """Create an event generator for file or directory at given path string.""" if IsTensorFlowEventsFile(path): - return io_wrapper.CreateFileLoader(path) + return event_file_loader.EventFileLoader(path) else: - return directory_watcher.DirectoryWatcher(path, io_wrapper.CreateFileLoader, - IsTensorFlowEventsFile) + return directory_watcher.DirectoryWatcher( + path, event_file_loader.EventFileLoader, IsTensorFlowEventsFile) def _ParseFileVersion(file_version): diff --git a/tensorflow/python/summary/event_multiplexer.py b/tensorflow/python/summary/event_multiplexer.py index 85de6350d27..d3a14804d34 100644 --- a/tensorflow/python/summary/event_multiplexer.py +++ b/tensorflow/python/summary/event_multiplexer.py @@ -23,6 +23,7 @@ 
import threading import six +from tensorflow.python.platform import gfile from tensorflow.python.platform import tf_logging as logging from tensorflow.python.summary import event_accumulator from tensorflow.python.summary.impl import directory_watcher @@ -386,7 +387,7 @@ class EventMultiplexer(object): def GetLogdirSubdirectories(path): """Returns subdirectories with event files on path.""" - if io_wrapper.Exists(path) and not io_wrapper.IsDirectory(path): + if gfile.Exists(path) and not gfile.IsDirectory(path): raise ValueError('GetLogdirSubdirectories: path exists and is not a ' 'directory, %s' % path) diff --git a/tensorflow/python/summary/impl/directory_watcher.py b/tensorflow/python/summary/impl/directory_watcher.py index 56a08b11eaf..799e01a8366 100644 --- a/tensorflow/python/summary/impl/directory_watcher.py +++ b/tensorflow/python/summary/impl/directory_watcher.py @@ -21,8 +21,8 @@ from __future__ import print_function import bisect from tensorflow.python.framework import errors +from tensorflow.python.platform import gfile from tensorflow.python.platform import tf_logging as logging -from tensorflow.python.summary.impl import gcs from tensorflow.python.summary.impl import io_wrapper @@ -88,7 +88,7 @@ class DirectoryWatcher(object): for event in self._LoadInternal(): yield event except errors.OpError: - if not io_wrapper.Exists(self._directory): + if not gfile.Exists(self._directory): raise DirectoryDeletedError( 'Directory %s has been permanently deleted' % self._directory) @@ -178,10 +178,10 @@ class DirectoryWatcher(object): path: The full path of the file to watch. """ old_path = self._path - if old_path and not gcs.IsGCSPath(old_path): + if old_path and not io_wrapper.IsGCSPath(old_path): try: # We're done with the path, so store its size. 
- size = io_wrapper.Size(old_path) + size = gfile.Stat(old_path).length logging.debug('Setting latest size of %s to %d', old_path, size) self._finalized_sizes[old_path] = size except errors.OpError as e: @@ -210,7 +210,7 @@ class DirectoryWatcher(object): # Don't bother checking if the paths are GCS (which we can't check) or if # we've already detected an OOO write. - if not gcs.IsGCSPath(paths[0]) and not self._ooo_writes_detected: + if not io_wrapper.IsGCSPath(paths[0]) and not self._ooo_writes_detected: # Check the previous _OOO_WRITE_CHECK_COUNT paths for out of order writes. current_path_index = bisect.bisect_left(paths, self._path) ooo_check_start = max(0, current_path_index - self._OOO_WRITE_CHECK_COUNT) @@ -230,7 +230,7 @@ class DirectoryWatcher(object): def _HasOOOWrite(self, path): """Returns whether the path has had an out-of-order write.""" # Check the sizes of each path before the current one. - size = io_wrapper.Size(path) + size = gfile.Stat(path).length old_size = self._finalized_sizes.get(path, None) if size != old_size: if old_size is None: diff --git a/tensorflow/python/summary/impl/directory_watcher_test.py b/tensorflow/python/summary/impl/directory_watcher_test.py index b4e5f03daec..b6ecc158493 100644 --- a/tensorflow/python/summary/impl/directory_watcher_test.py +++ b/tensorflow/python/summary/impl/directory_watcher_test.py @@ -23,6 +23,7 @@ import os import shutil from tensorflow.python.framework import test_util +from tensorflow.python.platform import gfile from tensorflow.python.platform import googletest from tensorflow.python.summary.impl import directory_watcher from tensorflow.python.summary.impl import io_wrapper @@ -193,10 +194,12 @@ class DirectoryWatcherTest(test_util.TensorFlowTestCase): FakeFactory.has_been_called = False - for stub_name in ['ListDirectoryAbsolute', 'ListRecursively', 'IsDirectory', - 'Exists', 'Size']: + for stub_name in ['ListDirectoryAbsolute', 'ListRecursively']: self.stubs.Set(io_wrapper, stub_name, 
FakeFactory(getattr(io_wrapper, stub_name))) + for stub_name in ['IsDirectory', 'Exists', 'Stat']: + self.stubs.Set(gfile, stub_name, + FakeFactory(getattr(gfile, stub_name))) with self.assertRaises((IOError, OSError)): self._LoadAllEvents() diff --git a/tensorflow/python/summary/impl/gcs.py b/tensorflow/python/summary/impl/gcs.py deleted file mode 100644 index cf2c61067f6..00000000000 --- a/tensorflow/python/summary/impl/gcs.py +++ /dev/null @@ -1,132 +0,0 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Functions for communicating with Google Cloud Storage.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import subprocess - -from tensorflow.python.platform import tf_logging as logging - -# All GCS paths should start with this. -PATH_PREFIX = 'gs://' - -# TODO(phurst): We should use the GCS Python API. - - -def CopyContents(gcs_path, byte_offset, local_file): - """Copies the contents of gcs_path from byte_offset onwards to local_file. - - Args: - gcs_path: The path to the GCS object. - byte_offset: The byte offset to start appending from. - local_file: The file object to write into. - - Raises: - ValueError: If offset is negative or gcs_path is not a valid GCS path. - CalledProcessError: If the gsutil command failed. 
- """ - if byte_offset < 0: - raise ValueError('byte_offset must not be negative') - command = ['gsutil', 'cat', '-r', '%d-' % byte_offset, gcs_path] - subprocess.check_call(command, stdout=local_file) - local_file.flush() - - -def ListDirectory(directory): - """Lists all files in the given directory.""" - command = ['gsutil', 'ls', directory] - return subprocess.check_output(command).splitlines() - - -def ListRecursively(top): - """Walks a directory tree, yielding (dir_path, file_paths) tuples. - - For each top |top| and its subdirectories, yields a tuple containing the path - to the directory and the path to each of the contained files. Note that - unlike os.Walk()/gfile.Walk(), this does not list subdirectories and the file - paths are all absolute. - - Args: - top: A path to a GCS directory. - Returns: - A list of (dir_path, file_paths) tuples. - - """ - if top.endswith('/'): - wildcard = top + '**' - else: - wildcard = top + '/**' - tuples = [] - try: - file_paths = ListDirectory(wildcard) - except subprocess.CalledProcessError as e: - logging.info('%s, assuming it means no files were found', e) - return [] - for file_path in file_paths: - dir_path = os.path.dirname(file_path) - if tuples and tuples[-1][0] == dir_path: - tuples[-1][1].append(file_path) - else: - tuples.append((dir_path, [file_path])) - return tuples - - -def IsDirectory(path): - """Returns true if path exists and is a directory.""" - path = path.rstrip('/') - try: - ls = ListDirectory(path) - except subprocess.CalledProcessError: - # Doesn't exist. - return False - if len(ls) == 1: - # Either it's a file (which ls-es as itself) or it's a dir with one file. 
- return ls[0] != path - else: - return True - - -def Exists(path): - """Returns true if path exists.""" - try: - ListDirectory(path) - return True - except subprocess.CalledProcessError: - return False - - -def IsGCSPath(path): - return path.startswith(PATH_PREFIX) - - -def CheckIsSupported(): - """Raises an OSError if the system isn't set up for Google Cloud Storage. - - Raises: - OSError: If the system hasn't been set up so that TensorBoard can access - Google Cloud Storage. The error's message contains installation - instructions. - """ - try: - subprocess.check_output(['gsutil', 'version']) - except OSError as e: - logging.error('Error while checking for gsutil: %s', e) - raise OSError( - 'Unable to execute the gsutil binary, which is required for Google ' - 'Cloud Storage support. You can find installation instructions at ' - 'https://goo.gl/sST520') diff --git a/tensorflow/python/summary/impl/gcs_file_loader.py b/tensorflow/python/summary/impl/gcs_file_loader.py deleted file mode 100644 index c46534dbb52..00000000000 --- a/tensorflow/python/summary/impl/gcs_file_loader.py +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Loads events from a file stored on Google Cloud Storage.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tempfile - -from tensorflow.core.util import event_pb2 -from tensorflow.python import pywrap_tensorflow -from tensorflow.python.framework import errors -from tensorflow.python.platform import app -from tensorflow.python.platform import tf_logging as logging -from tensorflow.python.summary.impl import gcs -from tensorflow.python.util import compat - - -class GCSFileLoader(object): - """A GCSFileLoader loads Event protos from a path to GCS storage. - - The GCSFileLoader keeps track of the offset in the file, copies the contents - of the file to local disk, reads it, and then immediately deletes the file. - """ - - def __init__(self, gcs_path): - if not gcs.IsGCSPath(gcs_path): - raise ValueError('A GCS path is required') - self._gcs_path = gcs_path - self._gcs_offset = 0 - - def Load(self): - # Create a temp file to hold the contents that we haven't seen yet. 
- with tempfile.NamedTemporaryFile(prefix='tf-gcs-') as temp_file: - name = temp_file.name - logging.debug('Temp file created at %s', name) - gcs.CopyContents(self._gcs_path, self._gcs_offset, temp_file) - with errors.raise_exception_on_not_ok_status() as status: - reader = pywrap_tensorflow.PyRecordReader_New( - compat.as_bytes(name), 0, compat.as_bytes(''), status) - while reader.GetNext(): - event = event_pb2.Event() - event.ParseFromString(reader.record()) - yield event - logging.debug('No more events in %s', name) - self._gcs_offset += reader.offset() - - -def main(argv): - if len(argv) != 2: - print('Usage: gcs_file_loader ') - return 1 - loader = GCSFileLoader(argv[1]) - for event in loader.Load(): - print(event) - - -if __name__ == '__main__': - app.run() diff --git a/tensorflow/python/summary/impl/gcs_file_loader_test.py b/tensorflow/python/summary/impl/gcs_file_loader_test.py deleted file mode 100644 index d35f3df4fc5..00000000000 --- a/tensorflow/python/summary/impl/gcs_file_loader_test.py +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf - -from tensorflow.python.platform import googletest -from tensorflow.python.summary.impl import gcs -from tensorflow.python.summary.impl import gcs_file_loader - - -class GCSFileLoaderTest(tf.test.TestCase): - - def setUp(self): - self._append_contents_call_count = 0 - # A record containing a simple event. - self._stubs = googletest.StubOutForTesting() - self._stubs.Set(gcs, 'CopyContents', self._MockCopyContents) - - def tearDown(self): - self._stubs.CleanUp() - - def testLoad(self): - loader = gcs_file_loader.GCSFileLoader('gs://some-fake-url') - events = list(loader.Load()) - self.assertEqual(len(events), 1) - self.assertEqual(events[0].file_version, 'brain.Event:1') - events = list(loader.Load()) - self.assertEqual(len(events), 1) - self.assertEqual(events[0].file_version, 'brain.Event:2') - events = list(loader.Load()) - self.assertEqual(len(events), 0) - self.assertEqual(self._append_contents_call_count, 3) - - # A couple of simple records. 
- MOCK_RECORDS = [ - b'\x18\x00\x00\x00\x00\x00\x00\x00\xa3\x7fK"\t\x00\x00\xc0%\xddu' - b'\xd5A\x1a\rbrain.Event:1\xec\xf32\x8d', - b'\x18\x00\x00\x00\x00\x00\x00\x00\xa3\x7fK"\t\x00\x00\x00\'\xe6' - b'\xb3\xd5A\x1a\rbrain.Event:2jM\x0b\x15' - ] - - def _MockCopyContents(self, gcs_path, offset, local_file): - if self._append_contents_call_count == 0: - self.assertEqual(offset, 0) - elif self._append_contents_call_count == 1: - self.assertEqual(offset, len(self.MOCK_RECORDS[0])) - else: - self.assertEqual(offset, - len(self.MOCK_RECORDS[0]) + len(self.MOCK_RECORDS[1])) - - if self._append_contents_call_count < len(self.MOCK_RECORDS): - local_file.write(self.MOCK_RECORDS[self._append_contents_call_count]) - local_file.flush() - self._append_contents_call_count += 1 - - -if __name__ == '__main__': - tf.test.main() diff --git a/tensorflow/python/summary/impl/io_wrapper.py b/tensorflow/python/summary/impl/io_wrapper.py index f7138833d6b..258fe8c804f 100644 --- a/tensorflow/python/summary/impl/io_wrapper.py +++ b/tensorflow/python/summary/impl/io_wrapper.py @@ -12,13 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Functions that wrap both gfile and gcs. - -This module is *not* intended to be a general-purpose IO wrapper library; it -only implements the operations that are necessary for loading event files. The -functions either dispatch to the gcs library or to gfile, depending on whether -the path is a GCS 'pseudo-path' (i.e., it satisfies gcs.IsGCSPath) or not. 
-""" +"""IO helper functions.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -26,32 +20,16 @@ from __future__ import print_function import os from tensorflow.python.platform import gfile -from tensorflow.python.summary.impl import event_file_loader -from tensorflow.python.summary.impl import gcs -from tensorflow.python.summary.impl import gcs_file_loader -def CreateFileLoader(path): - """Creates a file loader for the given path. - - Args: - path: A string representing either a normal path or a GCS - Returns: - An object with a Load() method that yields event_pb2.Event protos. - """ - if gcs.IsGCSPath(path): - return gcs_file_loader.GCSFileLoader(path) - else: - return event_file_loader.EventFileLoader(path) +def IsGCSPath(path): + return path.startswith("gs://") def ListDirectoryAbsolute(directory): """Yields all files in the given directory. The paths are absolute.""" - if gcs.IsGCSPath(directory): - return gcs.ListDirectory(directory) - else: - return (os.path.join(directory, path) - for path in gfile.ListDirectory(directory)) + return (os.path.join(directory, path) + for path in gfile.ListDirectory(directory)) def ListRecursively(top): @@ -69,33 +47,6 @@ def ListRecursively(top): Yields: A list of (dir_path, file_paths) tuples. """ - if gcs.IsGCSPath(top): - for x in gcs.ListRecursively(top): - yield x - else: - for dir_path, _, filenames in gfile.Walk(top): - yield (dir_path, (os.path.join(dir_path, filename) - for filename in filenames)) - - -def IsDirectory(path): - """Returns true if path exists and is a directory.""" - if gcs.IsGCSPath(path): - return gcs.IsDirectory(path) - else: - return gfile.IsDirectory(path) - - -def Exists(path): - if gcs.IsGCSPath(path): - return gcs.Exists(path) - else: - return gfile.Exists(path) - - -def Size(path): - """Returns the number of bytes in the given file. 
Doesn't work on GCS.""" - if gcs.IsGCSPath(path): - raise NotImplementedError("io_wrapper.Size doesn't support GCS paths") - else: - return gfile.Open(path).size() + for dir_path, _, filenames in gfile.Walk(top): + yield (dir_path, (os.path.join(dir_path, filename) + for filename in filenames)) diff --git a/tensorflow/python/summary/summary.py b/tensorflow/python/summary/summary.py index 5dbde1c5477..a6b348cc991 100644 --- a/tensorflow/python/summary/summary.py +++ b/tensorflow/python/summary/summary.py @@ -33,6 +33,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import re as _re + import six from google.protobuf import json_format as _json_format @@ -56,16 +58,22 @@ def _collect(val, collections, default_collections): _ops.add_to_collection(key, val) +_INVALID_TAG_CHARACTERS = _re.compile(r'[^-/\w\.]') + def _clean_tag(name): # In the past, the first argument to summary ops was a tag, which allowed - # spaces. Since now we pass in the name, spaces are disallowed; to ease the - # transition and support backwards compatbility, we will convert the spaces - # to underscores (and also warn about it). - if name is not None and ' ' in name: - _logging.warning( - 'Summary tag name %s contains spaces; replacing with underscores.' % - name) - name = name.replace(' ', '_') + # arbitrary characters. Now we are changing the first argument to be the node + # name. This has a number of advantages (users of summary ops now can + # take advantage of the tf name scope system) but risks breaking existing + # usage, because a much smaller set of characters are allowed in node names. + # This function replaces all illegal characters with _s, and logs a warning. + if name is not None: + new_name = _INVALID_TAG_CHARACTERS.sub('_', name) + if new_name != name: + _logging.warning( + 'Summary tag name %s has illegal chars; replacing with underscores.' 
% + name) + name = new_name return name diff --git a/tensorflow/python/summary/summary_test.py b/tensorflow/python/summary/summary_test.py index bd819bbdfed..8acdcb0906b 100644 --- a/tensorflow/python/summary/summary_test.py +++ b/tensorflow/python/summary/summary_test.py @@ -85,6 +85,9 @@ class ScalarSummaryTest(tf.test.TestCase): s = tf.summary.scalar('name with spaces', c) self.assertEqual(s.op.name, 'name_with_spaces') + s2 = tf.summary.scalar('name with many $#illegal^: characters!', c) + self.assertEqual(s2.op.name, 'name_with_many___illegal___characters_') + if __name__ == '__main__': tf.test.main() diff --git a/tensorflow/python/training/basic_session_run_hooks.py b/tensorflow/python/training/basic_session_run_hooks.py index da930f2bdb9..d986a7b4263 100644 --- a/tensorflow/python/training/basic_session_run_hooks.py +++ b/tensorflow/python/training/basic_session_run_hooks.py @@ -88,8 +88,28 @@ class _SecondOrStepTimer(object): return False def update_last_triggered_step(self, step): - self._last_triggered_time = time.time() + """Update the last triggered time and step number. + + Args: + step: The current step. + + Returns: + A pair `(elapsed_time, elapsed_steps)`, where `elapsed_time` is the number + of seconds between the current trigger and the last one (a float), and + `elapsed_steps` is the number of steps between the current trigger and + the last one. Both values will be set to `None` on the first trigger. 
+ """ + current_time = time.time() + if self._last_triggered_time is None: + elapsed_secs = None + elapsed_steps = None + else: + elapsed_secs = current_time - self._last_triggered_time + elapsed_steps = step - self._last_triggered_step + + self._last_triggered_time = current_time self._last_triggered_step = step + return (elapsed_secs, elapsed_steps) def last_triggered_step(self): return self._last_triggered_step @@ -272,16 +292,24 @@ class CheckpointSaverHook(session_run_hook.SessionRunHook): class StepCounterHook(session_run_hook.SessionRunHook): """Steps per second monitor.""" - def __init__(self, every_n_steps=100, output_dir=None, summary_writer=None): + def __init__(self, + every_n_steps=100, + every_n_secs=None, + output_dir=None, + summary_writer=None): self._summary_tag = "global_step/sec" - self._every_n_steps = every_n_steps + + if (every_n_steps is None) == (every_n_secs is None): + raise ValueError( + "exactly one of every_n_steps and every_n_secs should be provided.") + self._timer = _SecondOrStepTimer(every_steps=every_n_steps, + every_secs=every_n_secs) + self._summary_writer = summary_writer if summary_writer is None and output_dir: self._summary_writer = SummaryWriterCache.get(output_dir) def begin(self): - self._last_reported_time = None - self._last_reported_step = None self._global_step_tensor = training_util.get_global_step() if self._global_step_tensor is None: raise RuntimeError( @@ -294,22 +322,16 @@ class StepCounterHook(session_run_hook.SessionRunHook): _ = run_context global_step = run_values.results - current_time = time.time() - if self._last_reported_time is None: - self._last_reported_step = global_step - self._last_reported_time = current_time - else: - if global_step >= self._every_n_steps + self._last_reported_step: - added_steps = global_step - self._last_reported_step - elapsed_time = current_time - self._last_reported_time - steps_per_sec = added_steps / elapsed_time + if self._timer.should_trigger_for_step(global_step): + 
elapsed_time, elapsed_steps = self._timer.update_last_triggered_step( + global_step) + if elapsed_time is not None: + steps_per_sec = elapsed_steps / elapsed_time if self._summary_writer is not None: summary = Summary(value=[Summary.Value( tag=self._summary_tag, simple_value=steps_per_sec)]) self._summary_writer.add_summary(summary, global_step) logging.info("%s: %g", self._summary_tag, steps_per_sec) - self._last_reported_step = global_step - self._last_reported_time = current_time class NanLossDuringTrainingError(RuntimeError): diff --git a/tensorflow/python/training/basic_session_run_hooks_test.py b/tensorflow/python/training/basic_session_run_hooks_test.py index 77be27a4ff3..fbf0394c5a4 100644 --- a/tensorflow/python/training/basic_session_run_hooks_test.py +++ b/tensorflow/python/training/basic_session_run_hooks_test.py @@ -62,6 +62,21 @@ class SecondOrStepTimerTest(tf.test.TestCase): self.assertFalse(timer.should_trigger_for_step(3)) self.assertTrue(timer.should_trigger_for_step(4)) + def test_update_last_triggered_step(self): + timer = basic_session_run_hooks._SecondOrStepTimer(every_steps=1) + + elapsed_secs, elapsed_steps = timer.update_last_triggered_step(1) + self.assertEqual(None, elapsed_secs) + self.assertEqual(None, elapsed_steps) + + elapsed_secs, elapsed_steps = timer.update_last_triggered_step(5) + self.assertLess(0, elapsed_secs) + self.assertEqual(4, elapsed_steps) + + elapsed_secs, elapsed_steps = timer.update_last_triggered_step(7) + self.assertLess(0, elapsed_secs) + self.assertEqual(2, elapsed_steps) + class StopAtStepTest(tf.test.TestCase): @@ -297,7 +312,7 @@ class StepCounterHookTest(tf.test.TestCase): def tearDown(self): shutil.rmtree(self.log_dir, ignore_errors=True) - def test_step_counter(self): + def test_step_counter_every_n_steps(self): with tf.Graph().as_default() as g, tf.Session() as sess: global_step = tf.contrib.framework.get_or_create_global_step() train_op = tf.assign_add(global_step, 1) @@ -316,11 +331,41 @@ class 
StepCounterHookTest(tf.test.TestCase): expected_logdir=self.log_dir, expected_graph=g, expected_summaries={}) + self.assertItemsEqual([11, 21], summary_writer.summaries.keys()) for step in [11, 21]: summary_value = summary_writer.summaries[step][0].value[0] - self.assertTrue(summary_value.tag, 'global_step/sec') - # check at least 10 steps per sec is recorded. - self.assertGreater(summary_value.simple_value, 10) + self.assertEqual('global_step/sec', summary_value.tag) + self.assertGreater(summary_value.simple_value, 0) + + def test_step_counter_every_n_secs(self): + with tf.Graph().as_default() as g, tf.Session() as sess: + global_step = tf.contrib.framework.get_or_create_global_step() + train_op = tf.assign_add(global_step, 1) + summary_writer = testing.FakeSummaryWriter(self.log_dir, g) + hook = tf.train.StepCounterHook( + summary_writer=summary_writer, every_n_steps=None, every_n_secs=0.1) + + hook.begin() + sess.run(tf.initialize_all_variables()) + mon_sess = monitored_session._HookedSession(sess, [hook]) + mon_sess.run(train_op) + time.sleep(0.2) + mon_sess.run(train_op) + time.sleep(0.2) + mon_sess.run(train_op) + hook.end(sess) + + summary_writer.assert_summaries( + test_case=self, + expected_logdir=self.log_dir, + expected_graph=g, + expected_summaries={}) + self.assertTrue(summary_writer.summaries, 'No summaries were created.') + self.assertItemsEqual([2, 3], summary_writer.summaries.keys()) + for summary in summary_writer.summaries.values(): + summary_value = summary[0].value[0] + self.assertEqual('global_step/sec', summary_value.tag) + self.assertGreater(summary_value.simple_value, 0) class SummarySaverHookTest(tf.test.TestCase): diff --git a/tensorflow/python/training/input.py b/tensorflow/python/training/input.py index 2bdfb211608..c976f19775a 100644 --- a/tensorflow/python/training/input.py +++ b/tensorflow/python/training/input.py @@ -647,7 +647,6 @@ def batch(tensors, batch_size, num_threads=1, capacity=32, # TODO(josh11b,mrry): Switch to BatchQueue 
once it is written. queue = _which_queue(dynamic_pad)( capacity=capacity, dtypes=types, shapes=shapes, shared_name=shared_name) - print("Enqueueing: ", enqueue_many, tensor_list, shapes) _enqueue(queue, tensor_list, num_threads, enqueue_many) summary.scalar("queue/%s/fraction_of_%d_full" % (queue.name, capacity), math_ops.cast(queue.size(), dtypes.float32) * diff --git a/tensorflow/python/training/input_test.py b/tensorflow/python/training/input_test.py index 8cc15849398..07b8ac3ccf2 100644 --- a/tensorflow/python/training/input_test.py +++ b/tensorflow/python/training/input_test.py @@ -701,37 +701,37 @@ class BatchTest(tf.test.TestCase): def testBatchedSparseTensorInferredShape(self): sparse = tf.SparseTensor(indices=[[0]], values=[1.0], shape=[1]) - self.assertAllEqual(sparse.shape.get_shape().as_list(), [1]) + self.assertAllEqual((1,), sparse.shape.get_shape().as_list()) batched = tf.train.batch([sparse], batch_size=2) - self.assertAllEqual(batched.shape.get_shape().as_list(), [2]) + self.assertAllEqual((2,), batched.shape.get_shape().as_list()) def testBatchedSparseTensorInferredShapeEnqueueMany(self): sparse = tf.SparseTensor(indices=[[0]], values=[1.0], shape=[1]) - self.assertAllEqual(sparse.shape.get_shape().as_list(), [1]) + self.assertAllEqual((1,), sparse.shape.get_shape().as_list()) batched = tf.train.batch([sparse], batch_size=2, enqueue_many=True) - self.assertAllEqual(batched.shape.get_shape().as_list(), [1]) + self.assertAllEqual((1,), batched.shape.get_shape().as_list()) def testBatchedSparseTensorInferredShapeUnknownRank(self): sparse = tf.SparseTensor( indices=tf.placeholder(tf.int64), values=tf.placeholder(tf.float32), shape=tf.placeholder(tf.int64)) - self.assertIs(sparse.shape.get_shape().num_elements(), None) + self.assertIs(None, sparse.shape.get_shape().num_elements()) batched = tf.train.batch([sparse], batch_size=2) - self.assertIs(batched.shape.get_shape().num_elements(), None) + self.assertIs(None, 
batched.shape.get_shape().num_elements()) def testBatchedSparseTensorInferredShapeUnknownRankEnqueueMany(self): sparse = tf.SparseTensor( indices=tf.placeholder(tf.int64), values=tf.placeholder(tf.float32), shape=tf.placeholder(tf.int64)) - self.assertIs(sparse.shape.get_shape().num_elements(), None) + self.assertIs(None, sparse.shape.get_shape().num_elements()) batched = tf.train.batch([sparse], batch_size=2, enqueue_many=True) - self.assertIs(batched.shape.get_shape().num_elements(), None) + self.assertIs(None, batched.shape.get_shape().num_elements()) def testSingleElementDict(self): x = tf.train.batch({"c": [12, 12]}, batch_size=8) - self.assertEqual([8, 2], x["c"].get_shape().as_list()) + self.assertAllEqual((8, 2), x["c"].get_shape().as_list()) class BatchJoinTest(tf.test.TestCase): @@ -771,6 +771,17 @@ class BatchJoinTest(tf.test.TestCase): [ninety_nine, sparse_ninety_nine, "b"]], batch_size=batch_size) batched_fetch = batched + + # Shapes. + self.assertEqual(3, len(batched_fetch)) + self.assertAllEqual((batch_size,), batched_fetch[0].get_shape().as_list()) + self.assertAllEqual( + (None, 2), batched_fetch[1].indices.get_shape().as_list()) + self.assertAllEqual( + (None,), batched_fetch[1].values.get_shape().as_list()) + self.assertAllEqual((2,), batched_fetch[1].shape.get_shape().as_list()) + self.assertAllEqual((batch_size,), batched_fetch[2].get_shape().as_list()) + tf.initialize_all_variables().run() tf.initialize_local_variables().run() threads = tf.train.start_queue_runners() @@ -782,9 +793,9 @@ class BatchJoinTest(tf.test.TestCase): num_batches = (num_a + num_b) // batch_size for i in range(num_batches): results = sess.run(batched_fetch) - tf.logging.info("Batch %d: %s", i, results[0]) - self.assertEqual(len(results[0]), batch_size) - self.assertEqual(len(results[2]), batch_size) + self.assertEqual(3, len(results)) + self.assertEqual(batch_size, len(results[0])) + self.assertEqual(batch_size, len(results[2])) self.assertAllEqual(results[0], 
results[1].values) self.assertAllEqual( results[1].indices, @@ -846,6 +857,12 @@ class BatchJoinTest(tf.test.TestCase): [[counter, a], [ninety_nine, b]], batch_size=batch_size, dynamic_pad=True) + + # Shapes. + self.assertEqual(2, len(batched)) + self.assertAllEqual((batch_size,), batched[0].get_shape().as_list()) + self.assertAllEqual((batch_size, None), batched[1].get_shape().as_list()) + tf.initialize_all_variables().run() tf.initialize_local_variables().run() threads = tf.train.start_queue_runners() @@ -858,7 +875,7 @@ class BatchJoinTest(tf.test.TestCase): num_batches = (num_a + num_b) // batch_size for i in range(num_batches): results = sess.run(batched) - tf.logging.info("Batch %d: %s", i, results[0]) + self.assertEqual(2, len(results)) self.assertEqual(len(results[0]), batch_size) self.assertEqual(len(results[1]), batch_size) for s in results[1]: @@ -920,6 +937,14 @@ class BatchJoinTest(tf.test.TestCase): batch_size=batch_size, allow_smaller_final_batch=True) + # Shapes. + self.assertEqual(3, len(batched)) + self.assertAllEqual((None,), batched[0].get_shape().as_list()) + self.assertAllEqual((None, 2), batched[1].indices.get_shape().as_list()) + self.assertAllEqual((None,), batched[1].values.get_shape().as_list()) + self.assertAllEqual((2,), batched[1].shape.get_shape().as_list()) + self.assertAllEqual((None,), batched[2].get_shape().as_list()) + tf.initialize_all_variables().run() tf.initialize_local_variables().run() threads = tf.train.start_queue_runners() @@ -1003,6 +1028,12 @@ class BatchJoinTest(tf.test.TestCase): batch_size=batch_size, dynamic_pad=True, allow_smaller_final_batch=True) + + # Shapes. 
+ self.assertEqual(2, len(batched)) + self.assertAllEqual((None,), batched[0].get_shape().as_list()) + self.assertAllEqual((None, None), batched[1].get_shape().as_list()) + tf.initialize_all_variables().run() tf.initialize_local_variables().run() threads = tf.train.start_queue_runners() @@ -1075,6 +1106,11 @@ class BatchJoinTest(tf.test.TestCase): [[counter, "string"]], batch_size=batch_size, shared_name="SHARED_NAME_XYZ", name="Q") + # Shapes. + self.assertEqual(2, len(batched)) + self.assertAllEqual((batch_size,), batched[0].get_shape().as_list()) + self.assertAllEqual((batch_size,), batched[1].get_shape().as_list()) + self.assertProtoEquals( "s: 'SHARED_NAME_XYZ'", batched[0].op.inputs[0].op.node_def.attr["shared_name"]) @@ -1087,7 +1123,7 @@ class BatchJoinTest(tf.test.TestCase): def testSingleElementDict(self): x = tf.train.batch_join([{"c": [12, 12]}], batch_size=8) - self.assertEqual([8, 2], x["c"].get_shape().as_list()) + self.assertAllEqual((8, 2), x["c"].get_shape().as_list()) class ShuffleBatchTest(tf.test.TestCase): @@ -1356,6 +1392,16 @@ class ShuffleBatchJoinTest(tf.test.TestCase): min_after_dequeue=16, seed=223607) batched_fetch = batched + # Shapes. 
+ self.assertEqual(3, len(batched_fetch)) + self.assertAllEqual((batch_size,), batched_fetch[0].get_shape().as_list()) + self.assertAllEqual( + (None, 2), batched_fetch[1].indices.get_shape().as_list()) + self.assertAllEqual( + (None,), batched_fetch[1].values.get_shape().as_list()) + self.assertAllEqual((2,), batched_fetch[1].shape.get_shape().as_list()) + self.assertAllEqual((batch_size,), batched_fetch[2].get_shape().as_list()) + tf.initialize_all_variables().run() tf.initialize_local_variables().run() threads = tf.train.start_queue_runners() @@ -1367,7 +1413,7 @@ class ShuffleBatchJoinTest(tf.test.TestCase): num_batches = (num_a + num_b) // batch_size for i in range(num_batches): results = sess.run(batched_fetch) - tf.logging.info("Batch %d: %s", i, results[0]) + self.assertEqual(3, len(results)) self.assertEqual(len(results[0]), batch_size) self.assertEqual(len(results[2]), batch_size) self.assertAllEqual(results[0], results[1].values) @@ -1436,6 +1482,14 @@ class ShuffleBatchJoinTest(tf.test.TestCase): batch_size=batch_size, capacity=32, min_after_dequeue=16, seed=223607, allow_smaller_final_batch=True) + # Shapes. + self.assertEqual(3, len(batched)) + self.assertAllEqual((None,), batched[0].get_shape().as_list()) + self.assertAllEqual((None, 2), batched[1].indices.get_shape().as_list()) + self.assertAllEqual((None,), batched[1].values.get_shape().as_list()) + self.assertAllEqual((2,), batched[1].shape.get_shape().as_list()) + self.assertAllEqual((None,), batched[2].get_shape().as_list()) + tf.initialize_all_variables().run() tf.initialize_local_variables().run() threads = tf.train.start_queue_runners() @@ -1518,6 +1572,11 @@ class ShuffleBatchJoinTest(tf.test.TestCase): min_after_dequeue=10, shared_name="SHARED_NAME_XYZ", name="Q") + # Shapes. 
+ self.assertEqual(2, len(batched)) + self.assertAllEqual((batch_size,), batched[0].get_shape().as_list()) + self.assertAllEqual((batch_size,), batched[1].get_shape().as_list()) + self.assertProtoEquals( "s: 'SHARED_NAME_XYZ'", batched[0].op.inputs[0].op.node_def.attr["shared_name"]) diff --git a/tensorflow/python/training/saver.py b/tensorflow/python/training/saver.py index 0a08a5b5aac..9a331e69a79 100644 --- a/tensorflow/python/training/saver.py +++ b/tensorflow/python/training/saver.py @@ -1000,6 +1000,7 @@ class Saver(object): self.build() if self.saver_def: self._check_saver_def() + self._write_version = self.saver_def.version def build(self): """Builds saver_def.""" @@ -1461,8 +1462,8 @@ def latest_checkpoint(checkpoint_dir, latest_filename=None): return None -def import_meta_graph(meta_graph_or_file, import_scope=None, - **kwargs): +def import_meta_graph(meta_graph_or_file, clear_devices=False, + import_scope=None, **kwargs): """Recreates a Graph saved in a `MetaGraphDef` proto. This function takes a `MetaGraphDef` protocol buffer as input. If @@ -1516,6 +1517,8 @@ def import_meta_graph(meta_graph_or_file, import_scope=None, Args: meta_graph_or_file: `MetaGraphDef` protocol buffer or filename (including the path) containing a `MetaGraphDef`. + clear_devices: Whether or not to clear the device field for an `Operation` + or `Tensor` during import. import_scope: Optional `string`. Name scope to add. Only used when initializing from protocol buffer. **kwargs: Optional keyed arguments. 
@@ -1532,6 +1535,7 @@ def import_meta_graph(meta_graph_or_file, import_scope=None, meta_graph_def = meta_graph_or_file meta_graph.import_scoped_meta_graph(meta_graph_def, + clear_devices=clear_devices, import_scope=import_scope, **kwargs) if meta_graph_def.HasField("saver_def"): diff --git a/tensorflow/python/training/saver_large_variable_test.py b/tensorflow/python/training/saver_large_variable_test.py index 40f0a47e430..1e6d9e0c770 100644 --- a/tensorflow/python/training/saver_large_variable_test.py +++ b/tensorflow/python/training/saver_large_variable_test.py @@ -37,7 +37,8 @@ class SaverLargeVariableTest(tf.test.TestCase): with tf.device("/cpu:0"): var = tf.Variable( tf.constant(False, shape=[2, 1024, 1024, 1024], dtype=tf.bool)) - save = tf.train.Saver({var.op.name: var}) + save = tf.train.Saver({var.op.name: var}, + write_version=tf.train.SaverDef.V1) var.initializer.run() with self.assertRaisesRegexp( tf.errors.InvalidArgumentError, diff --git a/tensorflow/python/training/saver_test.py b/tensorflow/python/training/saver_test.py index 987b7164d65..23bd61c384e 100644 --- a/tensorflow/python/training/saver_test.py +++ b/tensorflow/python/training/saver_test.py @@ -1590,12 +1590,40 @@ class MetaGraphTest(tf.test.TestCase): new_saver = tf.train.import_meta_graph( filename + ".meta", graph=graph, import_scope="new_model") new_saver.restore(sess, filename) - tf.train.write_graph(graph, "/tmp", "new_graph.pbtxt", as_text=True) - label = [0] * 10 - label[4] = 4 sess.run(["new_model/optimize"], {"new_model/image:0": np.random.random([1, 784]), - "new_model/label:0": np.reshape(label, [1, 10])}) + "new_model/label:0": + np.random.random_integers(10, size=[1, 10])}) + + def testClearDevices(self): + # Test that we import a graph without its devices and run successfully. 
+ with tf.Graph().as_default(): + with tf.device("/job:ps/replica:0/task:0/device:GPU:0"): + image = tf.placeholder(tf.float32, [None, 784], name="image") + label = tf.placeholder(tf.float32, [None, 10], name="label") + weights = tf.Variable(tf.random_uniform([784, 10]), name="weights") + bias = tf.Variable(tf.zeros([10]), name="bias") + logit = tf.nn.relu(tf.matmul(image, weights) + bias) + tf.nn.softmax(logit, name="prediction") + cost = tf.nn.softmax_cross_entropy_with_logits(logit, label) + tf.train.AdamOptimizer().minimize(cost, name="optimize") + meta_graph_def = tf.train.export_meta_graph() + + with tf.Session(graph=tf.Graph()) as sess: + tf.train.import_meta_graph( + meta_graph_def, clear_devices=False, import_scope="new_model") + with self.assertRaisesRegexp(tf.errors.InvalidArgumentError, + "Cannot assign a device to node"): + sess.run(tf.initialize_all_variables()) + + with tf.Session(graph=tf.Graph()) as sess: + tf.train.import_meta_graph( + meta_graph_def, clear_devices=True, import_scope="new_model") + sess.run(tf.initialize_all_variables()) + sess.run(["new_model/optimize"], + {"new_model/image:0": np.random.random([1, 784]), + "new_model/label:0": + np.random.random_integers(10, size=[1, 10])}) class CheckpointReaderTest(tf.test.TestCase): diff --git a/tensorflow/python/training/training.py b/tensorflow/python/training/training.py index a8484c4ae17..1a11eb86f8d 100644 --- a/tensorflow/python/training/training.py +++ b/tensorflow/python/training/training.py @@ -67,6 +67,10 @@ gradients. 
## Decaying the learning rate @@exponential_decay +@@inverse_time_decay +@@natural_exp_decay +@@piecewise_constant +@@polynomial_decay ## Moving Averages diff --git a/tensorflow/tensorboard/TAG b/tensorflow/tensorboard/TAG index f5c89552bd3..bb95160cb6e 100644 --- a/tensorflow/tensorboard/TAG +++ b/tensorflow/tensorboard/TAG @@ -1 +1 @@ -32 +33 diff --git a/tensorflow/tensorboard/backend/server.py b/tensorflow/tensorboard/backend/server.py index 6c2f51e14a2..f590b5e02f4 100644 --- a/tensorflow/tensorboard/backend/server.py +++ b/tensorflow/tensorboard/backend/server.py @@ -32,7 +32,7 @@ from six.moves import socketserver from tensorflow.python.platform import tf_logging as logging from tensorflow.python.summary import event_accumulator -from tensorflow.python.summary.impl import gcs +from tensorflow.python.summary.impl import io_wrapper from tensorflow.tensorboard.backend import handler # How many elements to store per tag, by tag type @@ -69,7 +69,8 @@ def ParseEventFilesSpec(logdir): return files for specification in logdir.split(','): # If it's a gcs or hdfs path, don't split on colon - if gcs.IsGCSPath(specification) or specification.startswith('hdfs://'): + if (io_wrapper.IsGCSPath(specification) or + specification.startswith('hdfs://')): run_name = None path = specification # If the spec looks like /foo:bar/baz, then we assume it's a path with a @@ -80,7 +81,7 @@ def ParseEventFilesSpec(logdir): else: run_name = None path = specification - if not (gcs.IsGCSPath(path) or path.startswith('hdfs://')): + if not (io_wrapper.IsGCSPath(path) or path.startswith('hdfs://')): path = os.path.realpath(path) files[path] = run_name return files @@ -120,14 +121,6 @@ def StartMultiplexerReloadingThread(multiplexer, path_to_run, load_interval): """ # We don't call multiplexer.Reload() here because that would make # AddRunsFromDirectory block until the runs have all loaded. 
- for path in path_to_run.keys(): - if gcs.IsGCSPath(path): - gcs.CheckIsSupported() - logging.info( - 'Assuming %s is intended to be a Google Cloud Storage path because ' - 'it starts with %s. If it isn\'t, prefix it with \'/.\' (i.e., use ' - '/.%s instead)', path, gcs.PATH_PREFIX, path) - def _ReloadForever(): while True: ReloadMultiplexer(multiplexer, path_to_run) diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/tf-multi-checkbox.html b/tensorflow/tensorboard/components/tf_dashboard_common/tf-multi-checkbox.html index e934e8a9181..dadad81a343 100644 --- a/tensorflow/tensorboard/components/tf_dashboard_common/tf-multi-checkbox.html +++ b/tensorflow/tensorboard/components/tf_dashboard_common/tf-multi-checkbox.html @@ -43,7 +43,8 @@ handle these situations gracefully. id="runs-regex" no-label-float label="Write a regex to filter runs" - value="{{regexInput}}" + value="[[regexInput]]" + on-bind-value-changed="_debouncedRegexChange" >
+ - diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-data-panel.html b/tensorflow/tensorboard/components/vz_projector/vz-projector-data-panel.html index eec1afe9e66..7e9c20294a2 100644 --- a/tensorflow/tensorboard/components/vz_projector/vz-projector-data-panel.html +++ b/tensorflow/tensorboard/components/vz_projector/vz-projector-data-panel.html @@ -86,7 +86,7 @@ paper-dropdown-menu paper-item { color: black; display: flex; font-weight: 500; - height: 50px; + height: 59px; padding-left: 20px; } diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-data-panel.ts b/tensorflow/tensorboard/components/vz_projector/vz-projector-data-panel.ts index 068035148c5..342144c245a 100644 --- a/tensorflow/tensorboard/components/vz_projector/vz-projector-data-panel.ts +++ b/tensorflow/tensorboard/components/vz_projector/vz-projector-data-panel.ts @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -import {ColorOption, ColumnStats} from './data'; -import {CheckpointInfo, DataProvider, parseRawMetadata, parseRawTensors} from './data-loader'; +import {ColorOption, ColumnStats, MetadataInfo} from './data'; +import {CheckpointInfo, DataProvider, parseRawMetadata, parseRawTensors} from './data-provider'; import {Projector} from './vz-projector'; import {ColorLegendRenderInfo, ColorLegendThreshold} from './vz-projector-legend'; // tslint:disable-next-line:no-unused-variable @@ -75,8 +75,8 @@ export class DataPanel extends DataPanelPolymer { // Get all the runs. this.dataProvider.retrieveRuns(runs => { this.runNames = runs; - // If there is only 1 run, choose that one by default. - if (this.runNames.length === 1) { + // Choose the first run by default. 
+ if (this.runNames.length > 0) { this.selectedRun = runs[0]; } }); @@ -86,23 +86,23 @@ export class DataPanel extends DataPanelPolymer { return isSeparator ? 'separator' : null; } - updateMetadataUI(columnStats: ColumnStats[], metadataFile: string) { + metadataChanged(metadata: MetadataInfo, metadataFile: string) { + this.updateMetadataUI(metadata.stats, metadataFile); + } + + private updateMetadataUI(columnStats: ColumnStats[], metadataFile: string) { this.dom.select('#metadata-file') .text(metadataFile) .attr('title', metadataFile); // Label by options. let labelIndex = -1; - if (columnStats.length > 1) { - this.labelOptions = columnStats.map((stats, i) => { - // Make the default label by the first non-numeric column. - if (!stats.isNumeric && labelIndex === -1) { - labelIndex = i; - } - return stats.name; - }); - } else { - this.labelOptions = ['label']; - } + this.labelOptions = columnStats.map((stats, i) => { + // Make the default label by the first non-numeric column. + if (!stats.isNumeric && labelIndex === -1) { + labelIndex = i; + } + return stats.name; + }); this.selectedLabelOption = this.labelOptions[Math.max(0, labelIndex)]; // Color by options. 
@@ -170,11 +170,10 @@ export class DataPanel extends DataPanelPolymer { if (metadataFile) { this.dataProvider.retrieveMetadata( this.selectedRun, this.selectedTensor, metadata => { - this.projector.updateDataSet(ds, metadata); - this.updateMetadataUI(metadata.stats, metadataFile); + this.projector.updateDataSet(ds, metadata, metadataFile); }); } else { - this.projector.updateDataSet(ds, null); + this.projector.updateDataSet(ds); } }); this.projector.setSelectedTensor( @@ -208,7 +207,13 @@ export class DataPanel extends DataPanelPolymer { .text(this.checkpointInfo.checkpointFile) .attr('title', this.checkpointInfo.checkpointFile); this.dataProvider.getDefaultTensor(this.selectedRun, defaultTensor => { - this.selectedTensor = defaultTensor; + if (this.selectedTensor === defaultTensor) { + // Explicitly call the observer. Polymer won't call it if the previous + // string matches the current string. + this._selectedTensorChanged(); + } else { + this.selectedTensor = defaultTensor; + } }); }); } @@ -254,14 +259,13 @@ export class DataPanel extends DataPanelPolymer { this.dom.select('#checkpoint-file') .text(fileName) .attr('title', fileName); - this.projector.updateDataSet(ds, null); + this.projector.updateDataSet(ds); }); } private metadataWasReadFromFile(rawContents: string, fileName: string) { parseRawMetadata(rawContents, metadata => { - this.projector.updateDataSet(this.projector.currentDataSet, metadata); - this.updateMetadataUI(metadata.stats, fileName); + this.projector.updateDataSet(this.projector.dataSet, metadata, fileName); }); } diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-input.ts b/tensorflow/tensorboard/components/vz_projector/vz-projector-input.ts index 35630412606..6270185dd4a 100644 --- a/tensorflow/tensorboard/components/vz_projector/vz-projector-input.ts +++ b/tensorflow/tensorboard/components/vz_projector/vz-projector-input.ts @@ -27,51 +27,53 @@ export interface InputChangedListener { /** Input control with custom 
capabilities (e.g. regex). */ export class ProjectorInput extends PolymerClass { private dom: d3.Selection; - private inputChangedListeners: InputChangedListener[]; + private textChangedListeners: InputChangedListener[]; private paperInput: HTMLInputElement; + private inRegexModeButton: HTMLButtonElement; private inRegexMode: boolean; /** Message that will be displayed at the bottom of the input control. */ message: string; - /** Placeholder text for the input control. */ - label: string; /** Subscribe to be called everytime the input changes. */ - onInputChanged(listener: InputChangedListener) { - this.inputChangedListeners.push(listener); + registerInputChangedListener(listener: InputChangedListener) { + this.textChangedListeners.push(listener); } ready() { this.inRegexMode = false; - this.inputChangedListeners = []; + this.textChangedListeners = []; this.dom = d3.select(this); this.paperInput = this.querySelector('paper-input') as HTMLInputElement; - let paperButton = this.querySelector('paper-button') as HTMLButtonElement; + this.inRegexModeButton = + this.querySelector('paper-button') as HTMLButtonElement; this.paperInput.setAttribute('error-message', 'Invalid regex'); this.paperInput.addEventListener('input', () => { - this.inputChanged(); + this.onTextChanged(); }); this.paperInput.addEventListener('keydown', event => { event.stopPropagation(); }); - // Setup the regex mode button. 
- paperButton.addEventListener('click', () => { - this.inRegexMode = (paperButton as any).active; - this.showHideSlashes(); - this.inputChanged(); - }); - this.showHideSlashes(); - this.inputChanged(); + this.inRegexModeButton.addEventListener( + 'click', () => this.onClickRegexModeButton()); + this.updateRegexModeDisplaySlashes(); + this.onTextChanged(); + } + + private onClickRegexModeButton() { + this.inRegexMode = (this.inRegexModeButton as any).active; + this.updateRegexModeDisplaySlashes(); + this.onTextChanged(); } private notifyInputChanged(value: string, inRegexMode: boolean) { - this.inputChangedListeners.forEach(l => l(value, inRegexMode)); + this.textChangedListeners.forEach(l => l(value, inRegexMode)); } - private inputChanged() { + private onTextChanged() { try { if (this.inRegexMode) { new RegExp(this.paperInput.value); @@ -86,7 +88,7 @@ export class ProjectorInput extends PolymerClass { this.notifyInputChanged(this.paperInput.value, this.inRegexMode); } - private showHideSlashes() { + private updateRegexModeDisplaySlashes() { d3.select(this.paperInput) .selectAll('.slash') .style('display', this.inRegexMode ? null : 'none'); @@ -99,6 +101,12 @@ export class ProjectorInput extends PolymerClass { getInRegexMode(): boolean { return this.inRegexMode; } + + set(value: string, inRegexMode: boolean) { + (this.inRegexModeButton as any).active = inRegexMode; + this.paperInput.value = value; + this.onClickRegexModeButton(); + } } document.registerElement(ProjectorInput.prototype.is, ProjectorInput); diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-inspector-panel.html b/tensorflow/tensorboard/components/vz_projector/vz-projector-inspector-panel.html index cd888369ea0..7554c322cef 100644 --- a/tensorflow/tensorboard/components/vz_projector/vz-projector-inspector-panel.html +++ b/tensorflow/tensorboard/components/vz_projector/vz-projector-inspector-panel.html @@ -54,6 +54,19 @@ limitations under the License. 
margin-right: 0; } +.nn { + display: flex; + flex-direction: column; +} + +.nn > * { + padding: 0 20px; +} + +.nn-list { + overflow-y: auto; +} + .nn-list .neighbor { font-size: 12px; margin-bottom: 8px; @@ -154,6 +167,10 @@ limitations under the License. margin-right: 10px; } +.matches-list { + padding: 0 20px; +} + .matches-list .row { border-bottom: 1px solid #ddd; cursor: pointer; @@ -164,8 +181,8 @@ limitations under the License. } .results { - overflow-y: auto; - padding: 0 20px; + display: flex; + flex-direction: column; }