diff --git a/WORKSPACE b/WORKSPACE index 9d3622878c3..e0931512f4a 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -38,6 +38,13 @@ new_http_archive( sha256 = "b4c178fd6236dcf0a20d25d07c45eebe85281263978c6a6f1dfc49d75befc45f" ) +new_http_archive( + name = "stylize", + build_file = "models.BUILD", + url = "https://storage.googleapis.com/download.tensorflow.org/models/stylize_v1.zip", + sha256 = "3d374a730aef330424a356a8d4f04d8a54277c425e274ecb7d9c83aa912c6bfa" +) + # TENSORBOARD_BOWER_AUTOGENERATED_BELOW_THIS_LINE_DO_NOT_EDIT new_http_archive( diff --git a/configure b/configure index 64add33bd5d..1e4d786974d 100755 --- a/configure +++ b/configure @@ -57,9 +57,27 @@ done if is_windows; then TF_NEED_GCP=0 TF_NEED_HDFS=0 + TF_NEED_JEMALLOC=0 TF_NEED_OPENCL=0 fi +while [ "$TF_NEED_JEMALLOC" == "" ]; do + read -p "Do you wish to use jemalloc as the malloc implementation? "\ +"(Linux only) [Y/n] " INPUT + case $INPUT in + [Yy]* ) echo "jemalloc enabled on Linux"; TF_NEED_JEMALLOC=1;; + [Nn]* ) echo "jemalloc disabled on Linux"; TF_NEED_JEMALLOC=0;; + "" ) echo "jemalloc enabled on Linux"; TF_NEED_JEMALLOC=1;; + * ) echo "Invalid selection: " $INPUT;; + esac +done + +if [ "$TF_NEED_JEMALLOC" == "1" ]; then + sed -i -e "s/WITH_JEMALLOC = False/WITH_JEMALLOC = True/" tensorflow/core/platform/default/build_config.bzl +else + sed -i -e "s/WITH_JEMALLOC = True/WITH_JEMALLOC = False/" tensorflow/core/platform/default/build_config.bzl +fi + while [ "$TF_NEED_GCP" == "" ]; do read -p "Do you wish to build TensorFlow with "\ "Google Cloud Platform support? [y/N] " INPUT diff --git a/tensorflow/.clang-format b/tensorflow/.clang-format new file mode 100644 index 00000000000..e06cf478f46 --- /dev/null +++ b/tensorflow/.clang-format @@ -0,0 +1,4 @@ +# Run manually to reformat a file: +# clang-format -i --style=file +BasedOnStyle: Google +DerivePointerAlignment: false diff --git a/tensorflow/BUILD b/tensorflow/BUILD index 9063af696fd..355e48d582f 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -190,6 +190,7 @@ filegroup( "//tensorflow/examples/image_retraining:all_files", "//tensorflow/examples/label_image:all_files", "//tensorflow/examples/learn:all_files", + "//tensorflow/examples/saved_model:all_files", "//tensorflow/examples/tutorials/estimators:all_files", "//tensorflow/examples/tutorials/mnist:all_files", "//tensorflow/examples/tutorials/word2vec:all_files", @@ -203,7 +204,6 @@ filegroup( "//tensorflow/python/debug:all_files", "//tensorflow/python/kernel_tests:all_files", "//tensorflow/python/saved_model:all_files", - "//tensorflow/python/saved_model/example:all_files", "//tensorflow/python/tools:all_files", "//tensorflow/tensorboard:all_files", "//tensorflow/tensorboard/app:all_files", diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD index 32696e83e4d..a6bc8fdc492 100644 --- a/tensorflow/c/BUILD +++ b/tensorflow/c/BUILD @@ -6,6 +6,7 @@ licenses(["notice"]) # Apache 2.0 load( "//tensorflow:tensorflow.bzl", "tf_cc_test", + "tf_copts", "tf_cuda_library", "tf_custom_op_library", ) @@ -23,13 +24,19 @@ tf_cuda_library( name = "c_api", srcs = ["c_api.cc"], hdrs = ["c_api.h"], + copts = tf_copts(), visibility = ["//visibility:public"], - deps = [ - "//tensorflow/cc/saved_model:loader", - "//tensorflow/core:core_cpu", - "//tensorflow/core:framework", - "//tensorflow/core:lib", - ], + deps = select({ + "//tensorflow:android": [ + "//tensorflow/core:android_tensorflow_lib_lite", + ], + "//conditions:default": [ + "//tensorflow/cc/saved_model:loader", + "//tensorflow/core:core_cpu", + 
"//tensorflow/core:framework", + "//tensorflow/core:lib", + ], + }), ) tf_cuda_library( diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc index 14988fbc4d7..83ce3e25d46 100644 --- a/tensorflow/c/c_api.cc +++ b/tensorflow/c/c_api.cc @@ -20,7 +20,9 @@ limitations under the License. #include #include +#ifndef __ANDROID__ #include "tensorflow/cc/saved_model/loader.h" +#endif #include "tensorflow/core/common_runtime/shape_refiner.h" #include "tensorflow/core/framework/log_memory.h" #include "tensorflow/core/framework/node_def_util.h" @@ -37,6 +39,7 @@ limitations under the License. #include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/mem.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/thread_annotations.h" @@ -159,11 +162,13 @@ Status MessageToBuffer(const tensorflow::protobuf::Message& in, return InvalidArgument("Passing non-empty TF_Buffer is invalid."); } const auto proto_size = in.ByteSize(); - void* buf = malloc(proto_size); + void* buf = tensorflow::port::Malloc(proto_size); in.SerializeToArray(buf, proto_size); out->data = buf; out->length = proto_size; - out->data_deallocator = [](void* data, size_t length) { free(data); }; + out->data_deallocator = [](void* data, size_t length) { + tensorflow::port::Free(data); + }; return Status::OK(); } @@ -287,13 +292,15 @@ void TF_SetConfig(TF_SessionOptions* options, const void* proto, TF_Buffer* TF_NewBuffer() { return new TF_Buffer{nullptr, 0, nullptr}; } TF_Buffer* TF_NewBufferFromString(const void* proto, size_t proto_len) { - void* copy = malloc(proto_len); + void* copy = tensorflow::port::Malloc(proto_len); memcpy(copy, proto, proto_len); TF_Buffer* buf = new TF_Buffer; buf->data = copy; buf->length = proto_len; - buf->data_deallocator = [](void* data, size_t length) { free(data); }; + buf->data_deallocator = [](void* data, size_t length) { + tensorflow::port::Free(data); + }; return buf; } @@ -694,7 +701,7 @@ TF_Library* TF_LoadLibrary(const char* library_filename, TF_Status* status) { TF_Buffer TF_GetOpList(TF_Library* lib_handle) { return lib_handle->op_list; } void TF_DeleteLibraryHandle(TF_Library* lib_handle) { - free(const_cast(lib_handle->op_list.data)); + tensorflow::port::Free(const_cast(lib_handle->op_list.data)); delete lib_handle; } @@ -1704,6 +1711,7 @@ TF_Session* TF_NewSession(TF_Graph* graph, const TF_SessionOptions* opt, } } +#ifndef __ANDROID__ TF_Session* TF_LoadSessionFromSavedModel( const TF_SessionOptions* session_options, const TF_Buffer* run_options, const char* export_dir, const char* const* tags, int tags_len, @@ -1757,6 +1765,7 @@ TF_Session* TF_LoadSessionFromSavedModel( session->last_num_graph_nodes = graph->graph.num_node_ids(); return session; } +#endif // __ANDROID__ void TF_CloseSession(TF_Session* s, TF_Status* status) { status->status = s->session->Close(); diff --git a/tensorflow/c/c_api.h b/tensorflow/c/c_api.h index 3ea2d31699d..e625d656ade 100644 --- a/tensorflow/c/c_api.h +++ b/tensorflow/c/c_api.h @@ -835,6 +835,10 @@ typedef struct TF_Session TF_Session; extern TF_Session* TF_NewSession(TF_Graph* graph, const TF_SessionOptions* opts, TF_Status* status); +#ifndef __ANDROID__ +// TODO(ashankar): Remove the __ANDROID__ guard. This will require ensuring that +// the tensorflow/cc/saved_model:loader build target is Android friendly. 
+ // This function creates a new TF_Session (which is created on success) using // `session_options`, and then initializes state (restoring tensors and other // assets) using `run_options`. @@ -853,6 +857,7 @@ TF_Session* TF_LoadSessionFromSavedModel( const TF_SessionOptions* session_options, const TF_Buffer* run_options, const char* export_dir, const char* const* tags, int tags_len, TF_Graph* graph, TF_Buffer* meta_graph_def, TF_Status* status); +#endif // __ANDROID__ // Close a session. // diff --git a/tensorflow/compiler/aot/compile.cc b/tensorflow/compiler/aot/compile.cc index 50e596786ab..00c07932aca 100644 --- a/tensorflow/compiler/aot/compile.cc +++ b/tensorflow/compiler/aot/compile.cc @@ -204,23 +204,23 @@ Status RewriteAndPruneGraph(Graph* graph, const Config& config, string feed_id; TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), kFeedIdAttr, &feed_id)); if (missing_feeds.erase(feed_id) == 0) { - return errors::Aborted(kArgOp, " node found with unknown feed id: ", - feed_id); + return errors::Aborted(kArgOp, + " node found with unknown feed id: ", feed_id); } } else if (n->type_string() == kRetvalOp) { string fetch_id; TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), kFetchIdAttr, &fetch_id)); if (missing_fetches.erase(fetch_id) == 0) { - return errors::Aborted(kRetvalOp, " node found with unknown fetch id: ", - fetch_id); + return errors::Aborted(kRetvalOp, + " node found with unknown fetch id: ", fetch_id); } } } if (!missing_feeds.empty() || !missing_fetches.empty()) { - return errors::Aborted("Post graph-pruning", ", missing feeds: ", - str_util::Join(missing_feeds, ", "), - ", missing fetches: ", - str_util::Join(missing_fetches, ", ")); + return errors::Aborted( + "Post graph-pruning", + ", missing feeds: ", str_util::Join(missing_feeds, ", "), + ", missing fetches: ", str_util::Join(missing_fetches, ", ")); } return Status::OK(); } @@ -351,16 +351,19 @@ Status CompileXla(xla::LocalClient* client, const xla::Computation& computation, for (int i = 0; i < pshape->parameters_size(); ++i) { arg_layouts.push_back(pshape->mutable_parameters(i)); } - xla::StatusOr> aot_or = - client->CompileAheadOfTime(computation, arg_layouts, pshape->result(), - aot_opts); + xla::LocalClient::AheadOfTimeComputationInstance instance; + instance.computation = &computation; + instance.argument_layouts = std::move(arg_layouts); + instance.result_layout = &pshape->result(); + xla::StatusOr>> + aot_or = client->CompileAheadOfTime({instance}, aot_opts); if (!aot_or.ok()) { return errors::Unknown("XLA compilation failed: ", aot_or.status().error_message()); } compile_result->aot = xla::unique_ptr_static_cast( - aot_or.ConsumeValueOrDie()); + std::move(aot_or.ValueOrDie().back())); compile_result->entry_point = aot_opts.entry_point_name(); compile_result->pointer_size = xla::LocalClient::PointerSizeForTriple(aot_opts.triple()); diff --git a/tensorflow/compiler/aot/tests/make_test_graphs.py b/tensorflow/compiler/aot/tests/make_test_graphs.py index 261dfcbdf8c..2a2d13dc498 100644 --- a/tensorflow/compiler/aot/tests/make_test_graphs.py +++ b/tensorflow/compiler/aot/tests/make_test_graphs.py @@ -18,6 +18,9 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import argparse +import sys + from tensorflow.core.protobuf import saver_pb2 from tensorflow.python.client import session from tensorflow.python.framework import constant_op @@ -27,22 +30,18 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops 
import variables from tensorflow.python.platform import app -from tensorflow.python.platform import flags as flags_lib from tensorflow.python.training import saver as saver_lib -flags = flags_lib -FLAGS = flags.FLAGS -flags.DEFINE_string('out_dir', '', - 'Output directory for graphs, checkpoints and savers.') +FLAGS = None -def tfadd(): +def tfadd(_): x = constant_op.constant([1], name='x_const') y = constant_op.constant([2], name='y_const') math_ops.add(x, y, name='x_y_sum') -def tfadd_with_ckpt(): +def tfadd_with_ckpt(out_dir): x = array_ops.placeholder(dtypes.int32, name='x_hold') y = variables.Variable(constant_op.constant([0]), name='y_saved') math_ops.add(x, y, name='x_y_sum') @@ -53,11 +52,11 @@ def tfadd_with_ckpt(): sess.run(init_op) sess.run(y.assign(y + 42)) # Without the checkpoint, the variable won't be set to 42. - ckpt = '%s/test_graph_tfadd_with_ckpt.ckpt' % FLAGS.out_dir + ckpt = '%s/test_graph_tfadd_with_ckpt.ckpt' % out_dir saver.save(sess, ckpt) -def tfadd_with_ckpt_saver(): +def tfadd_with_ckpt_saver(out_dir): x = array_ops.placeholder(dtypes.int32, name='x_hold') y = variables.Variable(constant_op.constant([0]), name='y_saved') math_ops.add(x, y, name='x_y_sum') @@ -68,27 +67,27 @@ def tfadd_with_ckpt_saver(): sess.run(init_op) sess.run(y.assign(y + 42)) # Without the checkpoint, the variable won't be set to 42. - ckpt_file = '%s/test_graph_tfadd_with_ckpt_saver.ckpt' % FLAGS.out_dir + ckpt_file = '%s/test_graph_tfadd_with_ckpt_saver.ckpt' % out_dir saver.save(sess, ckpt_file) # Without the SaverDef, the restore op won't be named correctly. - saver_file = '%s/test_graph_tfadd_with_ckpt_saver.saver' % FLAGS.out_dir + saver_file = '%s/test_graph_tfadd_with_ckpt_saver.saver' % out_dir with open(saver_file, 'w') as f: f.write(saver.as_saver_def().SerializeToString()) -def tfgather(): +def tfgather(_): params = array_ops.placeholder(dtypes.float32, name='params') indices = array_ops.placeholder(dtypes.int32, name='indices') array_ops.gather(params, indices, name='gather_output') -def tfmatmul(): +def tfmatmul(_): x = array_ops.placeholder(dtypes.float32, name='x_hold') y = array_ops.placeholder(dtypes.float32, name='y_hold') math_ops.matmul(x, y, name='x_y_prod') -def tfmatmulandadd(): +def tfmatmulandadd(_): # This tests multiple outputs. 
x = array_ops.placeholder(dtypes.float32, name='x_hold') y = array_ops.placeholder(dtypes.float32, name='y_hold') @@ -96,24 +95,33 @@ def tfmatmulandadd(): math_ops.add(x, y, name='x_y_sum') -def write_graph(build_graph): +def write_graph(build_graph, out_dir): """Build a graph using build_graph and write it out.""" g = ops.Graph() with g.as_default(): - build_graph() - filename = '%s/test_graph_%s.pb' % (FLAGS.out_dir, build_graph.__name__) + build_graph(out_dir) + filename = '%s/test_graph_%s.pb' % (out_dir, build_graph.__name__) with open(filename, 'w') as f: f.write(g.as_graph_def().SerializeToString()) def main(_): - write_graph(tfadd) - write_graph(tfadd_with_ckpt) - write_graph(tfadd_with_ckpt_saver) - write_graph(tfgather) - write_graph(tfmatmul) - write_graph(tfmatmulandadd) + write_graph(tfadd, FLAGS.out_dir) + write_graph(tfadd_with_ckpt, FLAGS.out_dir) + write_graph(tfadd_with_ckpt_saver, FLAGS.out_dir) + write_graph(tfgather, FLAGS.out_dir) + write_graph(tfmatmul, FLAGS.out_dir) + write_graph(tfmatmulandadd, FLAGS.out_dir) if __name__ == '__main__': - app.run() + parser = argparse.ArgumentParser() + parser.register('type', 'bool', lambda v: v.lower() == 'true') + parser.add_argument( + '--out_dir', + type=str, + default='', + help='Output directory for graphs, checkpoints and savers.' + ) + FLAGS, unparsed = parser.parse_known_args() + app.run(main=main, argv=[sys.argv[0]] + unparsed) diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc index 486725f1daa..318dc7fada9 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc @@ -41,12 +41,15 @@ const char* const kXlaClusterAttr = "_XlaCluster"; namespace { -bool HasXLAKernel(const NodeDef& node_def, DeviceType jit_device_type) { +bool HasXLAKernel(const Node& node, const DeviceType& jit_device_type) { + // _Send and _Recv should not be marked for compilation. + if (node.IsSend() || node.IsRecv()) return false; + // There is a SymbolicGradient kernel on the XLA_JIT device, but the gradient // is really a kind of function call and will be handled by // IsCompilableCall(). - if (node_def.op() == "SymbolicGradient") return false; - return FindKernelDef(jit_device_type, node_def, nullptr, nullptr).ok(); + if (node.type_string() == "SymbolicGradient") return false; + return FindKernelDef(jit_device_type, node.def(), nullptr, nullptr).ok(); } // Make sure we don't recurse infinitely on recursive functions. 
@@ -125,7 +128,7 @@ bool IsCompilableCall(const NodeDef& call_def, DeviceType jit_device_type,
       return IsCompilableWhile(node->def(), jit_device_type, depth + 1,
                                lib_runtime);
     }
-    if (!HasXLAKernel(node->def(), jit_device_type) &&
+    if (!HasXLAKernel(*node, jit_device_type) &&
         !IsCompilableCall(node->def(), jit_device_type, depth + 1,
                           lib_runtime)) {
       VLOG(2) << "Function marking failed: unsupported op " << node->name()
@@ -168,7 +171,7 @@ Status FindCompilationCandidates(
     CHECK(XlaOpRegistry::GetJitDevice(device_type.type(), &jit_device_name,
                                       /*requires_jit=*/nullptr));
     DeviceType jit_device_type(*jit_device_name);
-    if (!HasXLAKernel(node->def(), jit_device_type) &&
+    if (!HasXLAKernel(*node, jit_device_type) &&
         !IsCompilableCall(node->def(), jit_device_type, 0, lib_runtime.get())) {
       VLOG(2) << "Compilation rejected node: unsupported op " << node->name()
               << ": " << node->def().op();
diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc
index 250960d3958..f329e83e14d 100644
--- a/tensorflow/compiler/jit/xla_device_context.cc
+++ b/tensorflow/compiler/jit/xla_device_context.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/platform/mem.h"
 
 namespace tensorflow {
 
@@ -41,7 +42,7 @@ void* XlaDeviceAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
   // Regardless of the size requested, always allocate a XlaGlobalData. Respect
   // the alignment request because there is alignment checking even for Tensors
   // whose data is never accessed.
-  void* p = port::aligned_malloc(sizeof(XlaGlobalData), alignment);
+  void* p = port::AlignedMalloc(sizeof(XlaGlobalData), alignment);
   VLOG(2) << "Allocated XLA device tensor " << p;
   return new (p) XlaGlobalData();
 }
@@ -50,7 +51,7 @@ void XlaDeviceAllocator::DeallocateRaw(void* ptr) {
   XlaGlobalData* global_data = reinterpret_cast<XlaGlobalData*>(ptr);
   VLOG(2) << "Deallocated XLA device tensor " << ptr;
   global_data->~XlaGlobalData();
-  port::aligned_free(ptr);
+  port::AlignedFree(ptr);
 }
 
 void XlaDeviceAllocator::GetStats(AllocatorStats* stats) { stats->Clear(); }
diff --git a/tensorflow/compiler/jit/xla_gpu_device.cc b/tensorflow/compiler/jit/xla_gpu_device.cc
index 731ff7d673f..db4c86505cb 100644
--- a/tensorflow/compiler/jit/xla_gpu_device.cc
+++ b/tensorflow/compiler/jit/xla_gpu_device.cc
@@ -45,7 +45,7 @@ Status XlaGpuDeviceFactory::CreateDevices(const SessionOptions& options,
                             name_prefix, &device);
   if (!status.ok()) {
     // Treat failures as non-fatal; there might not be a GPU in the machine.
- LOG(WARNING) << "Failed to create XLA_GPU device: " << status; + VLOG(1) << "Failed to create XLA_GPU device: " << status; return Status::OK(); } devices->push_back(device.release()); diff --git a/tensorflow/compiler/tests/lstm_test.py b/tensorflow/compiler/tests/lstm_test.py index 9ffeb6c2a2f..31093c65713 100644 --- a/tensorflow/compiler/tests/lstm_test.py +++ b/tensorflow/compiler/tests/lstm_test.py @@ -18,7 +18,9 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import argparse import os +import sys import numpy as np @@ -32,29 +34,8 @@ from tensorflow.python.ops import gradients_impl from tensorflow.python.ops import init_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import variables -from tensorflow.python.platform import flags as flags_lib from tensorflow.python.platform import test -flags = flags_lib -FLAGS = flags.FLAGS - -flags.DEFINE_integer('batch_size', 128, - 'Inputs are fed in batches of this size, for both ' - 'inference and training. Larger values cause the matmul ' - 'in each LSTM cell to have higher dimensionality.') -flags.DEFINE_integer('seq_length', 60, - 'Length of the unrolled sequence of LSTM cells in a layer.' - 'Larger values cause more LSTM matmuls to be run.') -flags.DEFINE_integer('num_inputs', 1024, - 'Dimension of inputs that are fed into each LSTM cell.') -flags.DEFINE_integer('num_nodes', 1024, 'Number of nodes in each LSTM cell.') -flags.DEFINE_string('device', 'gpu', - 'TensorFlow device to assign ops to, e.g. "gpu", "cpu". ' - 'For details see documentation for tf.Graph.device.') - -flags.DEFINE_string('dump_graph_dir', '', 'If non-empty, dump graphs in ' - '*.pbtxt format to this directory.') - def _DumpGraph(graph, basename): if FLAGS.dump_graph_dir: @@ -290,4 +271,54 @@ class LSTMBenchmark(test.Benchmark): if __name__ == '__main__': - test.main() + parser = argparse.ArgumentParser() + parser.register('type', 'bool', lambda v: v.lower() == 'true') + parser.add_argument( + '--batch_size', + type=int, + default=128, + help="""\ + Inputs are fed in batches of this size, for both inference and training. + Larger values cause the matmul in each LSTM cell to have higher + dimensionality.\ + """ + ) + parser.add_argument( + '--seq_length', + type=int, + default=60, + help="""\ + Length of the unrolled sequence of LSTM cells in a layer.Larger values + cause more LSTM matmuls to be run.\ + """ + ) + parser.add_argument( + '--num_inputs', + type=int, + default=1024, + help='Dimension of inputs that are fed into each LSTM cell.' + ) + parser.add_argument( + '--num_nodes', + type=int, + default=1024, + help='Number of nodes in each LSTM cell.' + ) + parser.add_argument( + '--device', + type=str, + default='gpu', + help="""\ + TensorFlow device to assign ops to, e.g. "gpu", "cpu". For details see + documentation for tf.Graph.device.\ + """ + ) + parser.add_argument( + '--dump_graph_dir', + type=str, + default='', + help='If non-empty, dump graphs in *.pbtxt format to this directory.' + ) + global FLAGS # pylint:disable=global-at-module-level + FLAGS, unparsed = parser.parse_known_args() + test.main(argv=[sys.argv[0]] + unparsed) diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD index 3de9958cd66..4d861c71c41 100644 --- a/tensorflow/compiler/tf2xla/BUILD +++ b/tensorflow/compiler/tf2xla/BUILD @@ -89,6 +89,27 @@ cc_library( # Internal targets below this point. 
+cc_test(
+    name = "xla_compiler_test",
+    srcs = ["xla_compiler_test.cc"],
+    deps = [
+        ":xla_compiler",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:function_ops",
+        "//tensorflow/cc:ops",
+        "//tensorflow/cc:sendrecv_ops",
+        "//tensorflow/compiler/tf2xla/kernels:xla_ops",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla/client:client_library",
+        "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 cc_test(
     name = "str_util_test",
     srcs = [
diff --git a/tensorflow/compiler/tf2xla/kernels/relu_op.cc b/tensorflow/compiler/tf2xla/kernels/relu_op.cc
index 3cddff9df40..8adac23eeec 100644
--- a/tensorflow/compiler/tf2xla/kernels/relu_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/relu_op.cc
@@ -18,7 +18,6 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/kernels/cwise_ops.h"
 #include "tensorflow/compiler/tf2xla/xla_compilation_device.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
-#include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
diff --git a/tensorflow/compiler/tf2xla/xla_compilation_device.cc b/tensorflow/compiler/tf2xla/xla_compilation_device.cc
index 86a53c929ef..ad3c9217440 100644
--- a/tensorflow/compiler/tf2xla/xla_compilation_device.cc
+++ b/tensorflow/compiler/tf2xla/xla_compilation_device.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/local_device.h"
 #include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
 namespace tensorflow {
 
@@ -47,7 +48,7 @@ class XlaCompilationAllocator : public Allocator {
     // XlaExpression. Respect the alignment request because there is
     // alignment checking even for Tensors whose data is never
     // accessed.
-    void* p = port::aligned_malloc(sizeof(XlaExpression), alignment);
+    void* p = port::AlignedMalloc(sizeof(XlaExpression), alignment);
     XlaExpression* expression = reinterpret_cast<XlaExpression*>(p);
     new (expression) XlaExpression();
     return expression;
@@ -56,7 +57,7 @@ class XlaCompilationAllocator : public Allocator {
   void DeallocateRaw(void* ptr) override {
     XlaExpression* expression = reinterpret_cast<XlaExpression*>(ptr);
     expression->~XlaExpression();
-    port::aligned_free(ptr);
+    port::AlignedFree(ptr);
   }
 
   // Make sure that even tensors with 0 elements have allocated
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc
index e46c2a31482..a0edbc5cbc3 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler.cc
@@ -318,7 +318,7 @@ Status XlaCompiler::CompileGraph(string const& name,
   }
 
   XlaContext* xla_context =
-      new XlaContext(client(), name, allow_cpu_custom_calls_);
+      new XlaContext(this, client(), name, allow_cpu_custom_calls_);
   core::ScopedUnref xla_context_unref(xla_context);
 
   TF_RETURN_IF_ERROR(xla_context->BuildArguments(args, use_tuple_arg));
@@ -402,4 +402,15 @@ Status XlaCompiler::CompileGraph(string const& name,
   return Status::OK();
 }
 
+Status XlaCompiler::GetChannelHandle(const string& key,
+                                     xla::ChannelHandle* channel) {
+  mutex_lock lock(mu_);
+  auto result = channels_.emplace(key, xla::ChannelHandle());
+  if (result.second) {
+    TF_ASSIGN_OR_RETURN(result.first->second, client_->CreateChannelHandle());
+  }
+  *channel = result.first->second;
+  return Status::OK();
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h
index 0b882d60a1b..f21abae5f42 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.h
+++ b/tensorflow/compiler/tf2xla/xla_compiler.h
@@ -172,6 +172,12 @@ class XlaCompiler {
   XlaCompilationDevice* device() const { return device_; }
   const DeviceMgr* device_mgr() const { return &device_mgr_; }
 
+  // Retrieves the channel handle associated with `key`. Allocates
+  // a new channel handle if none exists. Channel handles can be used
+  // to communicate between different computations. Computations that
+  // communicate should be compiled with the same XlaCompiler.
+  Status GetChannelHandle(const string& key, xla::ChannelHandle* channel);
+
  private:
   // Does the real work of Compile() and CompileToComputation().
   Status CompileFunctionBody(FunctionLibraryRuntime* function_library,
@@ -195,6 +201,8 @@ class XlaCompiler {
   XlaCompilationDevice* device_;  // Owned by device_mgr_
   DeviceMgr device_mgr_;
 
+  std::unordered_map<string, xla::ChannelHandle> channels_ GUARDED_BY(mu_);
+
   TF_DISALLOW_COPY_AND_ASSIGN(XlaCompiler);
 };
diff --git a/tensorflow/compiler/tf2xla/xla_compiler_test.cc b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
new file mode 100644
index 00000000000..24efd3ed0b8
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
@@ -0,0 +1,107 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/xla_compiler.h" +#include "tensorflow/cc/framework/ops.h" +#include "tensorflow/cc/ops/function_ops.h" +#include "tensorflow/cc/ops/sendrecv_ops.h" +#include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" +#include "tensorflow/compiler/xla/client/client_library.h" +#include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/tests/literal_test_util.h" +#include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/public/version.h" + +namespace tensorflow { +namespace { + +class XlaCompilerTest : public ::testing::Test { + protected: + void SetUp() override { + client_ = xla::ClientLibrary::LocalClientOrDie(); + + XlaCompiler::Options options; + options.device_type = DeviceType(DEVICE_CPU_XLA_JIT); + options.client = client_; + compiler_.reset(new XlaCompiler(options)); + + XlaOpRegistry::RegisterJitKernels(); + + FunctionDefLibrary flib; + flib_def_.reset(new FunctionLibraryDefinition(OpRegistry::Global(), flib)); + flr_.reset(NewFunctionLibraryRuntime( + compiler_->device_mgr(), /*env=*/nullptr, compiler_->device(), + TF_GRAPH_DEF_VERSION, flib_def_.get(), OptimizerOptions(), + /*custom_kernel_creator=*/nullptr)); + } + + xla::Client* client_; + std::unique_ptr compiler_; + std::unique_ptr flib_def_; + std::unique_ptr flr_; +}; + +TEST_F(XlaCompilerTest, Simple) { + // Builds a graph that adds two Tensors. + Scope scope = Scope::NewRootScope().ExitOnError(); + auto a = ops::_Arg(scope.WithOpName("A"), DT_INT32, 0); + auto b = ops::_Arg(scope.WithOpName("B"), DT_INT32, 1); + auto c = ops::Add(scope.WithOpName("C"), a, b); + auto d = ops::_Retval(scope.WithOpName("D"), c, 0); + std::unique_ptr graph(new Graph(OpRegistry::Global())); + TF_ASSERT_OK(scope.ToGraph(graph.get())); + + // Builds a description of the arguments. + std::vector args(2); + args[0].type = DT_INT32; + args[0].shape = TensorShape({2}); + args[0].parameter = 0; + args[1].type = DT_INT32; + args[1].shape = TensorShape({2}); + args[1].parameter = 1; + + // Compiles the graph. + XlaCompiler::CompilationResult result; + TF_ASSERT_OK(compiler_->CompileGraph("add", std::move(graph), flr_.get(), + args, /*use_tuple_arg=*/false, &result)); + + // Tests that the generated computation works. 
+ std::unique_ptr param0_literal = + xla::LiteralUtil::CreateR1({7, 42}); + std::unique_ptr param1_literal = + xla::LiteralUtil::CreateR1({-3, 101}); + std::unique_ptr param0_data = + client_->TransferToServer(*param0_literal).ConsumeValueOrDie(); + std::unique_ptr param1_data = + client_->TransferToServer(*param1_literal).ConsumeValueOrDie(); + + std::unique_ptr actual = + client_ + ->Execute(result.computation, {param0_data.get(), param1_data.get()}) + .ConsumeValueOrDie(); + std::unique_ptr actual_literal = + client_->Transfer(*actual).ConsumeValueOrDie(); + + std::unique_ptr expected_literal = + xla::LiteralUtil::CreateR1({4, 143}); + xla::LiteralTestUtil::ExpectEqual(*expected_literal, *actual_literal); +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/xla_context.cc b/tensorflow/compiler/tf2xla/xla_context.cc index ad8fc3f2057..6c399366e5e 100644 --- a/tensorflow/compiler/tf2xla/xla_context.cc +++ b/tensorflow/compiler/tf2xla/xla_context.cc @@ -167,7 +167,7 @@ Status XlaContext::CollectResults( } } - if (handle.handle() > 0) { + if (handle.handle() > 0 || has_side_effects_) { // Build the full computation. The return value is the handle // constructed above. xla::StatusOr computation_status = builder().Build(); @@ -190,9 +190,11 @@ Status XlaContext::CollectResults( return Status::OK(); } -XlaContext::XlaContext(xla::Client* client, const string& computation_name, +XlaContext::XlaContext(XlaCompiler* compiler, xla::Client* client, + const string& computation_name, bool allow_cpu_custom_calls) - : xla_builder_(client, computation_name), + : compiler_(compiler), + xla_builder_(client, computation_name), allow_cpu_custom_calls_(allow_cpu_custom_calls) {} const xla::ComputationDataHandle& @@ -233,6 +235,11 @@ Status XlaContext::AddConstRetval(int retval_index, DataType dtype, return Status::OK(); } +void XlaContext::AddSideEffects() { + mutex_lock lock(mu_); + has_side_effects_ = true; +} + /* static */ const XlaExpression* XlaContext::CastExpressionFromTensor( const Tensor& tensor) { const XlaExpression* expression = diff --git a/tensorflow/compiler/tf2xla/xla_context.h b/tensorflow/compiler/tf2xla/xla_context.h index b0464025f7d..f4c840dc0bd 100644 --- a/tensorflow/compiler/tf2xla/xla_context.h +++ b/tensorflow/compiler/tf2xla/xla_context.h @@ -68,7 +68,7 @@ class XlaExpression { TF_DISALLOW_COPY_AND_ASSIGN(XlaExpression); }; -// The XlaContext is the datastructure accessible from +// The XlaContext is the data structure accessible from // OpKernelContexts when evaluating a subgraph of Ops for JIT // compilation by XLA. When an Op is executed during JIT // compilation the input Tensors to the Op store handles to @@ -132,8 +132,8 @@ class XlaContext : public ResourceBase { } // Create a new XlaContext. - XlaContext(xla::Client* client, const string& computation_name, - bool allow_cpu_custom_calls); + XlaContext(XlaCompiler* compiler, xla::Client* client, + const string& computation_name, bool allow_cpu_custom_calls); // Builds XLA computations for each of the arguments. // Should only be called once to initialize the arguments. Not thread-safe. @@ -160,6 +160,9 @@ class XlaContext : public ResourceBase { Status AddConstRetval(int retval_index, DataType dtype, const xla::Literal& literal); + // Mark the computation as having side effects (i.e., Send operators). + void AddSideEffects(); + // Retrieves the ComputationDataHandle from an input Tensor to an Op. 
This
 // computation was constructed by an Op that executed previously and
 // created the output Tensor using CreateOutputTensorFromComputation
 //
 static const xla::ComputationDataHandle& GetComputationFromTensor(
     const Tensor& tensor);
 
+  XlaCompiler* compiler() const { return compiler_; }
+
   // Returns the ComputationBuilder that Ops use for compiling new
   // expressions.
   xla::ComputationBuilder& builder();
@@ -215,6 +220,8 @@ class XlaContext : public ResourceBase {
   // or CreateConstantOutputTensor.
   static const XlaExpression* GetExpressionFromTensor(const Tensor& tensor);
 
+  XlaCompiler* const compiler_;
+
   mutable mutex mu_;
 
   // The ComputationBuilder used to construct the subgraph's compiled
@@ -250,6 +257,9 @@ class XlaContext : public ResourceBase {
   // The non-data-dependent return values of the computation.
   std::vector compile_time_constant_ GUARDED_BY(mu_);
 
+  // Does the computation have side effects, i.e., Send() calls?
+  bool has_side_effects_ GUARDED_BY(mu_) = false;
+
   // Cache of prebuilt computations indexed by their type.
   using ComputationMap = std::map;
diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.cc b/tensorflow/compiler/tf2xla/xla_op_kernel.cc
index 3883b907b43..00cf1adc119 100644
--- a/tensorflow/compiler/tf2xla/xla_op_kernel.cc
+++ b/tensorflow/compiler/tf2xla/xla_op_kernel.cc
@@ -223,6 +223,10 @@ void XlaOpKernelContext::SetConstantOutput(int index, const Tensor& constant) {
   expression->set_constant_value(constant);
 }
 
+void XlaOpKernelContext::SetOpHasSideEffects() {
+  XlaContext::Get(context_).AddSideEffects();
+}
+
 void XlaOpKernelContext::CtxFailure(Status s) { context_->CtxFailure(s); }
 void XlaOpKernelContext::CtxFailureWithWarning(Status s) {
   context_->CtxFailureWithWarning(s);
diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.h b/tensorflow/compiler/tf2xla/xla_op_kernel.h
index 0c614005bec..5fbc0cb6ac3 100644
--- a/tensorflow/compiler/tf2xla/xla_op_kernel.h
+++ b/tensorflow/compiler/tf2xla/xla_op_kernel.h
@@ -131,6 +131,9 @@ class XlaOpKernelContext {
   void SetStatus(const Status& status) { context_->SetStatus(status); }
   Status status() { return context_->status(); }
 
+  // Marks the op as having side effects (i.e., via Send).
+  void SetOpHasSideEffects();
+
   // Helper routines for the OP_REQUIRES macros
   void CtxFailure(Status s);
   void CtxFailureWithWarning(Status s);
diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc
index 148c033eaa3..384aae867b1 100644
--- a/tensorflow/compiler/xla/client/local_client.cc
+++ b/tensorflow/compiler/xla/client/local_client.cc
@@ -314,12 +314,23 @@ tensorflow::Status LocalClient::ExecuteLocally(
                                  options, result);
 }
 
-StatusOr<std::unique_ptr<AotCompilationResult>> LocalClient::CompileAheadOfTime(
-    const Computation& computation,
-    const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
-    const Shape& result_layout, const AotCompilationOptions& options) {
-  return local_service_->CompileAheadOfTime(
-      computation.handle(), argument_layouts, result_layout, options);
+StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
+LocalClient::CompileAheadOfTime(
+    const tensorflow::gtl::ArraySlice<AheadOfTimeComputationInstance>
+        computations,
+    const AotCompilationOptions& options) {
+  std::vector<LocalService::AheadOfTimeComputationInstance> service_instances;
+  service_instances.reserve(computations.size());
+  for (const AheadOfTimeComputationInstance& instance : computations) {
+    service_instances.push_back({});
+    LocalService::AheadOfTimeComputationInstance& service_instance =
+        service_instances.back();
+    TF_RET_CHECK(instance.computation != nullptr);
+    service_instance.computation = instance.computation->handle();
+    service_instance.argument_layouts = instance.argument_layouts;
+    service_instance.result_layout = instance.result_layout;
+  }
+  return local_service_->CompileAheadOfTime(service_instances, options);
 }
 
 int64 LocalClient::PointerSizeForTriple(tensorflow::StringPiece target_triple) {
diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h
index 1d6243a3b68..33366b97fd5 100644
--- a/tensorflow/compiler/xla/client/local_client.h
+++ b/tensorflow/compiler/xla/client/local_client.h
@@ -219,19 +219,26 @@ class LocalClient : public Client {
       const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
       const ExecutableBuildOptions& options);
 
-  // Compiles the computation for ahead-of-time execution. This is intended for
-  // use in static compilation. The |argument_layouts| parameter is used to
-  // inform the compiler of the expected layout for arguments while
-  // |result_layout| is used to signal the layout of the result. The |options|
-  // parameter is used to request which target the compiler should emit code
-  // for.
+  // A description of a computation to compile using CompileAheadOfTime.
+  struct AheadOfTimeComputationInstance {
+    const Computation* computation;
+    // Informs the compiler of the expected layout for arguments.
+    std::vector<const Shape*> argument_layouts;
+    // Specifies the expected result layout.
+    const Shape* result_layout;
+  };
+
+  // Compiles a list of computations for ahead-of-time execution. This is
+  // intended for use in static compilation. The |options| parameter describes
+  // the target for which the compiler should emit code.
   //
   // TODO(b/31222190): This doesn't really belong in LocalClient. Move it to
   // its own library.
-  StatusOr<std::unique_ptr<AotCompilationResult>> CompileAheadOfTime(
-      const Computation& computation,
-      const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
-      const Shape& result_layout, const AotCompilationOptions& options);
+  StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
+  CompileAheadOfTime(
+      const tensorflow::gtl::ArraySlice<AheadOfTimeComputationInstance>
+          computations,
+      const AotCompilationOptions& options);
 
   // Returns the size of a pointer in bytes for a given triple.
static int64 PointerSizeForTriple(tensorflow::StringPiece triple); diff --git a/tensorflow/compiler/xla/layout_util.cc b/tensorflow/compiler/xla/layout_util.cc index 81eb717821b..4d5f682156e 100644 --- a/tensorflow/compiler/xla/layout_util.cc +++ b/tensorflow/compiler/xla/layout_util.cc @@ -360,4 +360,20 @@ tensorflow::Status LayoutUtil::CopyLayoutBetweenShapes(const Shape& src, } } +/* static */ bool LayoutUtil::AreDimensionsConsecutive( + const Layout& layout, tensorflow::gtl::ArraySlice dims) { + std::vector positions_in_layout; + for (int64 dim : dims) { + positions_in_layout.push_back( + PositionInContainer(layout.minor_to_major(), dim)); + } + std::sort(positions_in_layout.begin(), positions_in_layout.end()); + for (size_t i = 1; i < positions_in_layout.size(); ++i) { + if (1 != positions_in_layout[i] - positions_in_layout[i - 1]) { + return false; + } + } + return true; +} + } // namespace xla diff --git a/tensorflow/compiler/xla/layout_util.h b/tensorflow/compiler/xla/layout_util.h index 984bf402cdc..b6e8ecaa169 100644 --- a/tensorflow/compiler/xla/layout_util.h +++ b/tensorflow/compiler/xla/layout_util.h @@ -144,6 +144,11 @@ class LayoutUtil { // except that the element type is ignored. static bool LayoutsInShapesEqual(const Shape& lhs, const Shape& rhs); + // Returns whether the given dimensions are consecutive in the given layout, + // not necessarily in the order given. + static bool AreDimensionsConsecutive(const Layout& layout, + tensorflow::gtl::ArraySlice dims); + private: TF_DISALLOW_COPY_AND_ASSIGN(LayoutUtil); }; diff --git a/tensorflow/compiler/xla/literal_util.h b/tensorflow/compiler/xla/literal_util.h index 78e9e3fb24f..f26116bf078 100644 --- a/tensorflow/compiler/xla/literal_util.h +++ b/tensorflow/compiler/xla/literal_util.h @@ -136,6 +136,12 @@ class LiteralUtil { const Literal& literal, tensorflow::gtl::ArraySlice start_indices, tensorflow::gtl::ArraySlice limit_indices); + // Creates a literal with a prepended dimension with bound "times"; e.g. a + // f32[3x2] with times=4 will produce a f32[4x3x2] with the 3x2 from the input + // literal replicated four times. + template + static std::unique_ptr Replicate(const Literal& input, int64 times); + // Create a literal by converting each element in an original literal to a new // type. 
template @@ -999,6 +1005,30 @@ LiteralUtil::CreateFullWithMonotonicDim0MajorLayout( return literal; } +template +/* static */ std::unique_ptr LiteralUtil::Replicate( + const Literal& input, int64 times) { + std::vector bounds = {times}; + bounds.insert(bounds.end(), input.shape().dimensions().begin(), + input.shape().dimensions().end()); + auto literal = MakeUnique(); + *literal->mutable_shape() = + ShapeUtil::MakeShape(input.shape().element_type(), bounds); + Reserve(ShapeUtil::ElementsIn(literal->shape()), literal.get()); + for (int64 index = 0; index < ShapeUtil::ElementsIn(input.shape()); ++index) { + const std::vector element_indices = + IndexUtil::LinearIndexToMultidimensionalIndex(input.shape(), index); + const auto element = Get(input, element_indices); + for (int64 sample = 0; sample < times; ++sample) { + std::vector output_indices = {sample}; + output_indices.insert(output_indices.end(), element_indices.begin(), + element_indices.end()); + Set(literal.get(), output_indices, element); + } + } + return literal; +} + } // namespace xla #endif // TENSORFLOW_COMPILER_XLA_LITERAL_UTIL_H_ diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc index fe892e872fd..b08f859270a 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc @@ -749,11 +749,11 @@ Status AlgebraicSimplifierVisitor::HandleConvolution( TF_RET_CHECK(LayoutUtil::HasLayout(filter_shape)); TF_RET_CHECK(LayoutUtil::HasLayout(convolution_shape)); - // Require 1x1 filter in the spatial dimensions (so no need to extract image - // patches). - if (filter_shape.dimensions(dnums.kernel_spatial_dimensions(0)) != 1 || - filter_shape.dimensions(dnums.kernel_spatial_dimensions(1)) != 1) { - return Status::OK(); + // Require the spatial dimensions in the kernel to have a bound of one. + for (int64 i = 0; i < dnums.kernel_spatial_dimensions_size(); ++i) { + if (filter_shape.dimensions(dnums.kernel_spatial_dimensions(i)) != 1) { + return Status::OK(); + } } // Stride ignores part of the output, which matrix multiplication does not do, @@ -782,9 +782,9 @@ Status AlgebraicSimplifierVisitor::HandleConvolution( input_shape.layout().minor_to_major(0) != dnums.feature_dimension() || // The input feature dimension should come later in the minor-to-major // order. 
- (PositionInContainer(AsInt64Slice(filter_shape.layout().minor_to_major()), + (PositionInContainer(filter_shape.layout().minor_to_major(), dnums.kernel_input_feature_dimension()) < - PositionInContainer(AsInt64Slice(filter_shape.layout().minor_to_major()), + PositionInContainer(filter_shape.layout().minor_to_major(), dnums.kernel_output_feature_dimension()))) { return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/backend.cc b/tensorflow/compiler/xla/service/backend.cc index 6e76c98c9f3..7452a7b6965 100644 --- a/tensorflow/compiler/xla/service/backend.cc +++ b/tensorflow/compiler/xla/service/backend.cc @@ -234,4 +234,8 @@ StatusOr Backend::devices_equivalent(int device_ordinal_a, executor_b->GetDeviceDescription().name()); } +Status Backend::ResetDevices() { + return transfer_manager_->ResetDevices(stream_executors_); +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/backend.h b/tensorflow/compiler/xla/service/backend.h index 17c53d299ed..db482c09ae2 100644 --- a/tensorflow/compiler/xla/service/backend.h +++ b/tensorflow/compiler/xla/service/backend.h @@ -149,6 +149,9 @@ class Backend { // used for scheduling work. For other platforms, returns NULL. const Eigen::ThreadPoolDevice* eigen_intra_op_thread_pool_device() const; + // Resets the devices associated with this backend. + Status ResetDevices(); + private: struct EigenThreadPoolWrapper; Backend(int64 replica_count, perftools::gputools::Platform* platform, diff --git a/tensorflow/compiler/xla/service/compiler.h b/tensorflow/compiler/xla/service/compiler.h index 632081a747e..85c2d03e1bc 100644 --- a/tensorflow/compiler/xla/service/compiler.h +++ b/tensorflow/compiler/xla/service/compiler.h @@ -128,10 +128,11 @@ class Compiler { // Compiles the HLO module for ahead-of-time execution. This is intended for // use in static compilation. 
- virtual StatusOr> CompileAheadOfTime( - std::unique_ptr module, - std::unique_ptr module_config, HloDumper dump_hlo, - const AotCompilationOptions& options) = 0; + virtual StatusOr>> + CompileAheadOfTime( + std::vector> module, + std::vector> module_config, + HloDumper dump_hlo, const AotCompilationOptions& options) = 0; ///// // The Compiler class also serves as a point to register compiler objects diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc index d566cfd8c8f..b9f4537b809 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc @@ -478,10 +478,13 @@ StatusOr>> CpuCompiler::Compile( "Compilation of multiple HLO modules is not yet supported on CPU."); } -StatusOr> CpuCompiler::CompileAheadOfTime( - std::unique_ptr hlo_module, - std::unique_ptr module_config, HloDumper dump_hlo, - const AotCompilationOptions& aot_options) { +StatusOr>> +CpuCompiler::CompileAheadOfTime( + std::vector> hlo_modules, + std::vector> module_configs, + HloDumper dump_hlo, const AotCompilationOptions& aot_options) { + TF_RET_CHECK(hlo_modules.size() == module_configs.size()); + if (aot_options.PlatformId() != se::host::kHostPlatformId) { return InvalidArgument("Incompatible AOT compilation platform"); } @@ -549,72 +552,78 @@ StatusOr> CpuCompiler::CompileAheadOfTime( const llvm::DataLayout& data_layout = llvm_module.getDataLayout(); int64 pointer_size = data_layout.getPointerSize(); - TF_RETURN_IF_ERROR( - RunHloPasses(hlo_module.get(), module_config.get(), dump_hlo)); + std::vector> results; + for (int i = 0; i < hlo_modules.size(); ++i) { + HloModule* hlo_module = hlo_modules[i].get(); + HloModuleConfig* module_config = module_configs[i].get(); - SequentialHloOrdering::HloModuleSequence module_sequence = - CreateModuleSequence(hlo_module.get()); - // Run buffer analysis on the HLO graph. This analysis figures out which - // temporary buffers are required to run the computation. - TF_ASSIGN_OR_RETURN( - std::unique_ptr assignment, - BufferAssigner::Run( - hlo_module.get(), - MakeUnique(hlo_module.get(), module_sequence), - pointer_size)); + TF_RETURN_IF_ERROR(RunHloPasses(hlo_module, module_config, dump_hlo)); - IrEmitter ir_emitter(*hlo_module, *module_config, *assignment, &llvm_module, - /*hlo_to_profile_idx=*/nullptr); - HloComputation* computation = hlo_module->entry_computation(); - for (auto embedded_computation : - computation->MakeEmbeddedComputationsList()) { - TF_RETURN_IF_ERROR( - ir_emitter - .EmitComputation(embedded_computation, embedded_computation->name(), - /*is_entry_computation=*/false, - &module_sequence.at(embedded_computation)) - .status()); - } - const string& entry_point_name = options.entry_point_name(); - TF_ASSIGN_OR_RETURN( - llvm::Function * entry_function, - ir_emitter.EmitComputation(computation, entry_point_name, - /*is_entry_computation=*/true)); + SequentialHloOrdering::HloModuleSequence module_sequence = + CreateModuleSequence(hlo_module); + // Run buffer analysis on the HLO graph. This analysis figures out which + // temporary buffers are required to run the computation. 
+ TF_ASSIGN_OR_RETURN( + std::unique_ptr assignment, + BufferAssigner::Run(hlo_module, MakeUnique( + hlo_module, module_sequence), + pointer_size)); - entry_function->setName(llvm_ir::AsStringRef(entry_point_name)); - - Disassembler disassembler(*target_machine); - CompilerFunctor compiler_functor(target_machine.get(), &disassembler, - opt_level, CompilerFunctor::AllIntrinsics()); - llvm::object::OwningBinary object_file = - compiler_functor(llvm_module); - llvm::StringRef object_file_data_ref = object_file.getBinary()->getData(); - ObjectFileData object_file_data(object_file_data_ref.begin(), - object_file_data_ref.end()); - - BufferSizes buffer_sizes; - for (const BufferAllocation& allocation : assignment->Allocations()) { - // Callers don't need to allocate temporary buffers for parameters. - if (allocation.is_entry_computation_parameter()) { - buffer_sizes.push_back(-1); - continue; + IrEmitter ir_emitter(*hlo_module, *module_config, *assignment, &llvm_module, + /*hlo_to_profile_idx=*/nullptr); + HloComputation* computation = hlo_module->entry_computation(); + for (auto embedded_computation : + computation->MakeEmbeddedComputationsList()) { + TF_RETURN_IF_ERROR( + ir_emitter + .EmitComputation(embedded_computation, + embedded_computation->name(), + /*is_entry_computation=*/false, + &module_sequence.at(embedded_computation)) + .status()); } - // Callers don't need to allocate anything for thread-local temporary - // buffers. They are lowered to allocas. - if (allocation.is_thread_local()) { - buffer_sizes.push_back(-1); - continue; + const string& entry_point_name = options.entry_point_name(); + TF_ASSIGN_OR_RETURN( + llvm::Function * entry_function, + ir_emitter.EmitComputation(computation, entry_point_name, + /*is_entry_computation=*/true)); + + entry_function->setName(llvm_ir::AsStringRef(entry_point_name)); + + Disassembler disassembler(*target_machine); + CompilerFunctor compiler_functor(target_machine.get(), &disassembler, + opt_level, + CompilerFunctor::AllIntrinsics()); + llvm::object::OwningBinary object_file = + compiler_functor(llvm_module); + llvm::StringRef object_file_data_ref = object_file.getBinary()->getData(); + ObjectFileData object_file_data(object_file_data_ref.begin(), + object_file_data_ref.end()); + + BufferSizes buffer_sizes; + for (const BufferAllocation& allocation : assignment->Allocations()) { + // Callers don't need to allocate temporary buffers for parameters. + if (allocation.is_entry_computation_parameter()) { + buffer_sizes.push_back(-1); + continue; + } + // Callers don't need to allocate anything for thread-local temporary + // buffers. They are lowered to allocas. 
+ if (allocation.is_thread_local()) { + buffer_sizes.push_back(-1); + continue; + } + buffer_sizes.push_back(allocation.size()); } - buffer_sizes.push_back(allocation.size()); + + TF_ASSIGN_OR_RETURN(const BufferAllocation* result_allocation, + assignment->GetUniqueTopLevelOutputAllocation()); + + results.emplace_back(MakeUnique( + std::move(object_file_data), std::move(buffer_sizes), + result_allocation->index())); } - - TF_ASSIGN_OR_RETURN(const BufferAllocation* result_allocation, - assignment->GetUniqueTopLevelOutputAllocation()); - - return std::unique_ptr( - MakeUnique(std::move(object_file_data), - std::move(buffer_sizes), - result_allocation->index())); + return std::move(results); } se::Platform::Id CpuCompiler::PlatformId() const { diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h index 349724d8406..d7d77ce58a6 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h @@ -123,10 +123,11 @@ class CpuCompiler : public Compiler { HloDumper dump_hlo, std::vector stream_exec) override; - StatusOr> CompileAheadOfTime( - std::unique_ptr module, - std::unique_ptr module_config, HloDumper dump_hlo, - const AotCompilationOptions& options) override; + StatusOr>> + CompileAheadOfTime( + std::vector> module, + std::vector> module_config, + HloDumper dump_hlo, const AotCompilationOptions& options) override; perftools::gputools::Platform::Id PlatformId() const override; diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.cc b/tensorflow/compiler/xla/service/generic_transfer_manager.cc index 086306696d3..1a6a144bd63 100644 --- a/tensorflow/compiler/xla/service/generic_transfer_manager.cc +++ b/tensorflow/compiler/xla/service/generic_transfer_manager.cc @@ -160,7 +160,9 @@ Status GenericTransferManager::TransferLiteralToInfeed( return Unimplemented("Infeed is not supported on GPU (b/30467474)"); } -Status GenericTransferManager::ResetDevice(se::StreamExecutor* executor) { +Status GenericTransferManager::ResetDevices( + tensorflow::gtl::ArraySlice + executors) { return Unimplemented( "Device reset is not yet supported on CPU and GPU (b/30481585)"); } diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.h b/tensorflow/compiler/xla/service/generic_transfer_manager.h index cfa02bf22f7..06819d65c70 100644 --- a/tensorflow/compiler/xla/service/generic_transfer_manager.h +++ b/tensorflow/compiler/xla/service/generic_transfer_manager.h @@ -55,7 +55,9 @@ class GenericTransferManager : public TransferManager { Status TransferLiteralToInfeed(perftools::gputools::StreamExecutor* executor, const Literal& literal) override; - Status ResetDevice(perftools::gputools::StreamExecutor* executor) override; + Status ResetDevices( + tensorflow::gtl::ArraySlice + executors) override; StatusOr> ShallowCopyTupleFromDevice( diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc index a13279c6ff6..2f95446e6c4 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc @@ -312,10 +312,11 @@ StatusOr>> GpuCompiler::Compile( "Compilation of multiple HLO modules is not yet supported on GPU."); } -StatusOr> GpuCompiler::CompileAheadOfTime( - std::unique_ptr module, - std::unique_ptr module_config, HloDumper dump_hlo, - const AotCompilationOptions& options) { +StatusOr>> +GpuCompiler::CompileAheadOfTime( + std::vector> module, + std::vector> 
module_config, + HloDumper dump_hlo, const AotCompilationOptions& options) { return Unimplemented("not yet implemented: GpuCompiler::CompileAheadOfTime"); } diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h index fefa4031041..a074607760f 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h @@ -52,10 +52,11 @@ class GpuCompiler : public Compiler { HloDumper dump_hlo, std::vector stream_exec) override; - StatusOr> CompileAheadOfTime( - std::unique_ptr module, - std::unique_ptr module_config, HloDumper dump_hlo, - AotCompilationOptions const& options) override; + StatusOr>> + CompileAheadOfTime( + std::vector> module, + std::vector> module_config, + HloDumper dump_hlo, AotCompilationOptions const& options) override; perftools::gputools::Platform::Id PlatformId() const override; diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc index 0821fb01abb..e141179ba17 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" #include +#include #include "external/llvm/include/llvm/IR/Module.h" #include "tensorflow/compiler/xla/layout_util.h" @@ -121,8 +122,22 @@ bool IsReductionToVector(const HloInstruction& reduce) { return false; } const HloInstruction* input = reduce.operand(0); - return ShapeUtil::Rank(input->shape()) > 1 && - ShapeUtil::Rank(reduce.shape()) == 1; + std::vector dims_to_keep; + for (int64 dim = 0; dim < input->shape().dimensions().size(); ++dim) { + if (!std::count(reduce.dimensions().begin(), reduce.dimensions().end(), + dim)) { + dims_to_keep.push_back(dim); + } + } + return LayoutUtil::AreDimensionsConsecutive(input->shape().layout(), + dims_to_keep) && + ShapeUtil::Equal(reduce.shape(), ShapeUtil::FilterDimensions( + [&dims_to_keep](int64 dim) { + return std::count( + dims_to_keep.begin(), + dims_to_keep.end(), dim); + }, + input->shape())); } // This emits a device-side call to diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 79a64433465..c107f9cbbe2 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1047,8 +1047,9 @@ Status IrEmitterUnnested::EmitRowReduction( // Figures out whether `reduce` is a row or column reduction, and which // dimensions to reduce, and calls either `EmitRowReduction` or // `EmitColumnReduction` as appropriate. -// Prerequisite: the shape of `reduce` has rank 1 and, if `reduce` is fused, the -// fused subgraph is pure elementwise. +// Prerequisite: all the dimensions to keep are contiguous in the input layout +// and, if `reduce` is fused, the fused subgraph is pure +// elementwise. Status IrEmitterUnnested::EmitReductionToVector( HloInstruction* reduce, const Shape& input_shape, const llvm_ir::ElementGenerator& input_gen, @@ -1063,25 +1064,39 @@ Status IrEmitterUnnested::EmitReductionToVector( << reduce->ToString(); // Specialize multi-dimensional-array-to-vector reduction. 
- // - // TODO(b/33239522): we could use the same algorithm for general reduction - as long as the input dimensions to keep are adjacent in the layout and - have the same relative layout as their corresponding output dimensions. - // For example, reducing shape [2,3,4,5] with minor_to_major={2,0,1,3} to - shape [2,4] with minor_to_major={1,0} can be implemented as a column - reduction from shape [15,8] to shape [8]. - int64 input_dim_to_keep = -1; + std::vector<int64> input_dims_to_keep; for (int64 input_dim = 0; input_dim < ShapeUtil::Rank(input_shape); ++input_dim) { if (std::find(dimensions_to_reduce.begin(), dimensions_to_reduce.end(), input_dim) == dimensions_to_reduce.end()) { - input_dim_to_keep = input_dim; - break; + input_dims_to_keep.push_back(input_dim); } } - CHECK_NE(-1, input_dim_to_keep); - if (LayoutUtil::Minor(input_shape.layout(), 0) == input_dim_to_keep) { + // Sort the dimensions to keep from minor to major, to facilitate checking + // whether another dimension is major or minor relative to them. + std::sort(input_dims_to_keep.begin(), input_dims_to_keep.end(), + [&input_shape](int64 dim_a, int64 dim_b) { + return PositionInContainer(input_shape.layout().minor_to_major(), + dim_a) < + PositionInContainer(input_shape.layout().minor_to_major(), + dim_b); + }); + // Now, if output rank is at least 1, `input_dims_to_keep.front()` is + // minormost and `input_dims_to_keep.back()` is majormost. + + // If the dimensions to keep are minormost, emit a column reduction. Because + // all the dimensions to keep are contiguous (a prerequisite of + // `EmitReductionToVector`), we only need to check whether the minormost + // dimension of the input is among the dimensions to keep. + // + // If the output is scalar, we could emit either a row or a column reduction. + // Some tests have shown that a scalar reduction is no more efficient as a + // row reduction than as a column reduction, and a column reduction is + // simpler to emit, so we emit a column reduction in this case. + if (input_dims_to_keep.empty() || + input_dims_to_keep.front() == + LayoutUtil::Minor(input_shape.layout(), 0)) { // Column reduction. Treat the result of "input" as a matrix whose width // is the most minor dimension and height the product of other dimensions, // and treat "reduce" as a column reduction of the input matrix.
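As a side note for readers tracing the new check: the decision described in the comments above can be sanity-checked with a few lines of standalone Python (illustrative names only, not XLA's API). A reduction is emitted as a column reduction exactly when the output is scalar or the minormost kept dimension coincides with the minormost dimension of the input layout:

    def is_column_reduction(minor_to_major, dims_to_keep):
        # minor_to_major lists dimension numbers from minor to major, mirroring
        # input_shape.layout().minor_to_major() in the C++ above.
        if not dims_to_keep:
            return True  # Scalar output: emitted as a column reduction.
        # Sort kept dimensions by layout position (minor first), like the
        # std::sort over PositionInContainer above.
        ordered = sorted(dims_to_keep, key=minor_to_major.index)
        return ordered[0] == minor_to_major[0]

    # The example from the deleted TODO: shape [2,3,4,5] with minor_to_major
    # {2,0,1,3}, keeping dimensions {0,2}. Dimension 2 is minormost, so this
    # is a column reduction (from shape [15,8] to shape [8]).
    assert is_column_reduction([2, 0, 1, 3], [0, 2])
    # Keeping only the majormost dimension 3 instead yields a row reduction.
    assert not is_column_reduction([2, 0, 1, 3], [3])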
@@ -1091,7 +1106,8 @@ Status IrEmitterUnnested::EmitReductionToVector( int64 height = 1; for (int64 input_dim = 0; input_dim < ShapeUtil::Rank(input_shape); ++input_dim) { - if (input_dim != input_dim_to_keep) { + if (!std::count(input_dims_to_keep.begin(), input_dims_to_keep.end(), + input_dim)) { height *= input_shape.dimensions(input_dim); } } @@ -1108,22 +1124,19 @@ Status IrEmitterUnnested::EmitReductionToVector( int64 width = 1; for (int64 input_dim = 0; input_dim < ShapeUtil::Rank(input_shape); ++input_dim) { - if (PositionInContainer( - AsInt64Slice(input_shape.layout().minor_to_major()), input_dim) > - PositionInContainer( - AsInt64Slice(input_shape.layout().minor_to_major()), - input_dim_to_keep)) { + if (PositionInContainer(input_shape.layout().minor_to_major(), + input_dim) > + PositionInContainer(input_shape.layout().minor_to_major(), + input_dims_to_keep.back())) { depth *= input_shape.dimensions(input_dim); - } else if (PositionInContainer( - AsInt64Slice(input_shape.layout().minor_to_major()), - input_dim) < - PositionInContainer( - AsInt64Slice(input_shape.layout().minor_to_major()), - input_dim_to_keep)) { + } else if (PositionInContainer(input_shape.layout().minor_to_major(), + input_dim) < + PositionInContainer(input_shape.layout().minor_to_major(), + input_dims_to_keep.front())) { width *= input_shape.dimensions(input_dim); } } - int64 height = input_shape.dimensions(input_dim_to_keep); + const int64 height = ShapeUtil::ElementsIn(reduce->shape()); return EmitRowReduction(depth, height, width, reduce, input_shape, input_gen, init_value_gen, reducer); } diff --git a/tensorflow/compiler/xla/service/local_service.cc b/tensorflow/compiler/xla/service/local_service.cc index 38465e37e7b..7f86a3cbb57 100644 --- a/tensorflow/compiler/xla/service/local_service.cc +++ b/tensorflow/compiler/xla/service/local_service.cc @@ -206,42 +206,49 @@ tensorflow::Status LocalService::ExecuteLocally( return tensorflow::Status::OK(); } -StatusOr> +StatusOr>> LocalService::CompileAheadOfTime( - const ComputationHandle& computation, - const tensorflow::gtl::ArraySlice argument_layouts, - const Shape& result_layout, const AotCompilationOptions& options) { - TF_ASSIGN_OR_RETURN(UserComputation * user_computation, - computation_tracker_.Resolve(computation)); - VersionedComputationHandle versioned_handle = - user_computation->GetVersionedHandle(); + const tensorflow::gtl::ArraySlice + computations, + const AotCompilationOptions& options) { + std::vector> hlo_modules; + std::vector> module_configs; + for (const AheadOfTimeComputationInstance& instance : computations) { + TF_ASSIGN_OR_RETURN(UserComputation * user_computation, + computation_tracker_.Resolve(instance.computation)); + VersionedComputationHandle versioned_handle = + user_computation->GetVersionedHandle(); - TF_ASSIGN_OR_RETURN( - std::unique_ptr hlo_module, - computation_tracker_.BuildHloModule(versioned_handle, - /*include_unused_parameters=*/true)); + TF_ASSIGN_OR_RETURN(std::unique_ptr hlo_module, + computation_tracker_.BuildHloModule( + versioned_handle, + /*include_unused_parameters=*/true)); + hlo_modules.push_back(std::move(hlo_module)); - TF_ASSIGN_OR_RETURN( - std::shared_ptr program_shape, - user_computation->ComputeProgramShape(versioned_handle.version)); + TF_ASSIGN_OR_RETURN( + std::shared_ptr program_shape, + user_computation->ComputeProgramShape(versioned_handle.version)); - auto module_config = MakeUnique(*program_shape); - auto* computation_layout = module_config->mutable_entry_computation_layout(); - for (int i = 0; i 
< argument_layouts.size(); ++i) { - const Shape& argument_layout = *argument_layouts[i]; - if (ShapeUtil::IsTuple(argument_layout)) { - return Unimplemented("tuple arguments not supported yet"); + module_configs.push_back(MakeUnique(*program_shape)); + HloModuleConfig* module_config = module_configs.back().get(); + auto* computation_layout = + module_config->mutable_entry_computation_layout(); + for (int i = 0; i < instance.argument_layouts.size(); ++i) { + const Shape& argument_layout = *instance.argument_layouts[i]; + if (ShapeUtil::IsTuple(argument_layout)) { + return Unimplemented("tuple arguments not supported yet"); + } + TF_RETURN_IF_ERROR( + computation_layout->mutable_parameter_layout(i)->CopyLayoutFromShape( + argument_layout)); } TF_RETURN_IF_ERROR( - computation_layout->mutable_parameter_layout(i)->CopyLayoutFromShape( - argument_layout)); + computation_layout->mutable_result_layout()->CopyLayoutFromShape( + *instance.result_layout)); } - TF_RETURN_IF_ERROR( - computation_layout->mutable_result_layout()->CopyLayoutFromShape( - result_layout)); return execute_backend_->compiler() - ->CompileAheadOfTime(std::move(hlo_module), std::move(module_config), + ->CompileAheadOfTime(std::move(hlo_modules), std::move(module_configs), MakeHloDumper(), options) .ConsumeValueOrDie(); } @@ -426,8 +433,9 @@ StatusOr> LocalService::ExecuteLocallyInternal( } else { se::StreamExecutor* stream_executor; if (options.device_ordinal() >= 0) { - TF_ASSIGN_OR_RETURN(stream_executor, execute_backend_->stream_executor( - options.device_ordinal())); + TF_ASSIGN_OR_RETURN( + stream_executor, + execute_backend_->stream_executor(options.device_ordinal())); } else { stream_executor = execute_backend_->default_stream_executor(); } diff --git a/tensorflow/compiler/xla/service/local_service.h b/tensorflow/compiler/xla/service/local_service.h index 3e160a0201e..9fe0d5993b3 100644 --- a/tensorflow/compiler/xla/service/local_service.h +++ b/tensorflow/compiler/xla/service/local_service.h @@ -139,13 +139,21 @@ class LocalService : public Service { tensorflow::gtl::ArraySlice arguments, const LocalExecuteOptions& options, ShapedBuffer* result_buffer); - // Compiles the computation for ahead-of-time execution. This is intended for - // use in static compilation. See |LocalClient::CompileAheadOfTime| for - // additional details. - StatusOr> CompileAheadOfTime( - const ComputationHandle& computation, - const tensorflow::gtl::ArraySlice argument_layouts, - const Shape& result_layout, const AotCompilationOptions& Options); + // A description of a computation to compile using CompileAheadOfTime. + struct AheadOfTimeComputationInstance { + ComputationHandle computation; + std::vector argument_layouts; + const Shape* result_layout = nullptr; + }; + + // Compiles a list of computations for ahead-of-time execution. This is + // intended for use in static compilation. See + // |LocalClient::CompileAheadOfTime| for additional details. + StatusOr>> + CompileAheadOfTime( + const tensorflow::gtl::ArraySlice + computations, + const AotCompilationOptions& Options); // Builds an Executable with the given argument layouts and options. 
If // result_layout is non-null, then the executable is compiled to produce a diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc index 847aea78884..0b3900b3b20 100644 --- a/tensorflow/compiler/xla/service/service.cc +++ b/tensorflow/compiler/xla/service/service.cc @@ -1019,16 +1019,7 @@ tensorflow::Status Service::TransferToInfeed(const TransferToInfeedRequest* arg, tensorflow::Status Service::ResetDevice(const ResetDeviceRequest* arg, ResetDeviceResponse* result) { - int first_device_ordinal = arg->has_device_handle() - ? arg->device_handle().handle() - : execute_backend_->default_device_ordinal(); - TF_ASSIGN_OR_RETURN(auto executors, - execute_backend_->Replicas(first_device_ordinal)); - for (se::StreamExecutor* executor : executors) { - TF_RETURN_IF_ERROR( - execute_backend_->transfer_manager()->ResetDevice(executor)); - } - return tensorflow::Status::OK(); + return execute_backend_->ResetDevices(); } tensorflow::Status Service::TransferToClientInProcess( diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h index 1141e99fe32..e8ad61c9d0f 100644 --- a/tensorflow/compiler/xla/service/service.h +++ b/tensorflow/compiler/xla/service/service.h @@ -162,7 +162,15 @@ class Service : public ServiceInterface { const TransferToInfeedRequest* arg, TransferToInfeedResponse* result) override; - // Resets the device, clearing all existing state on the device. + // Resets devices, clearing all existing state on all the devices associated + // with this service (including memory allocated on the devices). + // + // ResetDevice may only be called when no state from a previous Execution on + // the device is used by the next Execution. + // + // ResetDevice should be called before an Execution that expects the device + // to be in the reset state, for example when the prior Execution modifies + // device state (e.g., architectural state) that the next Execution depends + // on. tensorflow::Status ResetDevice(const ResetDeviceRequest* arg, ResetDeviceResponse* result) override; diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc index 11559ad7578..fbab2dfd4af 100644 --- a/tensorflow/compiler/xla/service/shape_inference.cc +++ b/tensorflow/compiler/xla/service/shape_inference.cc @@ -1319,9 +1319,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( // Permute(dimensions,input) computes output[dimensions[i]]=input[i]. However, // we need output[i]=input[dimensions[i]] which is // Permute(Inverse(dimensions),input).
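The permutation identity in the comment above is easy to verify numerically. A minimal plain-Python sketch, where permute and inverse stand in for xla::Permute and xla::InversePermutation (not the real implementations):

    def permute(dims, seq):
        # output[dims[i]] = seq[i], the convention described in the comment.
        out = [None] * len(seq)
        for i, d in enumerate(dims):
            out[d] = seq[i]
        return out

    def inverse(dims):
        # inv[dims[i]] = i, so permute(inverse(dims), seq)[i] == seq[dims[i]].
        inv = [0] * len(dims)
        for i, d in enumerate(dims):
            inv[d] = i
        return inv

    dims = [1, 2, 3, 0]   # the transpose exercised by the test change below
    shape = [2, 3, 4, 5]  # assumed value of a_shape in that test
    assert permute(inverse(dims), shape) == [shape[d] for d in dims] == [3, 4, 5, 2]

The result is consistent with the {3, 4, 5, 2} shape expected in shape_inference_test.cc below.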
- return ShapeUtil::MakeShape(operand.element_type(), - Permute(InversePermutation(dimensions), - AsInt64Slice(operand.dimensions()))); + return ShapeUtil::PermuteDimensions(InversePermutation(dimensions), operand); } /* static */ StatusOr ShapeInference::InferSelectShape( diff --git a/tensorflow/compiler/xla/service/shape_inference_test.cc b/tensorflow/compiler/xla/service/shape_inference_test.cc index 10fd4e53c5c..5a1ae6b0024 100644 --- a/tensorflow/compiler/xla/service/shape_inference_test.cc +++ b/tensorflow/compiler/xla/service/shape_inference_test.cc @@ -1125,8 +1125,8 @@ TEST_F(ShapeInferenceTest, Transpose) { ShapeInference::InferTransposeShape(a_shape, {1, 2, 3, 0}); EXPECT_IS_OK(inferred_shape_and_status); Shape inferred_shape = inferred_shape_and_status.ValueOrDie(); - EXPECT_TRUE(ShapeUtil::Equal(inferred_shape, - ShapeUtil::MakeShape(F32, {3, 4, 5, 2}))); + EXPECT_TRUE(ShapeUtil::Compatible(inferred_shape, + ShapeUtil::MakeShape(F32, {3, 4, 5, 2}))); } } // namespace diff --git a/tensorflow/compiler/xla/service/transfer_manager.h b/tensorflow/compiler/xla/service/transfer_manager.h index 90dc921b7de..7ffce452139 100644 --- a/tensorflow/compiler/xla/service/transfer_manager.h +++ b/tensorflow/compiler/xla/service/transfer_manager.h @@ -23,6 +23,7 @@ limitations under the License. #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/platform/thread_annotations.h" @@ -63,8 +64,10 @@ class TransferManager { perftools::gputools::StreamExecutor* executor, const Literal& literal) = 0; - // Resets the device that the given executor runs on. - virtual Status ResetDevice(perftools::gputools::StreamExecutor* executor) = 0; + // Resets the devices associated with this transfer manager. + virtual Status ResetDevices( + tensorflow::gtl::ArraySlice + executors) = 0; // Shallow copy a tuple from the device and create a DeviceMemoryBase object // for each element in the tuple.
A DeviceMemoryBase object refers to the diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc index 1e1e8c1b98a..ab2c43cd3dc 100644 --- a/tensorflow/compiler/xla/shape_util.cc +++ b/tensorflow/compiler/xla/shape_util.cc @@ -984,4 +984,38 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape, check_input_unit_indices(output_shape, input_shape); } +/* static */ Shape ShapeUtil::DeleteDimension(int64 dim_to_delete, + Shape shape) { + shape.mutable_dimensions()->erase(shape.dimensions().begin() + dim_to_delete); + if (LayoutUtil::HasLayout(shape)) { + Layout* layout = shape.mutable_layout(); + for (size_t i = 0; i < layout->minor_to_major().size();) { + if (layout->minor_to_major(i) == dim_to_delete) { + layout->mutable_minor_to_major()->erase( + layout->minor_to_major().begin() + i); + continue; + } + if (layout->minor_to_major(i) > dim_to_delete) { + (*layout->mutable_minor_to_major())[i] -= 1; + } + ++i; + } + } + return shape; +} + +/* static */ Shape ShapeUtil::FilterDimensions( + const std::function<bool(int64)>& p, Shape shape) { + std::vector<int64> dims_to_delete; + for (int64 i = shape.dimensions().size() - 1; i >= 0; --i) { + if (!p(i)) { + dims_to_delete.push_back(i); + } + } + for (int64 dim : dims_to_delete) { + shape = DeleteDimension(dim, shape); + } + return shape; +} + } // namespace xla diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h index 35fd714b0bc..fa5fcc0224f 100644 --- a/tensorflow/compiler/xla/shape_util.h +++ b/tensorflow/compiler/xla/shape_util.h @@ -374,6 +374,19 @@ class ShapeUtil { static bool ReshapeIsBitcast(const Shape& input_shape, const Shape& output_shape); + // Returns a shape with the given dimension deleted. + // For example: + // • `DeleteDimension(1, T[m, n, k]) = T[m, k]` + static Shape DeleteDimension(int64 dim_to_delete, Shape shape); + + // Returns a shape with all the dimensions of the input shape for which `p` + // returns true. + // For example: + // • `FilterDimensions((< 2), T[m, n, k]) = T[m, n]` + // • `FilterDimensions(is_even_number, T[m, n, k]) = T[m, k]` + static Shape FilterDimensions(const std::function<bool(int64)>& p, + Shape shape); + private: // Recursive helper for comparing the equality of two shapes. Returns true if // the shapes are the same. If compare_layouts is true, then layouts must also diff --git a/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc b/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc index 50e5dec0f62..50d9ee50835 100644 --- a/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc +++ b/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc @@ -25,6 +25,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" #include "tensorflow/compiler/xla/types.h" +#include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/platform/init_main.h" #include "tensorflow/core/platform/logging.h" @@ -72,16 +73,19 @@ int main(int argc, char** argv) { llvm::Triple triple(xla::llvm_ir::AsStringRef(triple_string)); + xla::Computation computation = builder.Build().ConsumeValueOrDie(); + xla::LocalClient::AheadOfTimeComputationInstance instance{ + &computation, /*argument_layouts=*/{&opaque_shape}, &r0f32}; + xla::cpu::CpuAotCompilationOptions options( triple_string, /*cpu_name=*/"", /*features=*/"", "SumAndDouble", xla::cpu::CpuAotCompilationOptions::RelocationModel::Static); + + auto results = + client->CompileAheadOfTime({instance}, options).ConsumeValueOrDie(); auto result = xla::unique_ptr_static_cast( - client - ->CompileAheadOfTime(builder.Build().ValueOrDie(), - /*argument_layouts=*/{&opaque_shape}, r0f32, - options) - .ConsumeValueOrDie()); + std::move(results.front())); // We should have two buffers, one for the result and one temporary buffer, // and both should be float-sized. It's lame to hard-code this, but we need // local_client_aot_test.cc to be able to easily invoke the function. diff --git a/tensorflow/compiler/xla/util.cc b/tensorflow/compiler/xla/util.cc index dac5dadf834..3ee5dfc9496 100644 --- a/tensorflow/compiler/xla/util.cc +++ b/tensorflow/compiler/xla/util.cc @@ -176,12 +176,6 @@ std::vector ComposePermutations(tensorflow::gtl::ArraySlice p1, return output; } -int64 PositionInContainer(tensorflow::gtl::ArraySlice container, - int64 value) { - return std::find(container.begin(), container.end(), value) - - container.begin(); -} - PaddingConfig MakeNoPaddingConfig(int64 rank) { PaddingConfig padding_config; for (int64 dnum = 0; dnum < rank; ++dnum) { diff --git a/tensorflow/compiler/xla/util.h b/tensorflow/compiler/xla/util.h index 842b4f219a0..00f8d946f89 100644 --- a/tensorflow/compiler/xla/util.h +++ b/tensorflow/compiler/xla/util.h @@ -183,8 +183,11 @@ std::vector InversePermutation( std::vector ComposePermutations(tensorflow::gtl::ArraySlice p1, tensorflow::gtl::ArraySlice p2); -int64 PositionInContainer(tensorflow::gtl::ArraySlice container, - int64 value); +template +int64 PositionInContainer(const Container& container, int64 value) { + return std::distance(container.begin(), + std::find(container.begin(), container.end(), value)); +} // Returns a PaddingConfig object that represents no padding for the given rank. PaddingConfig MakeNoPaddingConfig(int64 rank); diff --git a/tensorflow/contrib/android/BUILD b/tensorflow/contrib/android/BUILD index be77f4985a0..4c7df9d8d63 100644 --- a/tensorflow/contrib/android/BUILD +++ b/tensorflow/contrib/android/BUILD @@ -33,6 +33,7 @@ cc_library( visibility = ["//visibility:public"], deps = [ "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/java/src/main/native", ], alwayslink = 1, ) diff --git a/tensorflow/contrib/android/cmake/README.md b/tensorflow/contrib/android/cmake/README.md index ad9e1720c74..915319da557 100644 --- a/tensorflow/contrib/android/cmake/README.md +++ b/tensorflow/contrib/android/cmake/README.md @@ -1,6 +1,10 @@ TensorFlow-Android-Inference ============================ -Android Java interface to the TensorFlow native APIs +This directory contains CMake support for building the Android Java Inference +interface to the TensorFlow native APIs. + +See [tensorflow/contrib/android](..) 
for more details about the library, and +instructions for building with Bazel. Usage ----- @@ -24,9 +28,9 @@ Note: this makes native code in the lib traceable from your app. Dependencies ------------ -TensorFlow-Android-Inference depends on the TensorFlow static libs already built in your -local TensorFlow repo directory. For Linux/Mac OS, build_all_android.sh is used -in build.gradle to build it. It DOES take time to build the core libs; +TensorFlow-Android-Inference depends on the TensorFlow static libs already built +in your local TensorFlow repo directory. For Linux/Mac OS, build_all_android.sh +is used in build.gradle to build it. It DOES take time to build the core libs; so, by default, it is commented out to avoid confusion (otherwise Android Studio would appear to hang during opening the project). To enable it, refer to the comment in diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py index 8d5ff341acd..24b726ac098 100644 --- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py +++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py @@ -19,6 +19,7 @@ from __future__ import division from __future__ import print_function import time + from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops from tensorflow.contrib.rnn.python.ops import core_rnn from tensorflow.contrib.rnn.python.ops import core_rnn_cell_impl @@ -31,12 +32,8 @@ from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import gradients_impl from tensorflow.python.ops import init_ops from tensorflow.python.ops import variables -from tensorflow.python.platform import flags from tensorflow.python.platform import test -flags.DEFINE_integer("batch_size", 64, "batch size.") -FLAGS = flags.FLAGS - class CudnnRNNBenchmark(test.Benchmark): """Benchmarks Cudnn LSTM and other related models. diff --git a/tensorflow/contrib/hvx/hexagon_controller/src_soc_interface/include/soc_interface.h b/tensorflow/contrib/hvx/hexagon_controller/src_soc_interface/include/soc_interface.h new file mode 100644 index 00000000000..6d85e6ce487 --- /dev/null +++ b/tensorflow/contrib/hvx/hexagon_controller/src_soc_interface/include/soc_interface.h @@ -0,0 +1,98 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_PLATFORM_HEXAGON_SOC_INTERFACE_H_ +#define TENSORFLOW_PLATFORM_HEXAGON_SOC_INTERFACE_H_ + +#include + +// Declaration of the APIs provided by the hexagon shared library. This +// header is shared by both the hexagon library built with the Qualcomm SDK +// and TensorFlow. +// All functions defined here must have the prefix "soc_interface" to avoid +// naming conflicts. +#ifdef __cplusplus +extern "C" { +#else +#include +#endif // __cplusplus +// Returns the version of the loaded hexagon wrapper shared library.
+// You should assert that the version matches the expected version before +// calling APIs defined in this header. +int soc_interface_GetWrapperVersion(); +// Returns the version of hexagon binary. +// You should assert that the version matches the expected version before +// calling APIs defined in this header. +int soc_interface_GetSocControllerVersion(); +// Initialize SOC +bool soc_interface_Init(); +// Finalize SOC +bool soc_interface_Finalize(); +// Execute graph on SOC +bool soc_interface_ExecuteGraph(); +// Teardown graph setup +bool soc_interface_TeardownGraph(); +// Send input data to SOC +bool soc_interface_FillInputNodeFloat(int x, int y, int z, int d, + const uint8_t* const buf, + uint64_t buf_size); +// Load output data from SOC +bool soc_interface_ReadOutputNodeFloat(const char* const node_name, + uint8_t** buf, uint64_t* buf_size); +// Setup graph +// TODO(satok): Remove and use runtime version +bool soc_interface_setupDummyGraph(int version); + +// Allocate memory for params of node inputs and node outputs +bool soc_interface_AllocateNodeInputAndNodeOutputArray(int total_input_count, + int total_output_count); + +// Release memory for params of node inputs and node outputs +bool soc_interface_ReleaseNodeInputAndNodeOutputArray(); + +// Set one node's inputs and return pointer to that struct +void* soc_interface_SetOneNodeInputs(int input_count, const int* const node_id, + const int* const port); + +// Set one node's outputs and return pointer to that struct +void* soc_interface_SetOneNodeOutputs(int output_count, int* max_size); + +// Append const node to the graph +bool soc_interface_AppendConstNode(const char* const name, int node_id, + int batch, int height, int width, int depth, + const uint8_t* const data, int data_length); + +// Append node to the graph +bool soc_interface_AppendNode(const char* const name, int node_id, int op_id, + int padding_id, const void* const inputs, + int inputs_count, const void* const outputs, + int outputs_count); + +// Instantiate graph +bool soc_interface_InstantiateGraph(); + +// Construct graph +bool soc_interface_ConstructGraph(); + +// Set log level +void soc_interface_SetLogLevel(int log_level); + +// Set debug flag +void soc_interface_SetDebugFlag(uint64_t flag); + +#ifdef __cplusplus +} +#endif // __cplusplus + +#endif // TENSORFLOW_PLATFORM_HEXAGON_SOC_INTERFACE_H_ diff --git a/tensorflow/contrib/hvx/hexagon_controller/src_soc_interface/soc_interface.c b/tensorflow/contrib/hvx/hexagon_controller/src_soc_interface/soc_interface.c new file mode 100755 index 00000000000..ebcbb963e83 --- /dev/null +++ b/tensorflow/contrib/hvx/hexagon_controller/src_soc_interface/soc_interface.c @@ -0,0 +1,124 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "soc_interface.h" + +int soc_interface_GetWrapperVersion() { + // TODO(satok): implement + return -1; +} + +int soc_interface_GetSocControllerVersion() { + // TODO(satok): implement + return -1; +} + +bool soc_interface_Init() { + // TODO(satok): implement + return false; +} + +bool soc_interface_Finalize() { + // TODO(satok): implement + return false; +} + +bool soc_interface_ExecuteGraph() { + // TODO(satok): implement + return false; +} + +bool soc_interface_TeardownGraph() { + // TODO(satok): implement + return false; +} + +bool soc_interface_FillInputNodeFloat( + int x, int y, int z, int d, const uint8_t* const buf, uint64_t buf_size) { + // TODO(satok): implement + return false; +} + +// TODO(satok): Remove and use runtime version +bool soc_interface_ReadOutputNodeFloat( + const char* const node_name, uint8_t** buf, uint64_t *buf_size) { + // TODO(satok): implement + return false; +} + +bool soc_interface_setupDummyGraph(int version) { + // TODO(satok): implement + return false; +} + +bool soc_interface_AllocateNodeInputAndNodeOutputArray( + int total_input_count, int total_output_count) { + // TODO(satok): implement + return false; +} + +bool soc_interface_ReleaseNodeInputAndNodeOutputArray() { + // TODO(satok): implement + return false; +} + +void* soc_interface_SetOneNodeInputs( + int input_count, const int* const node_id, const int* const port) { + // TODO(satok): implement + return 0; +} + +void* soc_interface_SetOneNodeOutputs(int output_count, int* max_size) { + // TODO(satok): implement + return 0; +} + +// Append const node to the graph +bool soc_interface_AppendConstNode( + const char* const name, int node_id, int batch, int height, int width, + int depth, const uint8_t* const data, int data_length) { + // TODO(satok): implement + return false; +} + +// Append node to the graph +bool soc_interface_AppendNode( + const char* const name, int node_id, int op_id, int padding_id, + const void* const inputs, int inputs_count, const void* const outputs, + int outputs_count) { + // TODO(satok): implement + return false; +} + + +// Instantiate graph +bool soc_interface_InstantiateGraph() { + // TODO(satok): implement + return false; +} + +// Construct graph +bool soc_interface_ConstructGraph() { + // TODO(satok): implement + return false; +} + +void soc_interface_SetLogLevel(int log_level) { + // TODO(satok): implement +} + +void soc_interface_SetDebugFlag(uint64_t flag) { + // TODO(satok): implement +} diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py index b303a9d32b7..2673495b904 100644 --- a/tensorflow/contrib/layers/python/layers/layers.py +++ b/tensorflow/contrib/layers/python/layers/layers.py @@ -173,11 +173,12 @@ def _fused_batch_norm( `data_format` is `NHWC` and the second dimension if `data_format` is `NCHW`. decay: decay for the moving average. Reasonable values for `decay` are close - to 1.0, typically in the multiple-nines range: 0.999, 0.99, 0.9, etc.
+ Lower `decay` value (recommend trying `decay`=0.9) if model experiences + reasonably good training performance but poor validation and/or test + performance. + center: If True, add offset of `beta` to normalized tensor. If False, + `beta` is ignored. scale: If True, multiply by `gamma`. If False, `gamma` is not used. When the next layer is linear (also e.g. `nn.relu`), this can be disabled since the scaling can be done by the next layer. @@ -632,16 +633,12 @@ def batch_norm( if need_moments: # Calculate the moments based on the individual batch. if batch_weights is None: - # Use a copy of moving_mean as a shift to compute more reliable moments. - shift = math_ops.add(moving_mean, 0) if data_format == DATA_FORMAT_NCHW: - shift = array_ops.reshape(shift, params_shape_broadcast) - mean, variance = nn.moments(inputs, moments_axes, shift=shift, - keep_dims=True) + mean, variance = nn.moments(inputs, moments_axes, keep_dims=True) mean = array_ops.reshape(mean, [-1]) variance = array_ops.reshape(variance, [-1]) else: - mean, variance = nn.moments(inputs, moments_axes, shift=shift) + mean, variance = nn.moments(inputs, moments_axes) else: if data_format == DATA_FORMAT_NCHW: mean, variance = nn.weighted_moments(inputs, moments_axes, @@ -1385,7 +1382,7 @@ def fully_connected(inputs, Raises: ValueError: if x has rank less than 2 or if its last dimension is not set. """ - if not (isinstance(num_outputs, six.integer_types)): + if not isinstance(num_outputs, six.integer_types): raise ValueError('num_outputs should be int or long, got %s.', num_outputs) layer_variable_getter = _build_variable_getter({'bias': 'biases'}) diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py index 1b0a8b12728..d1b35e33c26 100644 --- a/tensorflow/contrib/layers/python/layers/layers_test.py +++ b/tensorflow/contrib/layers/python/layers/layers_test.py @@ -2356,7 +2356,7 @@ class BatchNormTest(test.TestCase): else: image_shape = (batch_size, channels, height, width) axis = (0, 2, 3) - image_values = np.random.rand(*image_shape) + 2 + image_values = np.random.rand(*image_shape) + 256 expected_mean = np.mean(image_values, axis=axis) expected_var = np.var(image_values, axis=axis) if fused: @@ -2393,9 +2393,9 @@ class BatchNormTest(test.TestCase): # The outputs should be close to 0.0 mean and 1.0 variance self.assertAllClose( np.mean( - np_output, axis=axis), [0] * channels, rtol=0.1, atol=0.1) + np_output, axis=axis), [0] * channels, rtol=0.001, atol=0.001) self.assertAllClose( - np.var(np_output, axis=axis), [1] * channels, rtol=0.1, atol=0.1) + np.var(np_output, axis=axis), [1] * channels, rtol=0.01, atol=0.01) # The gradients should change slowly while updating moving_mean. max_diff = np.max(np.abs(images_gradients_value - new_images_gradients)) self.assertGreaterEqual(max_diff, 0.0) @@ -2558,25 +2558,29 @@ class LayerNormTest(test.TestCase): # output_train and output_eval should be the same. self.assertAllClose(sess.run([output_train]), sess.run([output_eval])) - def doOutputTest(self, input_shape): - with self.test_session() as sess: - input_values = np.random.rand(*input_shape) - inputs = constant_op.constant( - input_values, shape=input_shape, dtype=dtypes.float32) - output_op = _layers.layer_norm(inputs, scope='LN') - # Initialize all variables - sess.run(variables_lib.global_variables_initializer()) - # The mean and variance of the output should be close to 0 and 1 - # respectively. 
- moments_axis = tuple([i for i in range(1, len(input_shape))]) - outputs = sess.run(output_op) - expected_mean = np.zeros(input_shape[0]) - expected_var = np.ones(input_shape[0]) - mean = np.mean(outputs, axis=moments_axis) - var = np.var(outputs, axis=moments_axis) - tol = 1e-5 - self.assertAllClose(mean, expected_mean, rtol=tol, atol=tol) - self.assertAllClose(var, expected_var, rtol=tol, atol=tol) + def doOutputTest(self, input_shape, tol=1e-3): + for mu in [0.0, 1e2]: + for sigma in [1.0, 0.1]: + input_values = np.random.rand(*input_shape) * sigma + mu + expected_mean = np.zeros(input_shape[0]) + expected_var = np.ones(input_shape[0]) + with ops.Graph().as_default() as g: + with self.test_session(graph=g) as sess: + inputs = constant_op.constant(input_values, shape=input_shape, + dtype=dtypes.float32) + output_op = _layers.layer_norm(inputs, scope='LN') + # Initialize all variables + sess.run(variables_lib.global_variables_initializer()) + # The mean and variance of the output should be close to 0 and 1 + # respectively. + moments_axis = tuple([i for i in range(1, len(input_shape))]) + outputs = sess.run(output_op) + # Make sure that there are no NaNs + self.assertFalse(np.isnan(outputs).any()) + mean = np.mean(outputs, axis=moments_axis) + var = np.var(outputs, axis=moments_axis) + self.assertAllClose(mean, expected_mean, rtol=tol, atol=tol) + self.assertAllClose(var, expected_var, rtol=tol, atol=tol) def testOutput2DInput(self): self.doOutputTest((10, 300)) @@ -2584,6 +2588,12 @@ class LayerNormTest(test.TestCase): def testOutput4DInput(self): self.doOutputTest((100, 10, 10, 3)) + def testOutputSmallInput(self): + self.doOutputTest((10, 10, 10, 30)) + + def testOutputBigInput(self): + self.doOutputTest((1, 100, 100, 1)) + class MaxPool2DTest(test.TestCase): diff --git a/tensorflow/contrib/layers/python/layers/regularizers.py b/tensorflow/contrib/layers/python/layers/regularizers.py index 86d05167748..02eb2b390c6 100644 --- a/tensorflow/contrib/layers/python/layers/regularizers.py +++ b/tensorflow/contrib/layers/python/layers/regularizers.py @@ -65,7 +65,7 @@ def l1_regularizer(scale, scope=None): my_scale = ops.convert_to_tensor(scale, dtype=weights.dtype.base_dtype, name='scale') - return standard_ops.mul( + return standard_ops.multiply( my_scale, standard_ops.reduce_sum(standard_ops.abs(weights)), name=name) @@ -104,7 +104,7 @@ def l2_regularizer(scale, scope=None): my_scale = ops.convert_to_tensor(scale, dtype=weights.dtype.base_dtype, name='scale') - return standard_ops.mul(my_scale, nn.l2_loss(weights), name=name) + return standard_ops.multiply(my_scale, nn.l2_loss(weights), name=name) return l2 diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator.py b/tensorflow/contrib/learn/python/learn/estimators/estimator.py index 467d31c3317..e3dc27e6460 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/estimator.py +++ b/tensorflow/contrib/learn/python/learn/estimators/estimator.py @@ -407,14 +407,15 @@ class BaseEstimator( raise ValueError('Can not provide both steps and max_steps.') _verify_input_args(x, y, input_fn, None, batch_size) if x is not None: - return SKCompat(self).fit(x, y, batch_size, steps, max_steps, monitors) + SKCompat(self).fit(x, y, batch_size, steps, max_steps, monitors) + return self if max_steps is not None: try: start_step = load_variable(self._model_dir, ops.GraphKeys.GLOBAL_STEP) if max_steps <= start_step: logging.info('Skipping training since max_steps has already saved.') - return None + return self except: # pylint: 
disable=bare-except pass diff --git a/tensorflow/contrib/learn/python/learn/estimators/run_config.py b/tensorflow/contrib/learn/python/learn/estimators/run_config.py index d8336e3cbd0..42da9969733 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/run_config.py +++ b/tensorflow/contrib/learn/python/learn/estimators/run_config.py @@ -21,7 +21,6 @@ from __future__ import print_function import json import os -from tensorflow.contrib.framework import deprecated from tensorflow.core.protobuf import config_pb2 from tensorflow.python.training import server_lib @@ -256,79 +255,30 @@ class RunConfig(ClusterConfig): def tf_config(self): return self._tf_config - @tf_config.setter - @deprecated( - '2017-01-08', - 'RunConfig will be made immutable, please pass all args to constructor.') - def tf_config(self, value): - self._tf_config = value - @property def tf_random_seed(self): return self._tf_random_seed - @tf_random_seed.setter - @deprecated( - '2017-01-08', - 'RunConfig will be made immutable, please pass all args to constructor.') - def tf_random_seed(self, value): - self._tf_random_seed = value - @property def save_summary_steps(self): return self._save_summary_steps - @save_summary_steps.setter - @deprecated( - '2017-01-08', - 'RunConfig will be made immutable, please pass all args to constructor.') - def save_summary_steps(self, value): - self._save_summary_steps = value - @property def save_checkpoints_secs(self): return self._save_checkpoints_secs - @save_checkpoints_secs.setter - @deprecated( - '2017-01-08', - 'RunConfig will be made immutable, please pass all args to constructor.') - def save_checkpoints_secs(self, value): - self._save_checkpoints_secs = value - @property def save_checkpoints_steps(self): return self._save_checkpoints_steps - @save_checkpoints_steps.setter - @deprecated( - '2017-01-08', - 'RunConfig will be made immutable, please pass all args to constructor.') - def save_checkpoints_steps(self, value): - self._save_checkpoints_steps = value - @property def keep_checkpoint_max(self): return self._keep_checkpoint_max - @keep_checkpoint_max.setter - @deprecated( - '2017-01-08', - 'RunConfig will be made immutable, please pass all args to constructor.') - def keep_checkpoint_max(self, value): - self._keep_checkpoint_max = value - @property def keep_checkpoint_every_n_hours(self): return self._keep_checkpoint_every_n_hours - @keep_checkpoint_every_n_hours.setter - @deprecated( - '2017-01-08', - 'RunConfig will be made immutable, please pass all args to constructor.') - def keep_checkpoint_every_n_hours(self, value): - self._keep_checkpoint_every_n_hours = value - def _count_ps(cluster_spec): """Counts the number of parameter servers in cluster_spec.""" diff --git a/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py b/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py index 4356e930fac..48d79ecbbff 100644 --- a/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py +++ b/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== - """Implementations of different data feeders to provide data for TF trainer.""" # TODO(ipolosukhin): Replace this module with feed-dict queue runners & queues. 
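The estimator change above makes fit return `self` on its early-return paths instead of `None` (or the `SKCompat` result), so scikit-learn-style call chaining keeps working. A minimal sketch of the pattern with a hypothetical class (not the TF API):

    class Trainer(object):
        # Hypothetical estimator illustrating the return-self convention.
        def __init__(self):
            self.steps_done = 0

        def fit(self, steps):
            if steps <= self.steps_done:
                return self  # Nothing left to do, but still chainable.
            self.steps_done = steps
            return self

    # Because every path returns self, calls can be chained:
    assert Trainer().fit(100).fit(50).steps_done == 100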
@@ -37,13 +36,13 @@ from tensorflow.python.platform import tf_logging as logging from .pandas_io import HAS_PANDAS, extract_pandas_data, extract_pandas_matrix, extract_pandas_labels from .dask_io import HAS_DASK, extract_dask_data, extract_dask_labels - # pylint: enable=g-multiple-import,g-bad-import-order def _get_in_out_shape(x_shape, y_shape, n_classes, batch_size=None): """Returns shape for input and output of the data feeder.""" - x_is_dict, y_is_dict = isinstance(x_shape, dict), y_shape is not None and isinstance(y_shape, dict) + x_is_dict, y_is_dict = isinstance( + x_shape, dict), y_shape is not None and isinstance(y_shape, dict) if y_is_dict and n_classes is not None: assert (isinstance(n_classes, dict)) @@ -76,8 +75,11 @@ def _get_in_out_shape(x_shape, y_shape, n_classes, batch_size=None): if not y_is_dict: output_shape = out_el_shape(y_shape, n_classes) else: - output_shape = dict([(k, out_el_shape(v, n_classes[k] if n_classes is not None and k in n_classes else None)) - for k, v in list(y_shape.items())]) + output_shape = dict([ + (k, out_el_shape(v, n_classes[k] + if n_classes is not None and k in n_classes else None)) + for k, v in list(y_shape.items()) + ]) return input_shape, output_shape, batch_size @@ -99,8 +101,12 @@ def _is_iterable(x): return hasattr(x, 'next') or hasattr(x, '__next__') -def setup_train_data_feeder( - x, y, n_classes, batch_size=None, shuffle=True, epochs=None): +def setup_train_data_feeder(x, + y, + n_classes, + batch_size=None, + shuffle=True, + epochs=None): """Create data feeder, to sample inputs from dataset. If `x` and `y` are iterators, use `StreamingDataFeeder`. @@ -108,10 +114,13 @@ def setup_train_data_feeder( Args: x: numpy, pandas or Dask matrix or dictionary of aforementioned. Also supports iterables. - y: numpy, pandas or Dask array or dictionary of aforementioned. Also supports + y: numpy, pandas or Dask array or dictionary of aforementioned. Also + supports iterables. - n_classes: number of classes. Must be None or same type as y. In case, `y` is `dict` - (or iterable which returns dict) such that `n_classes[key] = n_classes for y[key]` + n_classes: number of classes. Must be None or the same type as `y`: if `y` + is a `dict` (or an iterable which returns dicts), `n_classes` must be a + `dict` such that `n_classes[key]` is the number of classes for `y[key]`. batch_size: size to split data into parts. Must be >= 1. shuffle: Whether to shuffle the inputs. epochs: Number of epochs to run.
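To make the `n_classes` contract above concrete when `y` is a dict, here is a small illustrative setup (hypothetical feature and label names; the final call is commented out because it needs the contrib import):

    import numpy as np

    x = {'pixels': np.random.rand(100, 8), 'meta': np.random.rand(100, 2)}
    y = {'coarse': np.random.randint(0, 2, 100),  # 2 classes
         'fine': np.random.randint(0, 10, 100)}   # 10 classes
    # When y is a dict, n_classes must be a dict keyed the same way:
    n_classes = {'coarse': 2, 'fine': 10}
    # feeder = setup_train_data_feeder(x, y, n_classes, batch_size=32)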
@@ -127,7 +136,7 @@ def setup_train_data_feeder( # pylint: disable=g-import-not-at-top import dask.dataframe as dd if (isinstance(x, (dd.Series, dd.DataFrame)) and - (y is None or isinstance(y, (dd.Series, dd.DataFrame)))): + (y is None or isinstance(y, (dd.Series, dd.DataFrame)))): data_feeder_cls = DaskDataFeeder else: data_feeder_cls = DataFeeder @@ -140,7 +149,7 @@ def setup_train_data_feeder( 'streaming learning to work.') return StreamingDataFeeder(x, y, n_classes, batch_size) return data_feeder_cls( - x, y, n_classes, batch_size, shuffle=shuffle, epochs=epochs) + x, y, n_classes, batch_size, shuffle=shuffle, epochs=epochs) def _batch_data(x, batch_size=None): @@ -150,7 +159,8 @@ def _batch_data(x, batch_size=None): x_first_el = six.next(x) x = itertools.chain([x_first_el], x) - chunk = dict([(k, []) for k in list(x_first_el.keys())]) if isinstance(x_first_el, dict) else [] + chunk = dict([(k, []) for k in list(x_first_el.keys())]) if isinstance( + x_first_el, dict) else [] chunk_filled = False for data in x: if isinstance(data, dict): @@ -161,7 +171,8 @@ def _batch_data(x, batch_size=None): chunk_filled = True if chunk_filled: yield chunk - chunk = dict([(k, []) for k in list(x_first_el.keys())]) if isinstance(x_first_el, dict) else [] + chunk = dict([(k, []) for k in list(x_first_el.keys())]) if isinstance( + x_first_el, dict) else [] chunk_filled = False else: chunk.append(data) @@ -259,16 +270,21 @@ def _access(data, iloc): def _check_dtype(dtype): if dtypes.as_dtype(dtype) == dtypes.float64: logging.warn( - 'float64 is not supported by many models, consider casting to float32.') + 'float64 is not supported by many models, consider casting to float32.') return dtype class DataFeeder(object): """Data feeder is an example class to sample data for TF trainer.""" - def __init__( - self, x, y, n_classes, batch_size=None, shuffle=True, random_state=None, - epochs=None): + def __init__(self, + x, + y, + n_classes, + batch_size=None, + shuffle=True, + random_state=None, + epochs=None): """Initializes a DataFeeder instance. Args: @@ -299,29 +315,33 @@ class DataFeeder(object): input_dtype: DType of input (or dictionary of shapes). output_dtype: DType of output (or dictionary of shapes. """ - x_is_dict, y_is_dict = isinstance(x, dict), y is not None and isinstance(y, dict) + x_is_dict, y_is_dict = isinstance(x, dict), y is not None and isinstance( + y, dict) if isinstance(y, list): y = np.array(y) - self._x = dict([(k, check_array(v, v.dtype)) for k, v in list(x.items())]) if x_is_dict else check_array(x, x.dtype) + self._x = dict([(k, check_array(v, v.dtype)) for k, v in list(x.items()) + ]) if x_is_dict else check_array(x, x.dtype) self._y = None if y is None else \ dict([(k, check_array(v, v.dtype)) for k, v in list(y.items())]) if x_is_dict else check_array(y, y.dtype) # self.n_classes is not None means we're converting raw target indices to one-hot. 
if n_classes is not None: if not y_is_dict: - y_dtype = (np.int64 if n_classes is not None and n_classes > 1 else np.float32) + y_dtype = (np.int64 + if n_classes is not None and n_classes > 1 else np.float32) self._y = (None if y is None else check_array(y, dtype=y_dtype)) self.n_classes = n_classes self.max_epochs = epochs - x_shape = dict([(k, v.shape) for k, v in list(self._x.items())]) if x_is_dict else self._x.shape - y_shape = dict( - [(k, v.shape) for k, v in list(self._y.items())]) if y_is_dict else None if y is None else self._y.shape + x_shape = dict([(k, v.shape) for k, v in list(self._x.items()) + ]) if x_is_dict else self._x.shape + y_shape = dict([(k, v.shape) for k, v in list(self._y.items()) + ]) if y_is_dict else None if y is None else self._y.shape self.input_shape, self.output_shape, self._batch_size = _get_in_out_shape( - x_shape, y_shape, n_classes, batch_size) + x_shape, y_shape, n_classes, batch_size) # Input dtype matches dtype of x. self._input_dtype = dict([(k, _check_dtype(v.dtype)) for k, v in list(self._x.items())]) if x_is_dict \ @@ -339,9 +359,10 @@ class DataFeeder(object): self._shuffle = shuffle self.random_state = np.random.RandomState( - 42) if random_state is None else random_state + 42) if random_state is None else random_state - num_samples = list(self._x.values())[0].shape[0] if x_is_dict else self._x.shape[0] + num_samples = list(self._x.values())[0].shape[ + 0] if x_is_dict else self._x.shape[0] if self._shuffle: self.indices = self.random_state.permutation(num_samples) else: @@ -380,8 +401,8 @@ class DataFeeder(object): Returns: The epoch placeholder. """ - self._epoch_placeholder = array_ops.placeholder(dtypes.int32, [1], - name='epoch') + self._epoch_placeholder = array_ops.placeholder( + dtypes.int32, [1], name='epoch') return self._epoch_placeholder def input_builder(self): @@ -398,19 +419,17 @@ class DataFeeder(object): placeholder = {} for key in list(shape.keys()): placeholder[key] = array_ops.placeholder( - dtypes.as_dtype(dtype[key]), - [None] + shape[key][1:], - name=name_prepend + '_' + key - ) + dtypes.as_dtype(dtype[key]), [None] + shape[key][1:], + name=name_prepend + '_' + key) else: placeholder = array_ops.placeholder( - dtypes.as_dtype(dtype), - [None] + shape[1:], - name=name_prepend) + dtypes.as_dtype(dtype), [None] + shape[1:], name=name_prepend) return placeholder - self._input_placeholder = get_placeholder(self.input_shape, self._input_dtype, 'input') - self._output_placeholder = get_placeholder(self.output_shape, self._output_dtype, 'output') + self._input_placeholder = get_placeholder(self.input_shape, + self._input_dtype, 'input') + self._output_placeholder = get_placeholder(self.output_shape, + self._output_dtype, 'output') return self._input_placeholder, self._output_placeholder def set_placeholders(self, input_placeholder, output_placeholder): @@ -432,9 +451,9 @@ class DataFeeder(object): A `dict` with data feed params while training. """ return { - 'epoch': self.epoch, - 'offset': self.offset, - 'batch_size': self._batch_size + 'epoch': self.epoch, + 'offset': self.offset, + 'batch_size': self._batch_size } def get_feed_dict_fn(self): @@ -444,12 +463,13 @@ class DataFeeder(object): A function that when called samples a random subset of batch size from `x` and `y`. """ - x_is_dict, y_is_dict = isinstance(self._x, dict), self._y is not None and isinstance(self._y, dict) + x_is_dict, y_is_dict = isinstance( + self._x, dict), self._y is not None and isinstance(self._y, dict) # Assign input features from random indices. 
def extract(data, indices): - return (np.array(_access(data, indices)).reshape((indices.shape[0], 1)) - if len(data.shape) == 1 else _access(data, indices)) + return (np.array(_access(data, indices)).reshape((indices.shape[0], 1)) if + len(data.shape) == 1 else _access(data, indices)) # assign labels from random indices def assign_label(data, shape, dtype, n_classes, indices): @@ -481,19 +501,22 @@ class DataFeeder(object): feed_dict[self._epoch_placeholder.name] = [self.epoch] # Take next batch of indices. - x_len = list(self._x.values())[0].shape[0] if x_is_dict else self._x.shape[0] + x_len = list(self._x.values())[0].shape[ + 0] if x_is_dict else self._x.shape[0] end = min(x_len, self.offset + self._batch_size) batch_indices = self.indices[self.offset:end] # adding input placeholder feed_dict.update( - dict([(self._input_placeholder[k].name, extract(v, batch_indices)) for k, v in list(self._x.items())]) - if x_is_dict else {self._input_placeholder.name: extract(self._x, batch_indices)}) + dict([(self._input_placeholder[k].name, extract(v, batch_indices)) + for k, v in list(self._x.items())]) if x_is_dict else + {self._input_placeholder.name: extract(self._x, batch_indices)}) # move offset and reset it if necessary self.offset += self._batch_size if self.offset >= x_len: - self.indices = self.random_state.permutation(x_len) if self._shuffle else np.array(range(x_len)) + self.indices = self.random_state.permutation( + x_len) if self._shuffle else np.array(range(x_len)) self.offset = 0 self.epoch += 1 @@ -504,15 +527,19 @@ class DataFeeder(object): # adding output placeholders if y_is_dict: for k, v in list(self._y.items()): - n_classes = ( - self.n_classes[k] if k in self.n_classes else None) if self.n_classes is not None else None + n_classes = (self.n_classes[k] if k in self.n_classes else + None) if self.n_classes is not None else None shape, dtype = self.output_shape[k], self._output_dtype[k] - feed_dict.update( - {self._output_placeholder[k].name: assign_label(v, shape, dtype, n_classes, batch_indices)}) + feed_dict.update({ + self._output_placeholder[k].name: + assign_label(v, shape, dtype, n_classes, batch_indices) + }) else: shape, dtype, n_classes = self.output_shape, self._output_dtype, self.n_classes - feed_dict.update( - {self._output_placeholder.name: assign_label(self._y, shape, dtype, n_classes, batch_indices)}) + feed_dict.update({ + self._output_placeholder.name: + assign_label(self._y, shape, dtype, n_classes, batch_indices) + }) return feed_dict @@ -566,41 +593,56 @@ class StreamingDataFeeder(DataFeeder): self._y = None self.n_classes = n_classes - x_is_dict, y_is_dict = isinstance(x_first_el, dict), y is not None and isinstance(y_first_el, dict) + x_is_dict = isinstance(x_first_el, dict) + y_is_dict = y is not None and isinstance(y_first_el, dict) if y_is_dict and n_classes is not None: - assert (isinstance(n_classes, dict)) + assert isinstance(n_classes, dict) # extract shapes for first_elements - x_first_el_shape = dict([(k, [1] + list(v.shape)) for k, v in list(x_first_el.items())]) if x_is_dict \ - else [1] + list(x_first_el.shape) + if x_is_dict: + x_first_el_shape = dict( + [(k, [1] + list(v.shape)) for k, v in list(x_first_el.items())]) + else: + x_first_el_shape = [1] + list(x_first_el.shape) - y_first_el_shape = dict([(k, [1] + list(v.shape)) for k, v in list(y_first_el.items())]) if y_is_dict \ - else ([1] + list(y_first_el[0].shape if isinstance(y_first_el, list) else y_first_el.shape) - if y is not None else None) + if y_is_dict: + y_first_el_shape = dict( 
+ [(k, [1] + list(v.shape)) for k, v in list(y_first_el.items())]) + elif y is None: + y_first_el_shape = None + else: + y_first_el_shape = ([1] + list(y_first_el[0].shape if isinstance( + y_first_el, list) else y_first_el.shape)) - self.input_shape, self.output_shape, self._batch_size = _get_in_out_shape(x_first_el_shape, y_first_el_shape, - n_classes, batch_size) + self.input_shape, self.output_shape, self._batch_size = _get_in_out_shape( + x_first_el_shape, y_first_el_shape, n_classes, batch_size) # Input dtype of x_first_el. - self._input_dtype = dict([(k, _check_dtype(v.dtype)) for k, v in list(x_first_el.items())]) if x_is_dict \ - else _check_dtype(x_first_el.dtype) + if x_is_dict: + self._input_dtype = dict( + [(k, _check_dtype(v.dtype)) for k, v in list(x_first_el.items())]) + else: + self._input_dtype = _check_dtype(x_first_el.dtype) # Output dtype of y_first_el. def check_y_dtype(el): - if isinstance(el, list) or isinstance(el, np.ndarray): - if isinstance(el, np.ndarray) and el.ndim == 0: - return el.dtype - else: - return _check_dtype(np.dtype(type(el[0]))) + if isinstance(el, np.ndarray): + return el.dtype + elif isinstance(el, list): + return check_y_dtype(el[0]) else: return _check_dtype(np.dtype(type(el))) # Output types are floats, due to both softmaxes and regression req. if n_classes is not None and (y is None or not y_is_dict) and n_classes > 0: self._output_dtype = np.float32 + elif y_is_dict: + self._output_dtype = dict( + [(k, check_y_dtype(v)) for k, v in list(y_first_el.items())]) + elif y is None: + self._output_dtype = None else: - self._output_dtype = dict([(k, check_y_dtype(v)) for k, v in list(y_first_el.items())]) if y_is_dict \ - else (check_y_dtype(y_first_el) if y is not None else None) + self._output_dtype = check_y_dtype(y_first_el) def get_feed_params(self): """Function returns a `dict` with data feed params while training. 
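The restructured constructor above derives placeholder shapes from the first element of each iterator by prepending a batch dimension of 1. The same rule as a standalone sketch (plain numpy, illustrative only):

    import numpy as np

    def first_el_shape(el):
        # Mirrors the dict/array branches above: prepend a batch dim of 1.
        if isinstance(el, dict):
            return dict([(k, [1] + list(v.shape)) for k, v in el.items()])
        return [1] + list(el.shape)

    assert first_el_shape(np.zeros((4,))) == [1, 4]
    assert first_el_shape({'a': np.zeros((5, 2)), 'b': np.zeros(3)}) == {
        'a': [1, 5, 2], 'b': [1, 3]}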
@@ -627,13 +669,17 @@ class StreamingDataFeeder(DataFeeder): """ def init_array(shape, dtype): + """Initialize array of given shape or dict of shapes and dtype.""" if shape is None: return None + elif isinstance(shape, dict): + return dict([(k, np.zeros(shape[k], dtype[k])) + for k in list(shape.keys())]) else: - return dict([(k, np.zeros(shape[k], dtype[k])) for k in list(shape.keys())]) if isinstance(shape, dict) else \ - np.zeros(shape, dtype=dtype) + return np.zeros(shape, dtype=dtype) def put_data_array(dest, index, source=None, n_classes=None): + """Puts data array into container.""" if source is None: dest = dest[:index] elif n_classes is not None and n_classes > 1: @@ -650,12 +696,13 @@ class StreamingDataFeeder(DataFeeder): return dest def put_data_array_or_dict(holder, index, data=None, n_classes=None): + """Puts data array or data dictionary into container.""" if holder is None: return None if isinstance(holder, dict): if data is None: data = {k: None for k in holder.keys()} - assert (isinstance(data, dict)) + assert isinstance(data, dict) for k in holder.keys(): num_classes = n_classes[k] if (n_classes is not None and k in n_classes) else None @@ -688,12 +735,18 @@ class StreamingDataFeeder(DataFeeder): out = put_data_array_or_dict(out, i, next_out, self.n_classes) # creating feed_dict - feed_dict = dict([(self._input_placeholder[k].name, inp[k]) for k in list(self._input_placeholder.keys())]) if \ - isinstance(inp, dict) else {self._input_placeholder.name: inp} + if isinstance(inp, dict): + feed_dict = dict([(self._input_placeholder[k].name, inp[k]) + for k in list(self._input_placeholder.keys())]) + else: + feed_dict = {self._input_placeholder.name: inp} if self._y is not None: - feed_dict.update( - dict([(self._output_placeholder[k].name, out[k]) for k in list(self._output_placeholder.keys())]) \ - if isinstance(out, dict) else {self._output_placeholder.name: out}) + if isinstance(out, dict): + feed_dict.update( + dict([(self._output_placeholder[k].name, out[k]) + for k in list(self._output_placeholder.keys())])) + else: + feed_dict.update({self._output_placeholder.name: out}) return feed_dict @@ -708,8 +761,14 @@ class DaskDataFeeder(object): memory and still do random seeks for sampling of batches. """ - def __init__(self, x, y, n_classes, batch_size, shuffle=True, - random_state=None, epochs=None): + def __init__(self, + x, + y, + n_classes, + batch_size, + shuffle=True, + random_state=None, + epochs=None): """Initializes a DaskDataFeeder instance. Args: @@ -732,10 +791,14 @@ class DaskDataFeeder(object): output_shape: shape of the output. input_dtype: dtype of input. output_dtype: dtype of output. + + Raises: + ValueError: if `x` or `y` are `dict`, as they are not supported currently. 
""" if isinstance(x, dict) or isinstance(y, dict): - raise ValueError("DaskDataFeeder does not support dictionaries at the moment.") + raise ValueError( + 'DaskDataFeeder does not support dictionaries at the moment.') # pylint: disable=invalid-name,super-init-not-called import dask.dataframe as dd # pylint: disable=g-import-not-at-top @@ -763,7 +826,7 @@ class DaskDataFeeder(object): self._shuffle = shuffle self.epochs = epochs self.input_shape, self.output_shape, self._batch_size = _get_in_out_shape( - x_shape, y_shape, n_classes, batch_size) + x_shape, y_shape, n_classes, batch_size) self.sample_fraction = self._batch_size / float(x_count) self._input_dtype = _check_dtype(self._x.dtypes[0]) self._output_dtype = _check_dtype(self._y.dtypes[self._y_columns]) @@ -797,8 +860,8 @@ class DaskDataFeeder(object): # TODO(ipolosukhin): option for with/without replacement (dev version of # dask) sample = self.df.random_split( - [self.sample_fraction, 1 - self.sample_fraction], - random_state=self.random_state) + [self.sample_fraction, 1 - self.sample_fraction], + random_state=self.random_state) inp = extract_pandas_matrix(sample[0][self._x_columns].compute()).tolist() out = extract_pandas_matrix(sample[0][self._y_columns].compute()) # convert to correct dtype @@ -811,7 +874,6 @@ class DaskDataFeeder(object): out_max = self._y.max().compute().values[0] encoded_out = np.zeros((out.size, out_max + 1), dtype=self._output_dtype) encoded_out[np.arange(out.size), out] = 1 - return {input_placeholder.name: inp, - output_placeholder.name: encoded_out} + return {input_placeholder.name: inp, output_placeholder.name: encoded_out} return _feed_dict_fn diff --git a/tensorflow/contrib/learn/python/learn/learn_io/data_feeder_test.py b/tensorflow/contrib/learn/python/learn/learn_io/data_feeder_test.py index f6e5a3973b0..7f5711ac1b5 100644 --- a/tensorflow/contrib/learn/python/learn/learn_io/data_feeder_test.py +++ b/tensorflow/contrib/learn/python/learn/learn_io/data_feeder_test.py @@ -253,20 +253,20 @@ class DataFeederTest(test.TestCase): inp, out = df.input_builder() feed_dict_fn = df.get_feed_dict_fn() feed_dict = feed_dict_fn() - self._assertAllClose(inp, [[1, 2], [3, 4]], feed_dict, 'name') - self._assertAllClose(out, [1, 2], feed_dict, 'name') + self._assertAllClose(inp, [[[1, 2]], [[3, 4]]], feed_dict, 'name') + self._assertAllClose(out, [[[1], [2]], [[2], [2]]], feed_dict, 'name') def x_iter(wrap_dict=False): - yield np.array([1, 2]) if not wrap_dict else self._wrap_dict( - np.array([1, 2]), 'in') - yield np.array([3, 4]) if not wrap_dict else self._wrap_dict( - np.array([3, 4]), 'in') + yield np.array([[1, 2]]) if not wrap_dict else self._wrap_dict( + np.array([[1, 2]]), 'in') + yield np.array([[3, 4]]) if not wrap_dict else self._wrap_dict( + np.array([[3, 4]]), 'in') def y_iter(wrap_dict=False): - yield np.array([1]) if not wrap_dict else self._wrap_dict( - np.array([1]), 'out') - yield np.array([2]) if not wrap_dict else self._wrap_dict( - np.array([2]), 'out') + yield np.array([[1], [2]]) if not wrap_dict else self._wrap_dict( + np.array([[1], [2]]), 'out') + yield np.array([[2], [2]]) if not wrap_dict else self._wrap_dict( + np.array([[2], [2]]), 'out') func( data_feeder.StreamingDataFeeder( diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_test_util.py b/tensorflow/contrib/linalg/python/ops/linear_operator_test_util.py index 9579b8fabd1..5de9bb5d775 100644 --- a/tensorflow/contrib/linalg/python/ops/linear_operator_test_util.py +++ 
b/tensorflow/contrib/linalg/python/ops/linear_operator_test_util.py @@ -25,6 +25,7 @@ import six from tensorflow.contrib.framework import tensor_util as contrib_tensor_util from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops +from tensorflow.python.framework import random_seed from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops from tensorflow.python.ops import linalg_ops @@ -139,10 +140,11 @@ class LinearOperatorDerivedClassTest(test.TestCase): def test_to_dense(self): self._maybe_skip("to_dense") - with self.test_session() as sess: - for use_placeholder in False, True: - for shape in self._shapes_to_test: - for dtype in self._dtypes_to_test: + for use_placeholder in False, True: + for shape in self._shapes_to_test: + for dtype in self._dtypes_to_test: + with self.test_session(graph=ops.Graph()) as sess: + sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED operator, mat, feed_dict = self._operator_and_mat_and_feed_dict( shape, dtype, use_placeholder=use_placeholder) op_dense = operator.to_dense() @@ -153,14 +155,15 @@ class LinearOperatorDerivedClassTest(test.TestCase): def test_det(self): self._maybe_skip("det") - with self.test_session() as sess: - for use_placeholder in False, True: - for shape in self._shapes_to_test: - for dtype in self._dtypes_to_test: - if dtype.is_complex: - self.skipTest( - "tf.matrix_determinant does not work with complex, so this " - "test is being skipped.") + for use_placeholder in False, True: + for shape in self._shapes_to_test: + for dtype in self._dtypes_to_test: + if dtype.is_complex: + self.skipTest( + "tf.matrix_determinant does not work with complex, so this " + "test is being skipped.") + with self.test_session(graph=ops.Graph()) as sess: + sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED operator, mat, feed_dict = self._operator_and_mat_and_feed_dict( shape, dtype, use_placeholder=use_placeholder) op_det = operator.determinant() @@ -173,11 +176,12 @@ class LinearOperatorDerivedClassTest(test.TestCase): def test_apply(self): self._maybe_skip("apply") - with self.test_session() as sess: - for use_placeholder in False, True: - for shape in self._shapes_to_test: - for dtype in self._dtypes_to_test: - for adjoint in False, True: + for use_placeholder in False, True: + for shape in self._shapes_to_test: + for dtype in self._dtypes_to_test: + for adjoint in False, True: + with self.test_session(graph=ops.Graph()) as sess: + sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED operator, mat, feed_dict = self._operator_and_mat_and_feed_dict( shape, dtype, use_placeholder=use_placeholder) x = self._make_x(operator, adjoint=adjoint) @@ -191,11 +195,12 @@ class LinearOperatorDerivedClassTest(test.TestCase): def test_solve(self): self._maybe_skip("solve") - with self.test_session() as sess: - for use_placeholder in False, True: - for shape in self._shapes_to_test: - for dtype in self._dtypes_to_test: - for adjoint in False, True: + for use_placeholder in False, True: + for shape in self._shapes_to_test: + for dtype in self._dtypes_to_test: + for adjoint in False, True: + with self.test_session(graph=ops.Graph()) as sess: + sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED operator, mat, feed_dict = self._operator_and_mat_and_feed_dict( shape, dtype, use_placeholder=use_placeholder) rhs = self._make_rhs(operator, adjoint=adjoint) @@ -209,10 +214,11 @@ class LinearOperatorDerivedClassTest(test.TestCase): def test_add_to_tensor(self): self._maybe_skip("add_to_tensor") - with 
self.test_session() as sess: - for use_placeholder in False, True: - for shape in self._shapes_to_test: - for dtype in self._dtypes_to_test: + for use_placeholder in False, True: + for shape in self._shapes_to_test: + for dtype in self._dtypes_to_test: + with self.test_session(graph=ops.Graph()) as sess: + sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED operator, mat, feed_dict = self._operator_and_mat_and_feed_dict( shape, dtype, use_placeholder=use_placeholder) op_plus_2mat = operator.add_to_tensor(2 * mat) diff --git a/tensorflow/contrib/makefile/Makefile b/tensorflow/contrib/makefile/Makefile index 2bf246bdf91..97d963ede2d 100644 --- a/tensorflow/contrib/makefile/Makefile +++ b/tensorflow/contrib/makefile/Makefile @@ -21,7 +21,7 @@ echo "false") # Hexagon integration ifdef HEXAGON_LIBS - LIBGEMM_WRAPPER := $(HEXAGON_LIBS)/libgemm_wrapper.so + LIBGEMM_WRAPPER := $(HEXAGON_LIBS)/libhexagon_controller.so ifeq ($(shell test -f $(LIBGEMM_WRAPPER) 2> /dev/null; echo $$?), 0) $(info "Use hexagon libs at " $(LIBGEMM_WRAPPER)) else @@ -271,7 +271,7 @@ ifeq ($(TARGET),ANDROID) ifdef HEXAGON_LIBS INCLUDES += -I$(HEXAGON_INCLUDE) - LIBS += -lgemm_wrapper + LIBS += -lhexagon_controller LDFLAGS += -L$(HEXAGON_LIBS) CXXFLAGS += -DUSE_HEXAGON_LIBS endif diff --git a/tensorflow/contrib/makefile/build_all_android.sh b/tensorflow/contrib/makefile/build_all_android.sh index c826431dea8..3d80f0fd2dd 100755 --- a/tensorflow/contrib/makefile/build_all_android.sh +++ b/tensorflow/contrib/makefile/build_all_android.sh @@ -22,21 +22,32 @@ usage() { echo "-s [sub_makefiles] sub makefiles separated by white space" echo "-t [build_target] build target for Android makefile [default=all]" echo "-T only build tensorflow" - echo "-x use hexagon library located at ../hexagon/" + echo "-x use hexagon library located at tensorflow/contrib/makefile/downloads/hexagon" + echo "-X download hexagon deps and run hexagon_graph_execution" exit 1 } +download_and_push() { + URL="$1" + LOCAL_DEST="$2" + ANDROID_DEST="$3" + curl -Ls "${URL}" -o "${LOCAL_DEST}" + adb shell mkdir -p "${ANDROID_DEST}" + adb push "${LOCAL_DEST}" "${ANDROID_DEST}" +} + if [[ -z "${NDK_ROOT}" ]]; then echo "NDK_ROOT should be set as an environment variable" 1>&2 exit 1 fi -while getopts "s:t:Tx" opt_name; do +while getopts "s:t:TxX" opt_name; do case "$opt_name" in s) SUB_MAKEFILES="${OPTARG}";; t) BUILD_TARGET="${OPTARG}";; T) ONLY_MAKE_TENSORFLOW="true";; x) USE_HEXAGON="true";; + X) DOWNLOAD_AND_USE_HEXAGON="true";; *) usage;; esac done @@ -49,6 +60,8 @@ cd ${SCRIPT_DIR}/../../../ source "${SCRIPT_DIR}/build_helper.subr" JOB_COUNT="${JOB_COUNT:-$(get_job_count)}" +HEXAGON_DOWNLOAD_PATH="tensorflow/contrib/makefile/downloads/hexagon" + if [[ "${ONLY_MAKE_TENSORFLOW}" != "true" ]]; then # Remove any old files first. 
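The download_and_push helper added to build_all_android.sh above performs three steps: fetch the file, create the destination directory on the device, and push the file over adb. A Python rendering of the same flow, assuming adb is on PATH and a device is attached; the URL and paths in the commented example mirror one of the hexagon downloads driven by the new -X flag:

```python
import subprocess
import urllib.request

def download_and_push(url, local_dest, android_dest):
    """Fetch a file over HTTP(S) and stage it on an Android device via adb."""
    urllib.request.urlretrieve(url, local_dest)                       # curl -Ls
    subprocess.check_call(['adb', 'shell', 'mkdir', '-p', android_dest])
    subprocess.check_call(['adb', 'push', local_dest, android_dest])

# Mirrors one of the downloads performed when -X is given:
# download_and_push(
#     'https://storage.googleapis.com/download.tensorflow.org/deps/hexagon/'
#     'libhexagon_controller.so',
#     'libhexagon_controller.so',
#     '/data/local/tmp')
```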
make -f tensorflow/contrib/makefile/Makefile clean @@ -63,10 +76,30 @@ else make -f tensorflow/contrib/makefile/Makefile clean_except_protobuf_libs fi +if [[ "${DOWNLOAD_AND_USE_HEXAGON}" == "true" ]]; then + URL_BASE="https://storage.googleapis.com/download.tensorflow.org" + + rm -rf "${HEXAGON_DOWNLOAD_PATH}" + mkdir -p "${HEXAGON_DOWNLOAD_PATH}/libs" + + download_and_push "${URL_BASE}/deps/hexagon/libhexagon_controller.so" \ +"${HEXAGON_DOWNLOAD_PATH}/libs/libhexagon_controller.so" "/data/local/tmp" + + download_and_push "${URL_BASE}/deps/hexagon/libhexagon_nn_skel.so" \ +"${HEXAGON_DOWNLOAD_PATH}/libs/libhexagon_nn_skel.so" "/vendor/lib/rfsa/adsp" + + download_and_push "${URL_BASE}/example_images/img_299x299.jpg" \ +"${HEXAGON_DOWNLOAD_PATH}/img_299x299.jpg" "/data/local/tmp" + + USE_HEXAGON="true" + SUB_MAKEFILES="$(pwd)/tensorflow/contrib/makefile/sub_makefiles/hexagon_graph_execution/Makefile.in" + BUILD_TARGET="hexagon_graph_execution" +fi + if [[ "${USE_HEXAGON}" == "true" ]]; then - HEXAGON_PARENT_DIR=$(cd ../hexagon && pwd) + HEXAGON_PARENT_DIR=$(cd "${HEXAGON_DOWNLOAD_PATH}" && pwd) HEXAGON_LIBS="${HEXAGON_PARENT_DIR}/libs" - HEXAGON_INCLUDE=$(cd tensorflow/core/platform/hexagon && pwd) + HEXAGON_INCLUDE=$(cd "tensorflow/core/platform/hexagon" && pwd) fi if [[ -z "${BUILD_TARGET}" ]]; then @@ -80,3 +113,14 @@ else HEXAGON_LIBS="${HEXAGON_LIBS}" HEXAGON_INCLUDE="${HEXAGON_INCLUDE}" \ SUB_MAKEFILES="${SUB_MAKEFILES}" "${BUILD_TARGET}" fi + +if [[ "${DOWNLOAD_AND_USE_HEXAGON}" == "true" ]]; then + ANDROID_EXEC_FILE_MODE=755 + echo "Run hexagon_graph_execution" + adb push -p "./tensorflow/contrib/makefile/gen/bin/hexagon_graph_execution" "/data/local/tmp/" + adb wait-for-device + adb shell chmod "${ANDROID_EXEC_FILE_MODE}" "/data/local/tmp/hexagon_graph_execution" + adb wait-for-device + adb shell 'LD_LIBRARY_PATH=/data/local/tmp:$LD_LIBRARY_PATH' \ + "/data/local/tmp/hexagon_graph_execution" +fi diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py index 35efaf14d1b..3e2e408e6f9 100644 --- a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py +++ b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py @@ -4486,7 +4486,7 @@ class StreamingMeanIOUTest(test.TestCase): num_classes) sess.run(variables.local_variables_initializer()) confusion_matrix = update_op.eval() - self.assertAllEqual([[3, 2], [0, 5]], confusion_matrix) + self.assertAllEqual([[3, 0], [2, 5]], confusion_matrix) desired_miou = np.mean([3. / 5., 5. / 7.]) self.assertAlmostEqual(desired_miou, miou.eval()) @@ -4509,7 +4509,7 @@ class StreamingMeanIOUTest(test.TestCase): miou, update_op = metrics.streaming_mean_iou(predictions, labels, num_classes) sess.run(variables.local_variables_initializer()) - self.assertAllEqual([[0, 40], [0, 0]], update_op.eval()) + self.assertAllEqual([[0, 0], [40, 0]], update_op.eval()) self.assertEqual(0., miou.eval()) def testResultsWithSomeMissing(self): @@ -4540,7 +4540,7 @@ class StreamingMeanIOUTest(test.TestCase): miou, update_op = metrics.streaming_mean_iou( predictions, labels, num_classes, weights=weights) sess.run(variables.local_variables_initializer()) - self.assertAllEqual([[2, 2], [0, 4]], update_op.eval()) + self.assertAllEqual([[2, 0], [2, 4]], update_op.eval()) desired_miou = np.mean([2. / 4., 4. 
/ 6.]) self.assertAlmostEqual(desired_miou, miou.eval()) diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index e81d0349b60..d3ffd692b28 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -84,12 +84,14 @@ load( "//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library", "tf_proto_library_cc", + "tf_additional_core_deps", + "tf_additional_lib_defines", + "tf_additional_lib_deps", "tf_additional_lib_hdrs", "tf_additional_lib_srcs", "tf_additional_minimal_lib_srcs", "tf_additional_proto_hdrs", "tf_additional_proto_srcs", - "tf_additional_lib_deps", "tf_additional_stream_executor_srcs", "tf_additional_cupti_wrapper_deps", "tf_additional_libdevice_data", @@ -1127,12 +1129,13 @@ cc_library( "platform/tracing.h", ], copts = tf_copts(), + defines = tf_additional_lib_defines(), linkopts = ["-ldl"], - deps = [ + deps = tf_additional_lib_deps() + [ ":lib_proto_parsing", ":protos_all_cc", - "//tensorflow/core/platform/default/build_config:platformlib", "//third_party/eigen3", + "//tensorflow/core/platform/default/build_config:platformlib", "@zlib_archive//:zlib", ], ) @@ -1352,7 +1355,7 @@ tf_cuda_library( ":protos_all_cc", "//third_party/eigen3", "//tensorflow/core/kernels:required", - ] + tf_additional_lib_deps(), + ] + tf_additional_core_deps(), alwayslink = 1, ) diff --git a/tensorflow/core/common_runtime/gpu/gpu_tracer.cc b/tensorflow/core/common_runtime/gpu/gpu_tracer.cc index ee93b19d291..981a6549889 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_tracer.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_tracer.cc @@ -215,7 +215,7 @@ Status CUPTIManager::DisableTrace() { void CUPTIManager::InternalBufferRequested(uint8_t **buffer, size_t *size, size_t *maxNumRecords) { VLOG(2) << "BufferRequested"; - void *p = port::aligned_malloc(kBufferSize, kBufferAlignment); + void *p = port::AlignedMalloc(kBufferSize, kBufferAlignment); *size = kBufferSize; *buffer = reinterpret_cast(p); *maxNumRecords = 0; @@ -246,7 +246,7 @@ void CUPTIManager::InternalBufferCompleted(CUcontext ctx, uint32_t streamId, LOG(WARNING) << "Dropped " << dropped << " activity records"; } } - port::aligned_free(buffer); + port::AlignedFree(buffer); } CUPTIManager *GetCUPTIManager() { diff --git a/tensorflow/core/common_runtime/gpu/pool_allocator.h b/tensorflow/core/common_runtime/gpu/pool_allocator.h index 5842758f0e3..91ce830df85 100644 --- a/tensorflow/core/common_runtime/gpu/pool_allocator.h +++ b/tensorflow/core/common_runtime/gpu/pool_allocator.h @@ -171,9 +171,9 @@ class BasicCPUAllocator : public SubAllocator { ~BasicCPUAllocator() override {} void* Alloc(size_t alignment, size_t num_bytes) override { - return port::aligned_malloc(num_bytes, alignment); + return port::AlignedMalloc(num_bytes, alignment); } - void Free(void* ptr, size_t num_bytes) override { port::aligned_free(ptr); } + void Free(void* ptr, size_t num_bytes) override { port::AlignedFree(ptr); } }; // Allocator for pinned CPU RAM that is made known to CUDA for the diff --git a/tensorflow/core/debug/BUILD b/tensorflow/core/debug/BUILD index 3e4ab5bc179..5a7e7bb7e56 100644 --- a/tensorflow/core/debug/BUILD +++ b/tensorflow/core/debug/BUILD @@ -1,8 +1,12 @@ # Description: # TensorFlow Debugger (tfdbg). # -# Public Android targets: -# filegroup ":android_srcs" - Debugger source files for Android. +# Public target(s): +# +# ":debug" - Depending on this target causes a concrete implementation of +# DebuggerState to be constructed at initialization time, enabling +# TensorFlow Debugger (tfdbg) support. 
For details, please see +# core/common_runtime/debugger_state_interface.h. package( default_visibility = ["//tensorflow:internal"], @@ -39,14 +43,12 @@ tf_proto_library_cc( protodeps = ["//tensorflow/core:protos_all"], ) -# Depending on this target causes a concrete DebuggerState implementation -# to be registered at initialization time. For details, please see -# core/common_runtime/debugger_state_interface.h. cc_library( name = "debug", srcs = ["debug.cc"], copts = tf_copts(), linkstatic = 1, + visibility = ["//visibility:public"], deps = [ ":debug_graph_utils", "//tensorflow/core:core_cpu_internal", diff --git a/tensorflow/core/distributed_runtime/rpc/BUILD b/tensorflow/core/distributed_runtime/rpc/BUILD index 89710a4654c..8ab8712c8cc 100644 --- a/tensorflow/core/distributed_runtime/rpc/BUILD +++ b/tensorflow/core/distributed_runtime/rpc/BUILD @@ -275,6 +275,7 @@ cc_library( "//tensorflow/core/distributed_runtime:server_lib", "//tensorflow/core/distributed_runtime:worker_env", "@grpc//:grpc++_unsecure", + "@grpc//:grpc_unsecure", ], alwayslink = 1, ) diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc index addf09672ab..99309a98cab 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc @@ -21,6 +21,7 @@ limitations under the License. #include "grpc++/grpc++.h" #include "grpc++/security/credentials.h" #include "grpc++/server_builder.h" +#include "grpc/support/alloc.h" #include "tensorflow/core/common_runtime/device_factory.h" #include "tensorflow/core/common_runtime/device_mgr.h" @@ -41,6 +42,7 @@ limitations under the License. #include "tensorflow/core/framework/op.h" #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/mem.h" #include "tensorflow/core/public/session_options.h" namespace tensorflow { @@ -304,6 +306,11 @@ class GrpcServerFactory : public ServerFactory { class GrpcServerRegistrar { public: GrpcServerRegistrar() { + gpr_allocation_functions alloc_fns; + alloc_fns.malloc_fn = port::Malloc; + alloc_fns.realloc_fn = port::Realloc; + alloc_fns.free_fn = port::Free; + gpr_set_allocation_functions(alloc_fns); ServerFactory::Register("GRPC_SERVER", new GrpcServerFactory()); } }; diff --git a/tensorflow/core/framework/allocator.cc b/tensorflow/core/framework/allocator.cc index 601d87fa554..812ce4bfe7e 100644 --- a/tensorflow/core/framework/allocator.cc +++ b/tensorflow/core/framework/allocator.cc @@ -68,7 +68,7 @@ class CPUAllocator : public Allocator { string Name() override { return "cpu"; } void* AllocateRaw(size_t alignment, size_t num_bytes) override { - void* p = port::aligned_malloc(num_bytes, alignment); + void* p = port::AlignedMalloc(num_bytes, alignment); if (cpu_allocator_collect_stats) { const std::size_t alloc_size = port::MallocExtension_GetAllocatedSize(p); mutex_lock l(mu_); @@ -89,7 +89,7 @@ class CPUAllocator : public Allocator { mutex_lock l(mu_); stats_.bytes_in_use -= alloc_size; } - port::aligned_free(ptr); + port::AlignedFree(ptr); } void GetStats(AllocatorStats* stats) override { diff --git a/tensorflow/core/framework/function.cc b/tensorflow/core/framework/function.cc index 9e5bfc4d6e9..5959bce9e76 100644 --- a/tensorflow/core/framework/function.cc +++ b/tensorflow/core/framework/function.cc @@ -211,43 +211,6 @@ Status AddRetName(NameInfoIndex* name_info, const string& ret, return Status::OK(); } -Status 
BuildNodeOutputIndex(const FunctionDef::Node& node, - const InstantiateAttrValueMap& attrs, - GetFunctionSignature get_function, - const int arg_index, NameInfoIndex* name_info) { - const OpDef* node_sig = nullptr; - TF_RETURN_IF_ERROR(get_function(node.op(), &node_sig)); - if (node_sig->output_arg_size() == 0) { - // This node produces no output. - if (node.ret_size() != 1) { - return errors::InvalidArgument("Expect one ret name."); - } - return AddRetName(name_info, node.ret(0), {false, arg_index, 0, false, {}}); - } - const int num_retval = node_sig->output_arg_size(); - if (num_retval != node.ret_size()) { - return errors::InvalidArgument("Malformed function node (#ret): ", - num_retval, " vs. ", node.ret_size()); - } - int start = 0; - bool is_type_list; - DataTypeVector dtypes; - for (int i = 0; i < num_retval; ++i) { - TF_RETURN_IF_ERROR( - ArgNumType(attrs, node_sig->output_arg(i), &is_type_list, &dtypes)); - TF_RETURN_IF_ERROR( - AddRetName(name_info, node.ret(i), - {false, arg_index, start, is_type_list, dtypes})); - for (int j = 0; j < static_cast(dtypes.size()); ++j) { - TF_RETURN_IF_ERROR( - AddRetName(name_info, strings::StrCat(node.ret(i), ":", j), - {false, arg_index, start + j, false, {dtypes[j]}})); - } - start += dtypes.size(); - } - return Status::OK(); -} - Status BuildNodeOutputIndex(const NodeDef& node, const InstantiateAttrValueMap& attrs, GetFunctionSignature get_function, @@ -280,85 +243,6 @@ Status BuildNodeOutputIndex(const NodeDef& node, return Status::OK(); } -Status InstantiateNode(const FunctionDef::Node& fnode, - const InstantiateAttrValueMap& attrs, - GetFunctionSignature get_function, - const NameInfoIndex& name_info, GraphDef* gdef) { - const OpDef* fnode_sig = nullptr; - TF_CHECK_OK(get_function(fnode.op(), &fnode_sig)); - NodeDef* gnode = gdef->add_node(); - gnode->set_name(Name(gdef->node_size() - 1)); - gnode->set_op(fnode.op()); - - // Input - const int num_args = fnode_sig->input_arg_size(); - bool is_type_list; - DataTypeVector dtypes; - int fnode_arg_index = 0; - for (int i = 0; i < num_args; ++i) { - TF_RETURN_IF_ERROR( - ArgNumType(attrs, fnode_sig->input_arg(i), &is_type_list, &dtypes)); - if (!is_type_list) { - const NameInfoItem* item = - gtl::FindOrNull(name_info, fnode.arg(fnode_arg_index)); - if (item == nullptr) { - return errors::InvalidArgument("arg[", i, "] is not found: ", - ProtoShortDebugString(fnode)); - } - if (dtypes != item->dtypes) { - return errors::InvalidArgument("Invalid arg(", i, - ") for function arg: ", - DataTypeSliceString(dtypes), " vs. ", - DataTypeSliceString(item->dtypes), "."); - } - for (size_t j = 0; j < dtypes.size(); ++j) { - if (item->is_func_arg) { - gnode->add_input(Name(item->nid + j)); - } else { - gnode->add_input(Name(item->nid, item->idx + j)); - } - } - ++fnode_arg_index; - } else { - for (size_t j = 0; j < dtypes.size(); ++j) { - const NameInfoItem* item = - gtl::FindOrNull(name_info, fnode.arg(fnode_arg_index + j)); - if (item == nullptr) { - return errors::InvalidArgument("arg[", i + j, "] is not found: ", - ProtoShortDebugString(fnode)); - } - if (item->dtypes.size() != 1 || (item->dtypes[0] != dtypes[j])) { - return errors::InvalidArgument( - "Invalid typelist arg(", i + j, ") for function arg: ", - DataTypeSliceString(dtypes), " vs. ", - DataTypeSliceString(item->dtypes), "."); - } - if (item->is_func_arg) { - gnode->add_input(Name(item->nid)); - } else { - gnode->add_input(Name(item->nid, item->idx)); - } - } - fnode_arg_index += dtypes.size(); - } - } - // Control deps. 
- for (int i = 0; i < fnode.dep_size(); ++i) { - const NameInfoItem* item = gtl::FindOrNull(name_info, fnode.dep(i)); - if (item == nullptr) { - return errors::InvalidArgument("dep[", i, "] is not found."); - } - gnode->add_input(Dep(item->nid)); - } - - // Attrs. - for (const auto& p : attrs) { - (*gnode->mutable_attr())[p.first] = p.second; - } - - return Status::OK(); -} - Status InstantiateNode(const NodeDef& fnode, const InstantiateAttrValueMap& attrs, GetFunctionSignature get_function, @@ -448,38 +332,6 @@ Status InstantiateNode(const NodeDef& fnode, return Status::OK(); } -// FunctionDef::Node version -Status AddReturnNode(const OpDef::ArgDef& ret_def, - const InstantiateAttrValueMap& attrs, - const NameInfoIndex& name_info, int* ret_index, - InstantiationResult* result) { - bool is_type_list; - DataTypeVector dtypes; - TF_RETURN_IF_ERROR(ArgNumType(attrs, ret_def, &is_type_list, &dtypes)); - CHECK_GE(dtypes.size(), size_t{1}); - const NameInfoItem* item = gtl::FindOrNull(name_info, ret_def.name()); - if (item == nullptr) { - return errors::InvalidArgument("ret is not found."); - } - if (dtypes != item->dtypes) { - return errors::InvalidArgument("Invalid ret types ", ret_def.name(), " : ", - DataTypeVectorString(dtypes), " vs. ", - DataTypeVectorString(item->dtypes)); - } - GraphDef* gdef = &result->gdef; - for (size_t i = 0; i < dtypes.size(); ++i) { - NodeDef* gnode = gdef->add_node(); - gnode->set_name(Name(gdef->node_size() - 1)); - gnode->set_op("_Retval"); - gnode->add_input(Name(item->nid, item->idx + i)); - AddAttr("T", dtypes[i], gnode); - AddAttr("index", (*ret_index)++, gnode); - result->ret_types.push_back(dtypes[i]); - } - return Status::OK(); -} - -// NodeDef version Status AddReturnNode(const OpDef::ArgDef& ret_def, const InstantiateAttrValueMap& attrs, const ::tensorflow::protobuf::Map& ret_map, @@ -561,38 +413,6 @@ string Print(const AttrValue& attr_value) { return SummarizeAttrValue(attr_value); } -string Print(const FunctionDef::Node& node) { - string out; - for (int i = 0; i < node.ret_size(); ++i) { - const auto& name = node.ret(i); - if (i > 0) strings::StrAppend(&out, ", "); - strings::StrAppend(&out, name); - } - strings::StrAppend(&out, " = ", node.op()); - if (node.attr_size() > 0) { - std::vector entries; - for (auto p : node.attr()) { - entries.push_back(strings::StrCat(p.first, "=", Print(p.second))); - } - sort(entries.begin(), entries.end()); - strings::StrAppend(&out, "[", str_util::Join(entries, ", "), "]"); - } - strings::StrAppend(&out, "("); - for (int i = 0; i < node.arg_size(); ++i) { - if (i > 0) strings::StrAppend(&out, ", "); - strings::StrAppend(&out, node.arg(i)); - } - strings::StrAppend(&out, ")"); - if (node.dep_size() > 0) { - strings::StrAppend(&out, " @ "); - for (int i = 0; i < node.dep_size(); ++i) { - if (i > 0) strings::StrAppend(&out, ", "); - strings::StrAppend(&out, node.dep(i)); - } - } - return out; -} - // TODO(josh11b): Merge this with SummarizeNodeDef(). string Print(const NodeDef& n) { string out; @@ -650,17 +470,11 @@ string Print(const FunctionDef& fdef) { strings::StrAppend(&out, Print(sig.output_arg(i))); } strings::StrAppend(&out, ") {\n"); - if (fdef.node_def_size() > 0 || fdef.ret_size() > 0) { - for (const auto& n : fdef.node_def()) { - strings::StrAppend(&out, " ", Print(n), "\n"); - } - for (const auto& r : fdef.ret()) { - strings::StrAppend(&out, " return ", r.first, " = ", r.second, "\n"); - } - } else { // TODO(josh11b): Eventually remove this case. 
- for (const auto& n : fdef.node()) { - strings::StrAppend(&out, " ", Print(n), "\n"); - } + for (const auto& n : fdef.node_def()) { + strings::StrAppend(&out, " ", Print(n), "\n"); + } + for (const auto& r : fdef.ret()) { + strings::StrAppend(&out, " return ", r.first, " = ", r.second, "\n"); } strings::StrAppend(&out, "}\n"); return out; @@ -772,92 +586,47 @@ Status InstantiateFunction(const FunctionDef& fdef, // Makes a copy of all attrs in fdef and substitutes placeholders. // After this step, every attr is bound to a concrete value. std::vector node_attrs; - if (fdef.node_def_size() > 0 || fdef.ret_size() > 0) { - node_attrs.resize(fdef.node_def_size()); - for (int i = 0; i < fdef.node_def_size(); ++i) { - for (auto attr : fdef.node_def(i).attr()) { - if (!SubstitutePlaceholders(substitute, &attr.second)) { - return errors::InvalidArgument("Failed to bind all placeholders in ", - SummarizeAttrValue(attr.second)); - } - if (!node_attrs[i].insert(attr).second) { - return errors::Internal("Somehow duplicated: ", attr.first); - } + node_attrs.resize(fdef.node_def_size()); + for (int i = 0; i < fdef.node_def_size(); ++i) { + for (auto attr : fdef.node_def(i).attr()) { + if (!SubstitutePlaceholders(substitute, &attr.second)) { + return errors::InvalidArgument("Failed to bind all placeholders in ", + SummarizeAttrValue(attr.second)); + } + if (!node_attrs[i].insert(attr).second) { + return errors::Internal("Somehow duplicated: ", attr.first); } - TF_RETURN_IF_ERROR( - AddDefaultAttrs(fdef.node_def(i).op(), get_function, &node_attrs[i])); } + TF_RETURN_IF_ERROR( + AddDefaultAttrs(fdef.node_def(i).op(), get_function, &node_attrs[i])); + } - for (int i = 0; i < fdef.node_def_size(); ++i) { - s = BuildNodeOutputIndex(fdef.node_def(i), node_attrs[i], get_function, - gdef->node_size() + i, &name_info); - if (!s.ok()) { - errors::AppendToMessage(&s, "In ", SummarizeNodeDef(fdef.node_def(i))); - return s; - } + for (int i = 0; i < fdef.node_def_size(); ++i) { + s = BuildNodeOutputIndex(fdef.node_def(i), node_attrs[i], get_function, + gdef->node_size() + i, &name_info); + if (!s.ok()) { + errors::AppendToMessage(&s, "In ", SummarizeNodeDef(fdef.node_def(i))); + return s; } - // Emits one gdef.node for each fdef.node_def. - for (int i = 0; i < fdef.node_def_size(); ++i) { - s = InstantiateNode(fdef.node_def(i), node_attrs[i], get_function, - name_info, gdef); - if (!s.ok()) { - errors::AppendToMessage(&s, "In ", SummarizeNodeDef(fdef.node_def(i))); - return s; - } + } + // Emits one gdef.node for each fdef.node_def. + for (int i = 0; i < fdef.node_def_size(); ++i) { + s = InstantiateNode(fdef.node_def(i), node_attrs[i], get_function, + name_info, gdef); + if (!s.ok()) { + errors::AppendToMessage(&s, "In ", SummarizeNodeDef(fdef.node_def(i))); + return s; } + } - // Emits nodes for the function's return values. - int ret_index = 0; - for (const OpDef::ArgDef& ret_def : sig.output_arg()) { - s = AddReturnNode(ret_def, attr_values, fdef.ret(), name_info, &ret_index, - result); - if (!s.ok()) { - errors::AppendToMessage(&s, "In function output ", Print(ret_def)); - return s; - } - } - } else { // TODO(josh11b): Eventually remove this case. 
- node_attrs.resize(fdef.node_size()); - for (int i = 0; i < fdef.node_size(); ++i) { - for (auto attr : fdef.node(i).attr()) { - if (!SubstitutePlaceholders(substitute, &attr.second)) { - return errors::InvalidArgument("Failed to bind all placeholders in ", - SummarizeAttrValue(attr.second)); - } - if (!node_attrs[i].insert(attr).second) { - return errors::Internal("Somehow duplicated: ", attr.first); - } - } - TF_RETURN_IF_ERROR( - AddDefaultAttrs(fdef.node(i).op(), get_function, &node_attrs[i])); - } - - for (int i = 0; i < fdef.node_size(); ++i) { - s = BuildNodeOutputIndex(fdef.node(i), node_attrs[i], get_function, - gdef->node_size() + i, &name_info); - if (!s.ok()) { - errors::AppendToMessage(&s, "In ", Print(fdef.node(i))); - return s; - } - } - // Emits one gdef.node for each fdef.node. - for (int i = 0; i < fdef.node_size(); ++i) { - s = InstantiateNode(fdef.node(i), node_attrs[i], get_function, name_info, - gdef); - if (!s.ok()) { - errors::AppendToMessage(&s, "In ", Print(fdef.node(i))); - return s; - } - } - - // Emits nodes for the function's return values. - int ret_index = 0; - for (const OpDef::ArgDef& ret_def : sig.output_arg()) { - s = AddReturnNode(ret_def, attr_values, name_info, &ret_index, result); - if (!s.ok()) { - errors::AppendToMessage(&s, "In function output ", Print(ret_def)); - return s; - } + // Emits nodes for the function's return values. + int ret_index = 0; + for (const OpDef::ArgDef& ret_def : sig.output_arg()) { + s = AddReturnNode(ret_def, attr_values, fdef.ret(), name_info, &ret_index, + result); + if (!s.ok()) { + errors::AppendToMessage(&s, "In function output ", Print(ret_def)); + return s; } } diff --git a/tensorflow/core/framework/function.proto b/tensorflow/core/framework/function.proto index 5a394d64809..bd01e86da3a 100644 --- a/tensorflow/core/framework/function.proto +++ b/tensorflow/core/framework/function.proto @@ -30,61 +30,7 @@ message FunctionDef { // Attributes specific to this function definition. map attr = 5; - // TO BE REPLACED - - // The body of the function. - repeated Node node = 2; // function.node.ret[*] are unique. - - // A node is a multi-value assignment: - // (ret[0], ret[1], ...) = func(arg[0], arg[1], ...) - // - // By convention, "func" is resolved by consulting with a user-defined - // library first. If not resolved, "func" is assumed to be a builtin op. - message Node { - // This node produces multiple outputs. They are named ret[0], - // ret[1], ..., etc. - // - // REQUIRES: function.node.ret[*] are unique across all nodes. - // REQUIRES: ret.size == func/op def's number of output args. - repeated string ret = 1; - - // The op/function name. - string op = 2; - - // Arguments passed to this func/op. - // - // arg[i] must be either one of - // function.signature.input_args[*].name or one of - // function.node[*].ret[*]. - // - // REQUIRES: arg.size == func/op def's number of input args. - repeated string arg = 3; - - // Control dependencies. - // - // dep[i] must be one of function.node[*].ret[*] or one of - // function.signature.input_args[*].name. - repeated string dep = 4; - - // Attrs. - // - // 'attr' maps names defined by 'func's attr defs to attr values. - // attr values may have placeholders which are substituted - // recursively by concrete values when this node is instantiated. - // These placeholders must name an attr listed in the FunctionDef's - // signature. 
- map attr = 5; - } - - // WILL REPLACE THE ABOVE - - // If node_def is present, and the consumer is at GraphDef version - // >= 12, then these fields are used and `node` is ignored. If the - // consumer's GraphDef version is < 12 or this field is empty, then - // `node` is used. This allows producers to fill both fields to - // remain compatible with old consumers. At some future GraphDef - // version, `node` will be ignored even if `node_def` is empty. - // TODO(josh11b): Finish this transition. + // NOTE: field id 2 deleted on Jan 11, 2016, GraphDef version 21. // In both of the following fields, there is the need to specify an // output that is used as either the input to another node (in @@ -120,6 +66,10 @@ message FunctionDef { // The body of the function. Unlike the NodeDefs in a GraphDef, attrs // may have values of type `placeholder` and the `input` field uses // the "output" format above. + + // By convention, "op" in node_def is resolved by consulting with a + // user-defined library first. If not resolved, "func" is assumed to + // be a builtin op. repeated NodeDef node_def = 3; // A mapping from the output arg names from `signature` to the diff --git a/tensorflow/core/framework/function_test.cc b/tensorflow/core/framework/function_test.cc index a768e18b014..8bce215a9af 100644 --- a/tensorflow/core/framework/function_test.cc +++ b/tensorflow/core/framework/function_test.cc @@ -48,52 +48,8 @@ y: A scalar in type T. static InstantiateAttrValueMap kNoAttrs; -TEST(TFunc, SquarePlusOneOld) { - auto fdef = FDH::Define( // Create a FunctionDef using Function::Nodes. - // Name - "SquarePlusOne", - // Args - {"x: T"}, - // Return values - {"y: T"}, - // Attrs - {"T: {float, double, int32, int64}"}, - // Nodes - {// a = Square(x) - {{"a"}, "Square", {"x"}, {{"T", "$T"}}}, - // o = One() - // NOTE: We can also have a Cast(x) instead. - {{"o"}, "One", {}, {{"T", "$T"}}}, - // y = Add(a, o) - {{"y"}, "Add", {"a", "o"}, {{"T", "$T"}}}}); - - const char* e = R"P( -SquarePlusOne[T:{float, double, int32, int64}](x:T) -> (y:T) { - a = Square[T=$T](x) - o = One[T=$T]() - y = Add[T=$T](a:y:0, o:y:0) - return y = y:z:0 -} -)P"; - EXPECT_EQ(DebugString(fdef), e); - - // Instantiate one with T=float - InstantiationResult result; - TF_ASSERT_OK(InstantiateFunction(fdef, {{"T", DT_FLOAT}}, GetOpSig, &result)); - const char* e2 = R"P( -(n0:float) -> (n3:float) { - n1 = Square[T=float](n0) - n2 = One[T=float]() - n3 = Add[T=float](n1, n2) -} -)P"; - EXPECT_EQ(result.arg_types, DataTypeVector({DT_FLOAT})); - EXPECT_EQ(result.ret_types, DataTypeVector({DT_FLOAT})); - EXPECT_EQ(DebugString(result.gdef), e2); -} - -TEST(TFunc, SquarePlusOneNodeDef) { - auto fdef = FDH::Create( // Create a FunctionDef using NodeDefs. +TEST(TFunc, SquarePlusOne) { + auto fdef = FDH::Create( // Name "SquarePlusOne", // Inputs @@ -138,8 +94,8 @@ SquarePlusOne[T:{float, double, int32, int64}](x:T) -> (y:T) { EXPECT_EQ(DebugString(result.gdef), e2); } -TEST(TFunc, ControlDepNodeDef) { - auto fdef = FDH::Create( // Create a FunctionDef using NodeDefs. +TEST(TFunc, ControlDep) { + auto fdef = FDH::Create( // Name "ControlDep", // Inputs @@ -190,44 +146,8 @@ REGISTER_OP("HasDefaultType") // This verifies that a function using an op before a type attr (with // a default) is added, still works. This is important for backwards // compatibilty. -TEST(TFunc, MissingTypeAttrOld) { - auto fdef = FDH::Define( // Create a FunctionDef using Function::Nodes. 
- // Name - "BackCompat", - // Args - {}, - // Return values - {"y: float"}, - // Attrs - {}, - // Nodes - {// y = HasDefaultType(x), T missing, defaults to float - {{"y"}, "HasDefaultType", {}, {}}}); - - const char* e = R"P( -BackCompat() -> (y:float) { - y = HasDefaultType() - return y = y:out:0 -} -)P"; - EXPECT_EQ(DebugString(fdef), e); - - InstantiationResult result; - TF_ASSERT_OK( - InstantiateFunction(fdef, InstantiateAttrValueMap{}, GetOpSig, &result)); - // Should get T=float from Op's default. - const char* e2 = R"P( -() -> (n0:float) { - n0 = HasDefaultType[T=float]() -} -)P"; - EXPECT_EQ(result.arg_types, DataTypeVector()); - EXPECT_EQ(result.ret_types, DataTypeVector({DT_FLOAT})); - EXPECT_EQ(DebugString(result.gdef), e2); -} - -TEST(TFunc, MissingTypeAttrNodeDef) { - auto fdef = FDH::Create( // Create a FunctionDef using NodeDefs. +TEST(TFunc, MissingTypeAttr) { + auto fdef = FDH::Create( // Name "BackCompat", // Args @@ -264,11 +184,8 @@ BackCompat() -> (y:float) { EXPECT_EQ(DebugString(result.gdef), e2); } -TEST(TFunc, NTimesTNodeDef) { - // Note that the equivalent FunctionDef using FunctionDef::Node requires - // using a _ListToArray to package up the two inputs to AddN as a single - // N*T edge. - auto fdef = FDH::Create( // Create a FunctionDef using NodeDefs. +TEST(TFunc, NTimesT) { + auto fdef = FDH::Create( // Name "NTimesT", // Inputs @@ -790,8 +707,8 @@ TEST(InstantiateErrors, TypeList_Missing_Arg) { "input unknown is not found"); } -TEST(InstantiateErrors, NodeDef_TooManyInputs) { - auto fdef = FDH::Create( // Create a FunctionDef using NodeDefs. +TEST(InstantiateErrors, TooManyInputs) { + auto fdef = FDH::Create( // Name "TooManyInputs", // Inputs @@ -811,8 +728,8 @@ TEST(InstantiateErrors, NodeDef_TooManyInputs) { "Expected input[2] == 'x' to be a control input."); } -TEST(InstantiateErrors, NodeDef_TooFewInputs) { - auto fdef = FDH::Create( // Create a FunctionDef using NodeDefs. +TEST(InstantiateErrors, TooFewInputs) { + auto fdef = FDH::Create( // Name "TooFewInputs", // Inputs @@ -832,8 +749,8 @@ TEST(InstantiateErrors, NodeDef_TooFewInputs) { "Attempt to access beyond input size: 2 >= 2"); } -TEST(InstantiateErrors, NodeDef_TooManyInputsFromArray1) { - auto fdef = FDH::Create( // Create a FunctionDef using NodeDefs. +TEST(InstantiateErrors, TooManyInputsFromArray1) { + auto fdef = FDH::Create( // Name "TooManyInputsFromArray", // Inputs @@ -860,8 +777,8 @@ TEST(InstantiateErrors, NodeDef_TooManyInputsFromArray1) { "Expected input[1] == 'y' to be a control input."); } -TEST(InstantiateErrors, NodeDef_TooManyInputsFromArray2) { - auto fdef = FDH::Create( // Create a FunctionDef using NodeDefs. +TEST(InstantiateErrors, TooManyInputsFromArray2) { + auto fdef = FDH::Create( // Name "TooManyInputsFromArray", // Inputs @@ -888,8 +805,8 @@ TEST(InstantiateErrors, NodeDef_TooManyInputsFromArray2) { "Input a:output too long for inputs"); } -TEST(InstantiateErrors, NodeDef_TypeMismatch) { - auto fdef = FDH::Create( // Create a FunctionDef using NodeDefs. 
+TEST(InstantiateErrors, TypeMismatch) { + auto fdef = FDH::Create( // Name "TypeMismatch", // Inputs diff --git a/tensorflow/core/framework/graph_def_util.cc b/tensorflow/core/framework/graph_def_util.cc index 58fb8cf611b..b76ab40b683 100644 --- a/tensorflow/core/framework/graph_def_util.cc +++ b/tensorflow/core/framework/graph_def_util.cc @@ -178,14 +178,8 @@ void OpsUsedByGraph(const GraphDef& graph_def, while (!functions_to_process.empty()) { const FunctionDef* fun = functions_to_process.back(); functions_to_process.pop_back(); - if (fun->node_def_size() > 0) { - for (const auto& node : fun->node_def()) { - mark_op_as_used(node.op()); - } - } else { // TODO(josh11b): Eventually drop support for this. - for (const auto& node : fun->node()) { - mark_op_as_used(node.op()); - } + for (const auto& node : fun->node_def()) { + mark_op_as_used(node.op()); } } diff --git a/tensorflow/core/framework/load_library.cc b/tensorflow/core/framework/load_library.cc index f56e5fae1bc..f8253353008 100644 --- a/tensorflow/core/framework/load_library.cc +++ b/tensorflow/core/framework/load_library.cc @@ -20,6 +20,7 @@ limitations under the License. #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/mem.h" namespace tensorflow { @@ -91,7 +92,7 @@ Status LoadLibrary(const char* library_filename, void** result, } string str; library.op_list.SerializeToString(&str); - char* str_buf = reinterpret_cast(malloc(str.length())); + char* str_buf = reinterpret_cast(port::Malloc(str.length())); memcpy(str_buf, str.data(), str.length()); *buf = str_buf; *len = str.length(); diff --git a/tensorflow/core/framework/node_def_util.cc b/tensorflow/core/framework/node_def_util.cc index 5ff8aea02c4..568346a71d3 100644 --- a/tensorflow/core/framework/node_def_util.cc +++ b/tensorflow/core/framework/node_def_util.cc @@ -185,6 +185,17 @@ Status GetNodeAttr(const AttrSlice& attrs, StringPiece attr_name, return Status::OK(); } +Status GetNodeAttr(const AttrSlice& attrs, StringPiece attr_name, + std::vector* value) { + const AttrValue* attr_value; + TF_RETURN_IF_ERROR(attrs.Find(attr_name, &attr_value)); + TF_RETURN_IF_ERROR(AttrValueHasType(*attr_value, "list(func)")); + for (const auto& v : attr_value->list().func()) { + value->emplace_back(v); + } + return Status::OK(); +} + namespace { // Helper for InOutTypesForNode(). Status AddArgToSig(const NodeDef& node_def, const OpDef::ArgDef& arg_def, diff --git a/tensorflow/core/framework/node_def_util.h b/tensorflow/core/framework/node_def_util.h index 85b83c4d74b..5c4d2272682 100644 --- a/tensorflow/core/framework/node_def_util.h +++ b/tensorflow/core/framework/node_def_util.h @@ -150,6 +150,9 @@ Status GetNodeAttr(const AttrSlice& attrs, StringPiece attr_name, Status GetNodeAttr(const AttrSlice& attrs, StringPiece attr_name, const NameAttrList** value); // type: "func" +Status GetNodeAttr(const AttrSlice& attrs, StringPiece attr_name, + std::vector* value); // type: "list(func)" + // Computes the input and output types for a specific node. 
 // REQUIRES: ValidateOpDef(op_def).ok()
 Status InOutTypesForNode(const NodeDef& node_def, const OpDef& op_def,
diff --git a/tensorflow/core/framework/tracking_allocator_test.cc b/tensorflow/core/framework/tracking_allocator_test.cc
index 98134392ef7..850cdc39099 100644
--- a/tensorflow/core/framework/tracking_allocator_test.cc
+++ b/tensorflow/core/framework/tracking_allocator_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -27,7 +28,7 @@ class TestableSizeTrackingAllocator : public Allocator {
  public:
   string Name() override { return "test"; }
   void* AllocateRaw(size_t /*alignment*/, size_t num_bytes) override {
-    void* ptr = malloc(num_bytes);
+    void* ptr = port::Malloc(num_bytes);
     size_map_[ptr] = num_bytes;
     return ptr;
   }
@@ -35,7 +36,7 @@ class TestableSizeTrackingAllocator : public Allocator {
     const auto& iter = size_map_.find(ptr);
     EXPECT_NE(size_map_.end(), iter);
     size_map_.erase(iter);
-    free(ptr);
+    port::Free(ptr);
   }
   bool TracksAllocationSizes() override { return true; }
   size_t RequestedSize(void* ptr) override {
diff --git a/tensorflow/core/graph/testlib.cc b/tensorflow/core/graph/testlib.cc
index 13c6a2146bc..ef4dd047875 100644
--- a/tensorflow/core/graph/testlib.cc
+++ b/tensorflow/core/graph/testlib.cc
@@ -254,6 +254,10 @@ Node* Identity(Graph* g, Node* input, int index) {
 
 Node* Add(Graph* g, Node* in0, Node* in1) { return Binary(g, "Add", in0, in1); }
 
+Node* Reverse(Graph* g, Node* tensor, Node* axis) {
+  return Binary(g, "ReverseV2", tensor, axis);
+}
+
 Node* Error(Graph* g, Node* input, const string& errmsg) {
   Node* ret;
   TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Error")
diff --git a/tensorflow/core/graph/testlib.h b/tensorflow/core/graph/testlib.h
index 7b4425bfeb9..7a23b20c2c8 100644
--- a/tensorflow/core/graph/testlib.h
+++ b/tensorflow/core/graph/testlib.h
@@ -100,6 +100,9 @@ Node* Multi(Graph* g, const string& func, gtl::ArraySlice<Node*> ins);
 // Adds a binary add node in "g" doing in0 + in1.
 Node* Add(Graph* g, Node* in0, Node* in1);
 
+// Reverses dimensions of <tensor> according to <axis>.
+Node* Reverse(Graph* g, Node* tensor, Node* axis);
+
 // Generates random unit uniform distribution of the input shape.
 Node* RandomUniform(Graph* g, Node* input, DataType dtype);
 
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index b117b84c6d8..1b5f5292d3a 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -256,6 +256,15 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "stage_op",
+    srcs = ["stage_op.cc"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+)
+
 cc_library(
     name = "queue_base",
     srcs = ["queue_base.cc"],
@@ -1161,6 +1170,7 @@ cc_library(
         ":session_ops",
         ":sparse_conditional_accumulator_op",
         ":stack_ops",
+        ":stage_op",
         ":tensor_array_ops",
     ],
 )
@@ -3228,6 +3238,7 @@ tf_kernel_library(
     prefix = "training_ops",
     deps = [
         ":bounds_check",
+        ":variable_ops",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:training_ops_op_lib",
diff --git a/tensorflow/core/kernels/conv_ops.h b/tensorflow/core/kernels/conv_ops.h
index 897afe77966..60091fc27fd 100644
--- a/tensorflow/core/kernels/conv_ops.h
+++ b/tensorflow/core/kernels/conv_ops.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/util/tensor_format.h"
 
 #if GOOGLE_CUDA
@@ -44,9 +45,9 @@ class LaunchConv2DOp {
 template <typename T>
 struct Im2ColBufferResource : public ResourceBase {
   Im2ColBufferResource() {
-    data = static_cast<T*>(malloc(size * sizeof(T)));
+    data = static_cast<T*>(port::Malloc(size * sizeof(T)));
   }
-  ~Im2ColBufferResource() { free(data); }
+  ~Im2ColBufferResource() { port::Free(data); }
   // This mutex ensures that only a single operation at a time is able to use
   // the buffer memory held by this resource.
   mutex mu;
diff --git a/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc b/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc
index 1648a54f2fa..ecebd3c599c 100644
--- a/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc
+++ b/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc
@@ -26,9 +26,9 @@ limitations under the License.
 namespace tensorflow {
 
 const bool SHOW_DBG_IN_SOC = false;
-const bool DBG_DUMP_RESULT = false;
 const bool DBG_USE_DUMMY_INPUT = false;
 const bool DBG_USE_SAMPLE_INPUT = false;
+const bool DBG_SHOW_RESULT = false;
 const int64 FLAG_ENABLE_PANDA_BINARY_INPUT = 0x01;
 
 #ifdef USE_HEXAGON_LIBS
@@ -169,7 +169,7 @@ bool HexagonControlWrapper::SetupGraph(
   return soc_interface_ConstructGraph();
 
   // Keep following comment to use dummy graph construction
-  // return soc_interface_SetupGraphDummy(3 /* inception version */);
+  // return soc_interface_setupDummyGraph(3 /* inception version */);
 }
 
 bool HexagonControlWrapper::ExecuteGraph() {
@@ -213,7 +213,7 @@ bool HexagonControlWrapper::ReadOutputNode(
     // TODO: Accept all results
     std::get<2>(output) = DT_FLOAT;
     outputs->emplace_back(output);
-    if (DBG_DUMP_RESULT) {
+    if (DBG_SHOW_RESULT) {
       const int byte_size = std::get<1>(output);
       const int element_count = byte_size / sizeof(float);
       const float* float_array =
           reinterpret_cast<const float*>(std::get<0>(output));
diff --git a/tensorflow/core/kernels/reverse_op.cc b/tensorflow/core/kernels/reverse_op.cc
index f644fa02ed3..7852499965c 100644
--- a/tensorflow/core/kernels/reverse_op.cc
+++ b/tensorflow/core/kernels/reverse_op.cc
@@ -27,23 +27,83 @@ limitations under the License.
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/util/work_sharder.h"
 
 namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 
+namespace {
+
+// Reverse rows (middle dimension) of a three dimensional tensor.
+// NUM_CHANNELS can be <= 0 to compute it dynamically from <input>.
+// Otherwise, it must equal input.dim_size(2) and is used as a compile-time
+// constant.
+template <int NUM_CHANNELS>
+void ReverseRows(OpKernelContext* context, const Tensor& input,
+                 Tensor* result) {
+  auto work = [&input, result](int64 start, int64 end) {
+    const int64 inner_size =
+        NUM_CHANNELS > 0 ? NUM_CHANNELS : input.dim_size(2);
+    const int64 middle_size = input.dim_size(1);
+    const int64 row_size = inner_size * middle_size;
+    DCHECK_EQ(input.dim_size(2), inner_size);
+
+    const int32* in_ptr = input.bit_casted_tensor<int32, 3>().data();
+    int32* out_ptr = result->bit_casted_tensor<int32, 3>().data();
+
+    in_ptr += start * row_size;
+    out_ptr += start * row_size;
+
+    for (int outer_dim = start; outer_dim < end; ++outer_dim) {
+      out_ptr += row_size;
+      int remaining = middle_size;
+      while (remaining > 0) {
+        out_ptr -= inner_size;
+        memcpy(out_ptr, in_ptr, inner_size * sizeof(float));
+        in_ptr += inner_size;
+        --remaining;
+      }
+
+      out_ptr += row_size;
+    }
+  };
+
+  // Shard across outer dimension.
+  const int64 N = input.dim_size(0);
+  const int64 cost_per_unit = input.NumElements() / N;
+  auto worker_threads = context->device()->tensorflow_cpu_worker_threads();
+  Shard(worker_threads->num_threads, worker_threads->workers, N, cost_per_unit,
+        std::move(work));
+}
+
+}  // namespace
+
 template <typename Device, typename T, int NDIMS>
 void HandleReverseCase(OpKernelContext* context,
                        typename TTypes<bool, 1>::ConstTensor dims,
                        Tensor* result) {
+  const Tensor& input = context->input(0);
+
+  // Use optimized reverse if possible.
+  if (NDIMS == 3 && std::is_same<Device, CPUDevice>::value &&
+      std::is_same<T, float>::value && (!dims(0) && dims(1) && !dims(2))) {
+    if (input.dim_size(2) == 3) {
+      ReverseRows<3>(context, input, result);
+    } else {
+      ReverseRows<-1>(context, input, result);
+    }
+    return;
+  }
+
   typename Eigen::array<bool, NDIMS> axes_di;
   for (int i = 0; i < NDIMS; i++) {
     axes_di[i] = dims(i);
   }
   functor::Reverse<Device, T, NDIMS>()(context->eigen_device<Device>(),
-                                       context->input(0).tensor<T, NDIMS>(),
-                                       axes_di, result->tensor<T, NDIMS>());
+                                       input.tensor<T, NDIMS>(), axes_di,
+                                       result->tensor<T, NDIMS>());
 }
 
 template <typename Device, typename T>
@@ -105,13 +165,26 @@ class ReverseOp : public OpKernel {
 template <typename Device, typename T, int NDIMS>
 void HandleReverseV2Case(OpKernelContext* context,
                          const gtl::ArraySlice<bool>& axes, Tensor* result) {
+  const Tensor& input = context->input(0);
+
+  // Use optimized reverse if possible.
+  if (NDIMS == 3 && std::is_same<Device, CPUDevice>::value &&
+      std::is_same<T, float>::value && (!axes[0] && axes[1] && !axes[2])) {
+    if (input.dim_size(2) == 3) {
+      ReverseRows<3>(context, input, result);
+    } else {
+      ReverseRows<-1>(context, input, result);
+    }
+    return;
+  }
+
   typename Eigen::array<bool, NDIMS> axes_di;
   for (int i = 0; i < NDIMS; i++) {
     axes_di[i] = axes[i];
   }
   functor::Reverse<Device, T, NDIMS>()(context->eigen_device<Device>(),
-                                       context->input(0).tensor<T, NDIMS>(),
-                                       axes_di, result->tensor<T, NDIMS>());
+                                       input.tensor<T, NDIMS>(), axes_di,
+                                       result->tensor<T, NDIMS>());
 }
 
 template <typename Device, typename T>
@@ -158,6 +231,11 @@ class ReverseV2Op : public OpKernel {
     OP_REQUIRES_OK(context,
                    context->allocate_output(0, input.shape(), &output));
 
+// TODO(cwhipkey): we can do dimension folding to reduce, e.g., a reverse of
+// a single dimension to the dims=3 or dims=2 case, regardless of the number
+// of dimensions in the tensor. This would let some ops use faster
+// lower-dimension code (and use optimized versions).
+
 #define HANDLE_REVERSE(NDIMS)                                           \
   case NDIMS:                                                           \
     HandleReverseV2Case<Device, T, NDIMS>(context, axes_dense, output); \
diff --git a/tensorflow/core/kernels/reverse_op_test.cc b/tensorflow/core/kernels/reverse_op_test.cc
index ee59adf9dab..19e25b887d7 100644
--- a/tensorflow/core/kernels/reverse_op_test.cc
+++ b/tensorflow/core/kernels/reverse_op_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
#include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" #include "tensorflow/core/framework/allocator.h" #include "tensorflow/core/framework/fake_input.h" #include "tensorflow/core/framework/graph.pb.h" @@ -31,6 +32,7 @@ limitations under the License. #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" namespace tensorflow { namespace { @@ -109,5 +111,104 @@ TEST_F(ReverseOpTest, Reverse_1234) { test::ExpectTensorEqual(expected, *params_tensor); } +static SessionOptions GetOptions(int intra_threads) { + SessionOptions opts; + opts.config.set_intra_op_parallelism_threads(intra_threads); + opts.config.set_inter_op_parallelism_threads(1); + return opts; +} + +// Creates a Graph which "reduce"s a 3D float tensor of "num" elements +// into a scalar. +static Graph* Reverse(TensorShape shape, int reverse_axis) { + Graph* g = new Graph(OpRegistry::Global()); + Tensor data(DT_FLOAT, shape); + data.flat().setRandom(); + Tensor axes(DT_INT32, TensorShape({1})); + axes.flat()(0) = reverse_axis; + test::graph::Reverse(g, test::graph::Constant(g, data), + test::graph::Constant(g, axes)); + return g; +} + +static void RunReverseRowsBenchmark(int iters, int outer_dim, int middle_dim, + int intra_threads, int channels) { + SessionOptions opts = GetOptions(intra_threads); + TensorShape shape{outer_dim, middle_dim, channels}; + const int64 num_items = static_cast(iters) * shape.num_elements(); + testing::ItemsProcessed(num_items); + testing::BytesProcessed(num_items * sizeof(float)); + testing::UseRealTime(); + test::Benchmark("cpu", Reverse(shape, 1), &opts).Run(iters); +} + +static void BM_ReverseRowsOf1Channel_1T(int iters, int outer_dim, + int middle_dim) { + RunReverseRowsBenchmark(iters, outer_dim, middle_dim, 1 /* intra_threads */, + 1 /* channels */); +} + +BENCHMARK(BM_ReverseRowsOf1Channel_1T) + ->ArgPair(288, 288) + ->ArgPair(1024, 1024) + ->ArgPair(10 * 1024, 1024); + +static void BM_ReverseRowsOf1Channel_4T(int iters, int outer_dim, + int middle_dim) { + RunReverseRowsBenchmark(iters, outer_dim, middle_dim, 4 /* intra_threads */, + 1 /* channels */); +} + +BENCHMARK(BM_ReverseRowsOf1Channel_4T) + ->ArgPair(288, 288) + ->ArgPair(1024, 1024) + ->ArgPair(10 * 1024, 1024); + +static void BM_ReverseRowsOf3Channels_1T(int iters, int outer_dim, + int middle_dim) { + RunReverseRowsBenchmark(iters, outer_dim, middle_dim, 1 /* intra_threads */, + 3 /* channels */); +} + +BENCHMARK(BM_ReverseRowsOf3Channels_1T) + ->ArgPair(288, 288) + ->ArgPair(224, 224) + ->ArgPair(1024, 1024) + ->ArgPair(10 * 1024, 1024); + +static void BM_ReverseRowsOf3Channels_4T(int iters, int outer_dim, + int middle_dim) { + RunReverseRowsBenchmark(iters, outer_dim, middle_dim, 4 /* intra_threads */, + 3 /* channels */); +} + +BENCHMARK(BM_ReverseRowsOf3Channels_4T) + ->ArgPair(288, 288) + ->ArgPair(224, 224) + ->ArgPair(1024, 1024) + ->ArgPair(10 * 1024, 1024); + +static void BM_ReverseRowsOf4Channels_1T(int iters, int outer_dim, + int middle_dim) { + RunReverseRowsBenchmark(iters, outer_dim, middle_dim, 1 /* intra_threads */, + 4 /* channels */); +} + +BENCHMARK(BM_ReverseRowsOf4Channels_1T) + ->ArgPair(288, 288) + ->ArgPair(1024, 1024) + ->ArgPair(10 * 1024, 1024); + +static void BM_ReverseRowsOf4Channels_4T(int iters, int outer_dim, + int middle_dim) { + 
RunReverseRowsBenchmark(iters, outer_dim, middle_dim, 4 /* intra_threads */, + 4 /* channels */); +} + +BENCHMARK(BM_ReverseRowsOf4Channels_4T) + ->ArgPair(288, 288) + ->ArgPair(1024, 1024) + ->ArgPair(10 * 1024, 1024); + } // namespace } // namespace tensorflow diff --git a/tensorflow/core/kernels/stage_op.cc b/tensorflow/core/kernels/stage_op.cc new file mode 100644 index 00000000000..34db850013d --- /dev/null +++ b/tensorflow/core/kernels/stage_op.cc @@ -0,0 +1,130 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/mutex.h" + +namespace tensorflow { + +namespace { + +class Buffer : public ResourceBase { + public: + explicit Buffer() {} + + typedef std::vector Tuple; + + // the Buffer takes ownership of the Tuple + void Put(Tuple* tuple) { + mutex_lock l(mu_); + buf_.push_back(std::move(*tuple)); + non_empty_cond_var_.notify_one(); // maybe possible to optimize by reducing + // how often this signal is sent + } + + void Get(Tuple* tuple) { // TODO(zhifengc): Support cancellation. + mutex_lock l(mu_); + while (buf_.empty()) { + non_empty_cond_var_.wait(l); + } + + *tuple = std::move(buf_.front()); + buf_.pop_front(); + } + + string DebugString() { + mutex_lock l(mu_); + return strings::StrCat("Staging size: ", buf_.size()); + } + + private: + mutex mu_; + condition_variable non_empty_cond_var_; + std::deque buf_ GUARDED_BY(mu_); +}; + +Status CreateBuffer(Buffer** ret) { + *ret = new Buffer; + return Status::OK(); +} + +Status GetBuffer(OpKernelContext* ctx, const NodeDef& ndef, Buffer** buf) { + auto rm = ctx->resource_manager(); + ContainerInfo cinfo; + TF_RETURN_IF_ERROR(cinfo.Init(rm, ndef, true /* use name() */)); + TF_RETURN_IF_ERROR(rm->LookupOrCreate(cinfo.container(), cinfo.name(), + buf, CreateBuffer)); + return Status::OK(); +} + +} // namespace + +class StageOp : public OpKernel { + public: + explicit StageOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + Buffer* buf = nullptr; + OP_REQUIRES_OK(ctx, GetBuffer(ctx, def(), &buf)); + core::ScopedUnref scope(buf); + Buffer::Tuple tuple; + for (int i = 0; i < ctx->num_inputs(); ++i) { + tuple.push_back(ctx->input(i)); + } + buf->Put(&tuple); + } +}; + +REGISTER_KERNEL_BUILDER(Name("Stage").Device(DEVICE_CPU), StageOp); +#if GOOGLE_CUDA +REGISTER_KERNEL_BUILDER(Name("Stage").Device(DEVICE_GPU), StageOp); +#endif + +class UnstageOp : public OpKernel { + public: + explicit UnstageOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + // Using this op in such a way that it blocks forever + // is an error. 
+class UnstageOp : public OpKernel {
+ public:
+  explicit UnstageOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  // Using this op in such a way that it blocks forever is an error.
+  // As such, cancellation is not handled.
+  void Compute(OpKernelContext* ctx) override {
+    Buffer* buf = nullptr;
+    OP_REQUIRES_OK(ctx, GetBuffer(ctx, def(), &buf));
+    core::ScopedUnref scope(buf);
+    Buffer::Tuple tuple;
+    buf->Get(&tuple);
+    OP_REQUIRES(
+        ctx, tuple.size() == ctx->num_outputs(),
+        errors::InvalidArgument("Mismatch stage/unstage: ", tuple.size(),
+                                " vs. ", ctx->num_outputs()));
+    for (int i = 0; i < tuple.size(); ++i) {
+      ctx->set_output(i, tuple[i]);
+    }
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("Unstage").Device(DEVICE_CPU), UnstageOp);
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(Name("Unstage").Device(DEVICE_GPU), UnstageOp);
+#endif
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc
index 641c991a7e3..cbc44017dcf 100644
--- a/tensorflow/core/kernels/training_ops.cc
+++ b/tensorflow/core/kernels/training_ops.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/kernels/variable_ops.h"
 
 namespace tensorflow {
 
@@ -292,10 +293,26 @@ struct ApplyCenteredRMSProp {
 
 }  // namespace functor
 
+mutex* GetMutex(OpKernelContext* ctx, int input) {
+  if (ctx->input_dtype(input) == DT_RESOURCE) {
+    Var* var;
+    if (LookupResource(ctx, HandleFromInput(ctx, input), &var).ok()) {
+      return var->mu();
+    } else {
+      ctx->CtxFailureWithWarning(
+          errors::Internal("Invalid variable reference."));
+      return nullptr;
+    }
+  }
+  return ctx->input_ref_mutex(input);
+}
+
 // MaybeLockMutexesInOrder is a helper function to acquire mutexes in address
-// order to mitigate deadlock. Returns a vector of acquired mutexes.
-// Safe to pass duplicates - will only lock each distinct mutex once.
-// If do_lock is false, returns immediately.
+// order to mitigate deadlock. Returns a vector of acquired mutexes. Safe to
+// pass duplicates - will only lock each distinct mutex once. If do_lock is
+// false, returns immediately. Note that this silently doesn't lock mutexes for
+// invalid variable references; in all usages this is followed by GetInputTensor
+// which will signal a failure.
 std::vector<mutex_lock> MaybeLockMutexesInOrder(
     OpKernelContext* ctx, bool do_lock, const std::vector<int>& input_ids) {
   std::vector<mutex_lock> locks;
@@ -305,7 +322,7 @@ std::vector<mutex_lock> MaybeLockMutexesInOrder(
   std::vector<mutex*> mutexes;
   std::vector<int> acquire_order;
   for (auto input : input_ids) {
-    auto* mutex = ctx->input_ref_mutex(input);
+    mutex* mutex = GetMutex(ctx, input);
     // Only lock each mutex once if duplicates exist (n^2 but n is 2 or 3).
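    // (Illustrative aside, not part of the patch.) Why sort by mutex address
    // before locking: if one op locks {a, b} while another locks {b, a}, each
    // can end up holding one mutex while waiting on the other. Sorting both
    // requests by address gives every op the same acquisition order, so that
    // cycle cannot form. Deduplicating first also makes passing the same
    // variable twice (hence the same mutex twice) safe.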
if (std::find(mutexes.begin(), mutexes.end(), mutex) == mutexes.end()) { acquire_order.push_back(input); @@ -316,11 +333,41 @@ std::vector MaybeLockMutexesInOrder( [&mutexes](int a, int b) { return mutexes[a] < mutexes[b]; }); for (auto input : acquire_order) { - locks.emplace_back(*ctx->input_ref_mutex(input)); + mutex* mu = GetMutex(ctx, input); + if (mu != nullptr) { + locks.emplace_back(*mu); + } } return locks; } +Status GetInputTensor(OpKernelContext* ctx, int input, bool lock_held, + Tensor* out) { + if (ctx->input_dtype(input) == DT_RESOURCE) { + Var* var; + if (LookupResource(ctx, HandleFromInput(ctx, input), &var).ok()) { + if (lock_held) { + *out = *var->tensor(); + } else { + mutex_lock ml(*var->mu()); + *out = *var->tensor(); + } + return Status::OK(); + } else { + return errors::Internal("Invalid variable reference."); + } + } + *out = ctx->mutable_input(input, lock_held); + return Status::OK(); +} + +void MaybeForwardRefInputToRefOutput(OpKernelContext* ctx, int input, + int output) { + if (ctx->input_dtype(input) != DT_RESOURCE) { + ctx->forward_ref_input_to_ref_output(input, output); + } +} + template class ApplyGradientDescentOp : public OpKernel { public: @@ -330,7 +377,8 @@ class ApplyGradientDescentOp : public OpKernel { void Compute(OpKernelContext* ctx) override { auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0}); - Tensor var = ctx->mutable_input(0, use_exclusive_lock_); + Tensor var; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var)); OP_REQUIRES( ctx, var.IsInitialized(), @@ -351,7 +399,7 @@ class ApplyGradientDescentOp : public OpKernel { functor::ApplyGradientDescent()( device, var.flat(), alpha.scalar(), delta.flat()); - ctx->forward_ref_input_to_ref_output(0, 0); + MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: @@ -361,7 +409,11 @@ class ApplyGradientDescentOp : public OpKernel { #define REGISTER_KERNELS(D, T) \ REGISTER_KERNEL_BUILDER( \ Name("ApplyGradientDescent").Device(DEVICE_##D).TypeConstraint("T"), \ - ApplyGradientDescentOp); + ApplyGradientDescentOp); \ + REGISTER_KERNEL_BUILDER(Name("ResourceApplyGradientDescent") \ + .Device(DEVICE_##D) \ + .TypeConstraint("T"), \ + ApplyGradientDescentOp); #define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T); TF_CALL_half(REGISTER_CPU_KERNELS); @@ -406,7 +458,7 @@ class ApplyAdadeltaOp : public OpKernel { void Compute(OpKernelContext* ctx) override { if (use_exclusive_lock_) { - mutex_lock l1(*ctx->input_ref_mutex(0)); + mutex_lock l1(*GetMutex(ctx, 0)); // Don't try to acquire a lock on the second ref as they share the same // mutex. 
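  // (Illustrative aside, not part of the patch.) GetInputTensor, defined
  // above, is the read-side counterpart of GetMutex: for a DT_RESOURCE input
  // it looks up the Var resource and copies out its Tensor, taking the
  // variable's own mutex unless lock_held says the caller already holds it;
  // for a legacy ref input it falls back to ctx->mutable_input(). The copy is
  // cheap because a Tensor is a reference-counted handle to its buffer.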
// @@ -419,16 +471,20 @@ class ApplyAdadeltaOp : public OpKernel { if (!ctx->status().ok()) return; DoCompute(ctx); } - ctx->forward_ref_input_to_ref_output(0, 0); + MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: bool use_exclusive_lock_; void DoValidate(OpKernelContext* ctx) { - Tensor var = ctx->mutable_input(0, use_exclusive_lock_); - Tensor accum = ctx->mutable_input(1, use_exclusive_lock_); - Tensor accum_update = ctx->mutable_input(2, use_exclusive_lock_); + Tensor var; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var)); + Tensor accum; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &accum)); + Tensor accum_update; + OP_REQUIRES_OK(ctx, + GetInputTensor(ctx, 2, use_exclusive_lock_, &accum_update)); OP_REQUIRES( ctx, var.IsInitialized(), @@ -474,9 +530,13 @@ class ApplyAdadeltaOp : public OpKernel { void DoCompute(OpKernelContext* ctx) { const Device& device = ctx->template eigen_device(); - Tensor var = ctx->mutable_input(0, use_exclusive_lock_); - Tensor accum = ctx->mutable_input(1, use_exclusive_lock_); - Tensor accum_update = ctx->mutable_input(2, use_exclusive_lock_); + Tensor var; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var)); + Tensor accum; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &accum)); + Tensor accum_update; + OP_REQUIRES_OK(ctx, + GetInputTensor(ctx, 2, use_exclusive_lock_, &accum_update)); const Tensor& lr = ctx->input(3); const Tensor& rho = ctx->input(4); @@ -492,9 +552,12 @@ class ApplyAdadeltaOp : public OpKernel { using CPUDevice = Eigen::ThreadPoolDevice; using GPUDevice = Eigen::GpuDevice; -#define REGISTER_KERNELS(D, T) \ - REGISTER_KERNEL_BUILDER( \ - Name("ApplyAdadelta").Device(DEVICE_##D).TypeConstraint("T"), \ +#define REGISTER_KERNELS(D, T) \ + REGISTER_KERNEL_BUILDER( \ + Name("ApplyAdadelta").Device(DEVICE_##D).TypeConstraint("T"), \ + ApplyAdadeltaOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("ResourceApplyAdadelta").Device(DEVICE_##D).TypeConstraint("T"), \ ApplyAdadeltaOp); #define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T); @@ -536,7 +599,7 @@ class SparseApplyAdadeltaOp : public OpKernel { } void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS { - mutex* mu_var = ctx->input_ref_mutex(0); + mutex* mu_var = GetMutex(ctx, 0); // mu_accum is actually the same mutex as mu_var since currently we use a // global mutex. 
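    // (Illustrative aside, not part of the patch.) Note what GetMutex returns
    // in each case: ctx->input_ref_mutex(input) for a legacy ref input, and
    // the mutex owned by the looked-up Var resource for a DT_RESOURCE input.
    // That single dispatch point is what lets one kernel class back both the
    // "Apply*" (ref) and "ResourceApply*" (resource-handle) registrations
    // added in this change.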
// @@ -544,9 +607,14 @@ class SparseApplyAdadeltaOp : public OpKernel { if (use_exclusive_lock_) { mu_var->lock(); } - Tensor var = ctx->mutable_input(0, use_exclusive_lock_); - Tensor accum_grad = ctx->mutable_input(1, use_exclusive_lock_); - Tensor accum_update = ctx->mutable_input(2, use_exclusive_lock_); + Tensor var; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var)); + Tensor accum_grad; + OP_REQUIRES_OK(ctx, + GetInputTensor(ctx, 1, use_exclusive_lock_, &accum_grad)); + Tensor accum_update; + OP_REQUIRES_OK(ctx, + GetInputTensor(ctx, 2, use_exclusive_lock_, &accum_update)); OP_REQUIRES( ctx, var.IsInitialized(), errors::FailedPrecondition( @@ -642,7 +710,7 @@ class SparseApplyAdadeltaOp : public OpKernel { mu_var->unlock(); } - ctx->forward_ref_input_to_ref_output(0, 0); + MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: @@ -654,6 +722,11 @@ class SparseApplyAdadeltaOp : public OpKernel { .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ + SparseApplyAdadeltaOp); \ + REGISTER_KERNEL_BUILDER(Name("ResourceSparseApplyAdadelta") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ SparseApplyAdadeltaOp); #define REGISTER_CPU_KERNELS(T) \ REGISTER_KERNELS(T, int32); \ @@ -677,7 +750,8 @@ class ApplyProximalGradientDescentOp : public OpKernel { void Compute(OpKernelContext* ctx) override { auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0}); - Tensor var = ctx->mutable_input(0, use_exclusive_lock_); + Tensor var; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var)); OP_REQUIRES( ctx, var.IsInitialized(), @@ -710,17 +784,21 @@ class ApplyProximalGradientDescentOp : public OpKernel { device, var.flat(), alpha.scalar(), l1.scalar(), l2.scalar(), delta.flat()); - ctx->forward_ref_input_to_ref_output(0, 0); + MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: bool use_exclusive_lock_; }; -#define REGISTER_KERNELS(D, T) \ - REGISTER_KERNEL_BUILDER(Name("ApplyProximalGradientDescent") \ - .Device(DEVICE_##D) \ - .TypeConstraint("T"), \ +#define REGISTER_KERNELS(D, T) \ + REGISTER_KERNEL_BUILDER(Name("ApplyProximalGradientDescent") \ + .Device(DEVICE_##D) \ + .TypeConstraint("T"), \ + ApplyProximalGradientDescentOp); \ + REGISTER_KERNEL_BUILDER(Name("ResourceApplyProximalGradientDescent") \ + .Device(DEVICE_##D) \ + .TypeConstraint("T"), \ ApplyProximalGradientDescentOp); REGISTER_KERNELS(CPU, float); @@ -738,7 +816,8 @@ class SparseApplyProximalGradientDescentOp : public OpKernel { void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS { auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1}); - Tensor var = ctx->mutable_input(0, use_exclusive_lock_); + Tensor var; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var)); OP_REQUIRES(ctx, TensorShapeUtils::IsVectorOrHigher(var.shape()), errors::InvalidArgument("var must be at least 1 dimensional")); @@ -846,18 +925,23 @@ class SparseApplyProximalGradientDescentOp : public OpKernel { } } - ctx->forward_ref_input_to_ref_output(0, 0); + MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: bool use_exclusive_lock_; }; -#define REGISTER_KERNELS(T, Tindices) \ - REGISTER_KERNEL_BUILDER(Name("SparseApplyProximalGradientDescent") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .TypeConstraint("Tindices"), \ +#define REGISTER_KERNELS(T, Tindices) \ + REGISTER_KERNEL_BUILDER(Name("SparseApplyProximalGradientDescent") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + 
.TypeConstraint("Tindices"), \ + SparseApplyProximalGradientDescentOp); \ + REGISTER_KERNEL_BUILDER(Name("ResourceSparseApplyProximalGradientDescent") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ SparseApplyProximalGradientDescentOp); REGISTER_KERNELS(float, int32); @@ -875,8 +959,10 @@ class ApplyAdagradOp : public OpKernel { void Compute(OpKernelContext* ctx) override { auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1}); - Tensor var = ctx->mutable_input(0, use_exclusive_lock_); - Tensor accum = ctx->mutable_input(1, use_exclusive_lock_); + Tensor var; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var)); + Tensor accum; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &accum)); OP_REQUIRES( ctx, var.IsInitialized(), errors::FailedPrecondition( @@ -905,7 +991,7 @@ class ApplyAdagradOp : public OpKernel { functor::ApplyAdagrad()(device, var.flat(), accum.flat(), lr.scalar(), grad.flat()); - ctx->forward_ref_input_to_ref_output(0, 0); + MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: @@ -915,9 +1001,12 @@ class ApplyAdagradOp : public OpKernel { using CPUDevice = Eigen::ThreadPoolDevice; using GPUDevice = Eigen::GpuDevice; -#define REGISTER_KERNELS(D, T) \ - REGISTER_KERNEL_BUILDER( \ - Name("ApplyAdagrad").Device(DEVICE_##D).TypeConstraint("T"), \ +#define REGISTER_KERNELS(D, T) \ + REGISTER_KERNEL_BUILDER( \ + Name("ApplyAdagrad").Device(DEVICE_##D).TypeConstraint("T"), \ + ApplyAdagradOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("ResourceApplyAdagrad").Device(DEVICE_##D).TypeConstraint("T"), \ ApplyAdagradOp); #define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T); @@ -957,8 +1046,10 @@ class ApplyProximalAdagradOp : public OpKernel { void Compute(OpKernelContext* ctx) override { auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1}); - Tensor var = ctx->mutable_input(0, use_exclusive_lock_); - Tensor accum = ctx->mutable_input(1, use_exclusive_lock_); + Tensor var; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var)); + Tensor accum; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &accum)); OP_REQUIRES( ctx, var.IsInitialized(), errors::FailedPrecondition( @@ -1004,7 +1095,7 @@ class ApplyProximalAdagradOp : public OpKernel { device, var.flat(), accum.flat(), lr.scalar(), l1.scalar(), l2.scalar(), grad.flat()); - ctx->forward_ref_input_to_ref_output(0, 0); + MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: @@ -1017,7 +1108,11 @@ using GPUDevice = Eigen::GpuDevice; #define REGISTER_KERNELS(D, T) \ REGISTER_KERNEL_BUILDER( \ Name("ApplyProximalAdagrad").Device(DEVICE_##D).TypeConstraint("T"), \ - ApplyProximalAdagradOp); + ApplyProximalAdagradOp); \ + REGISTER_KERNEL_BUILDER(Name("ResourceApplyProximalAdagrad") \ + .Device(DEVICE_##D) \ + .TypeConstraint("T"), \ + ApplyProximalAdagradOp); REGISTER_KERNELS(CPU, float); REGISTER_KERNELS(CPU, double); @@ -1053,8 +1148,10 @@ class SparseApplyAdagradOp : public OpKernel { void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS { auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1}); - Tensor var = ctx->mutable_input(0, use_exclusive_lock_); - Tensor accum = ctx->mutable_input(1, use_exclusive_lock_); + Tensor var; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var)); + Tensor accum; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &accum)); OP_REQUIRES( ctx, var.IsInitialized(), errors::FailedPrecondition( @@ 
-1142,7 +1239,7 @@ class SparseApplyAdagradOp : public OpKernel { } } - ctx->forward_ref_input_to_ref_output(0, 0); + MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: @@ -1154,6 +1251,11 @@ class SparseApplyAdagradOp : public OpKernel { .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ + SparseApplyAdagradOp); \ + REGISTER_KERNEL_BUILDER(Name("ResourceSparseApplyAdagrad") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ SparseApplyAdagradOp); #define REGISTER_CPU_KERNELS(T) \ REGISTER_KERNELS(T, int32); \ @@ -1177,8 +1279,10 @@ class SparseApplyProximalAdagradOp : public OpKernel { void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS { auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1}); - Tensor var = ctx->mutable_input(0, use_exclusive_lock_); - Tensor accum = ctx->mutable_input(1, use_exclusive_lock_); + Tensor var; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var)); + Tensor accum; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &accum)); OP_REQUIRES( ctx, var.IsInitialized(), errors::FailedPrecondition( @@ -1311,18 +1415,23 @@ class SparseApplyProximalAdagradOp : public OpKernel { } } - ctx->forward_ref_input_to_ref_output(0, 0); + MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: bool use_exclusive_lock_; }; -#define REGISTER_KERNELS(T, Tindices) \ - REGISTER_KERNEL_BUILDER(Name("SparseApplyProximalAdagrad") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .TypeConstraint("Tindices"), \ +#define REGISTER_KERNELS(T, Tindices) \ + REGISTER_KERNEL_BUILDER(Name("SparseApplyProximalAdagrad") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + SparseApplyProximalAdagradOp); \ + REGISTER_KERNEL_BUILDER(Name("ResourceSparseApplyProximalAdagrad") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ SparseApplyProximalAdagradOp); REGISTER_KERNELS(float, int32); @@ -1340,9 +1449,14 @@ class ApplyAdagradDAOp : public OpKernel { void Compute(OpKernelContext* ctx) override { auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1}); - Tensor var = ctx->mutable_input(0, use_exclusive_lock_); - Tensor gradient_accum = ctx->mutable_input(1, use_exclusive_lock_); - Tensor gradient_squared_accum = ctx->mutable_input(2, use_exclusive_lock_); + Tensor var; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var)); + Tensor gradient_accum; + OP_REQUIRES_OK( + ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &gradient_accum)); + Tensor gradient_squared_accum; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 2, use_exclusive_lock_, + &gradient_squared_accum)); OP_REQUIRES( ctx, var.IsInitialized(), errors::FailedPrecondition( @@ -1399,7 +1513,7 @@ class ApplyAdagradDAOp : public OpKernel { global_step.scalar()(), l1.scalar(), l2.scalar(), grad.flat()); - ctx->forward_ref_input_to_ref_output(0, 0); + MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: @@ -1428,9 +1542,14 @@ class SparseApplyAdagradDAOp : public OpKernel { void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS { auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1}); - Tensor var = ctx->mutable_input(0, use_exclusive_lock_); - Tensor gradient_accum = ctx->mutable_input(1, use_exclusive_lock_); - Tensor gradient_squared_accum = ctx->mutable_input(2, use_exclusive_lock_); + Tensor var; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var)); + Tensor 
gradient_accum; + OP_REQUIRES_OK( + ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &gradient_accum)); + Tensor gradient_squared_accum; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 2, use_exclusive_lock_, + &gradient_squared_accum)); OP_REQUIRES( ctx, var.IsInitialized(), errors::FailedPrecondition( @@ -1580,7 +1699,7 @@ class SparseApplyAdagradDAOp : public OpKernel { } } - ctx->forward_ref_input_to_ref_output(0, 0); + MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: @@ -1592,6 +1711,11 @@ class SparseApplyAdagradDAOp : public OpKernel { .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ + SparseApplyAdagradDAOp); \ + REGISTER_KERNEL_BUILDER(Name("ResourceSparseApplyAdagradDA") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ SparseApplyAdagradDAOp); REGISTER_KERNELS(float, int32); @@ -1610,9 +1734,12 @@ class ApplyFtrlOp : public OpKernel { void Compute(OpKernelContext* ctx) override { auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1, 2}); - Tensor var = ctx->mutable_input(0, use_exclusive_lock_); - Tensor accum = ctx->mutable_input(1, use_exclusive_lock_); - Tensor linear = ctx->mutable_input(2, use_exclusive_lock_); + Tensor var; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var)); + Tensor accum; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &accum)); + Tensor linear; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 2, use_exclusive_lock_, &linear)); OP_REQUIRES( ctx, var.IsInitialized(), errors::FailedPrecondition( @@ -1677,7 +1804,7 @@ class ApplyFtrlOp : public OpKernel { lr.scalar(), l1.scalar(), l2.scalar(), lr_power.scalar()); - ctx->forward_ref_input_to_ref_output(0, 0); + MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: @@ -1687,9 +1814,12 @@ class ApplyFtrlOp : public OpKernel { using CPUDevice = Eigen::ThreadPoolDevice; using GPUDevice = Eigen::GpuDevice; -#define REGISTER_KERNELS(D, T) \ - REGISTER_KERNEL_BUILDER( \ - Name("ApplyFtrl").Device(DEVICE_##D).TypeConstraint("T"), \ +#define REGISTER_KERNELS(D, T) \ + REGISTER_KERNEL_BUILDER( \ + Name("ApplyFtrl").Device(DEVICE_##D).TypeConstraint("T"), \ + ApplyFtrlOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("ResourceApplyFtrl").Device(DEVICE_##D).TypeConstraint("T"), \ ApplyFtrlOp); #define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T); @@ -1710,9 +1840,12 @@ class SparseApplyFtrlOp : public OpKernel { void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS { auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1, 2}); - Tensor var = ctx->mutable_input(0, use_exclusive_lock_); - Tensor accum = ctx->mutable_input(1, use_exclusive_lock_); - Tensor linear = ctx->mutable_input(2, use_exclusive_lock_); + Tensor var; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var)); + Tensor accum; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &accum)); + Tensor linear; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 2, use_exclusive_lock_, &linear)); OP_REQUIRES( ctx, var.IsInitialized(), errors::FailedPrecondition( @@ -1874,18 +2007,23 @@ class SparseApplyFtrlOp : public OpKernel { } } - ctx->forward_ref_input_to_ref_output(0, 0); + MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: bool use_exclusive_lock_; }; -#define REGISTER_KERNELS(T, Tindices) \ - REGISTER_KERNEL_BUILDER(Name("SparseApplyFtrl") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .TypeConstraint("Tindices"), \ +#define REGISTER_KERNELS(T, Tindices) \ + 
REGISTER_KERNEL_BUILDER(Name("SparseApplyFtrl") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + SparseApplyFtrlOp); \ + REGISTER_KERNEL_BUILDER(Name("ResourceSparseApplyFtrl") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ SparseApplyFtrlOp); #define REGISTER_CPU_KERNELS(T) \ REGISTER_KERNELS(T, int32); \ @@ -1909,8 +2047,10 @@ class ApplyMomentumOp : public OpKernel { void Compute(OpKernelContext* ctx) override { auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1}); - Tensor var = ctx->mutable_input(0, use_exclusive_lock_); - Tensor accum = ctx->mutable_input(1, use_exclusive_lock_); + Tensor var; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var)); + Tensor accum; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &accum)); OP_REQUIRES( ctx, var.IsInitialized(), errors::FailedPrecondition( @@ -1944,7 +2084,7 @@ class ApplyMomentumOp : public OpKernel { functor::ApplyMomentum()(device, var.flat(), accum.flat(), lr.scalar(), grad.flat(), momentum.scalar(), use_nesterov_); - ctx->forward_ref_input_to_ref_output(0, 0); + MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: @@ -1955,9 +2095,12 @@ class ApplyMomentumOp : public OpKernel { using CPUDevice = Eigen::ThreadPoolDevice; using GPUDevice = Eigen::GpuDevice; -#define REGISTER_KERNELS(D, T) \ - REGISTER_KERNEL_BUILDER( \ - Name("ApplyMomentum").Device(DEVICE_##D).TypeConstraint("T"), \ +#define REGISTER_KERNELS(D, T) \ + REGISTER_KERNEL_BUILDER( \ + Name("ApplyMomentum").Device(DEVICE_##D).TypeConstraint("T"), \ + ApplyMomentumOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("ResourceApplyMomentum").Device(DEVICE_##D).TypeConstraint("T"), \ ApplyMomentumOp); #define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T); @@ -2001,8 +2144,10 @@ class SparseApplyMomentumOp : public OpKernel { void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS { auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1}); - Tensor var = ctx->mutable_input(0, use_exclusive_lock_); - Tensor accum = ctx->mutable_input(1, use_exclusive_lock_); + Tensor var; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var)); + Tensor accum; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &accum)); OP_REQUIRES( ctx, var.IsInitialized(), errors::FailedPrecondition( @@ -2072,7 +2217,7 @@ class SparseApplyMomentumOp : public OpKernel { } } - ctx->forward_ref_input_to_ref_output(0, 0); + MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: @@ -2085,6 +2230,11 @@ class SparseApplyMomentumOp : public OpKernel { .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ + SparseApplyMomentumOp); \ + REGISTER_KERNEL_BUILDER(Name("ResourceSparseApplyMomentum") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ SparseApplyMomentumOp); #define REGISTER_CPU_KERNELS(T) \ REGISTER_KERNELS(T, int32); \ @@ -2107,9 +2257,12 @@ class ApplyAdamOp : public OpKernel { void Compute(OpKernelContext* ctx) override { auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1, 2}); - Tensor var = ctx->mutable_input(0, use_exclusive_lock_); - Tensor m = ctx->mutable_input(1, use_exclusive_lock_); - Tensor v = ctx->mutable_input(2, use_exclusive_lock_); + Tensor var; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var)); + Tensor m; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &m)); + Tensor v; + OP_REQUIRES_OK(ctx, 
GetInputTensor(ctx, 2, use_exclusive_lock_, &v)); OP_REQUIRES( ctx, var.IsInitialized(), errors::FailedPrecondition( @@ -2171,7 +2324,7 @@ class ApplyAdamOp : public OpKernel { beta1.scalar(), beta2.scalar(), epsilon.scalar(), grad.flat()); - ctx->forward_ref_input_to_ref_output(0, 0); + MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: @@ -2181,9 +2334,12 @@ class ApplyAdamOp : public OpKernel { using CPUDevice = Eigen::ThreadPoolDevice; using GPUDevice = Eigen::GpuDevice; -#define REGISTER_KERNELS(D, T) \ - REGISTER_KERNEL_BUILDER( \ - Name("ApplyAdam").Device(DEVICE_##D).TypeConstraint("T"), \ +#define REGISTER_KERNELS(D, T) \ + REGISTER_KERNEL_BUILDER( \ + Name("ApplyAdam").Device(DEVICE_##D).TypeConstraint("T"), \ + ApplyAdamOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("ResourceApplyAdam").Device(DEVICE_##D).TypeConstraint("T"), \ ApplyAdamOp); #define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T); @@ -2236,9 +2392,12 @@ class ApplyRMSPropOp : public OpKernel { void Compute(OpKernelContext* ctx) override { auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1, 2}); - Tensor var = ctx->mutable_input(0, use_exclusive_lock_); - Tensor ms = ctx->mutable_input(1, use_exclusive_lock_); - Tensor mom = ctx->mutable_input(2, use_exclusive_lock_); + Tensor var; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var)); + Tensor ms; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &ms)); + Tensor mom; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 2, use_exclusive_lock_, &mom)); OP_REQUIRES( ctx, var.IsInitialized(), @@ -2294,7 +2453,7 @@ class ApplyRMSPropOp : public OpKernel { rho.scalar(), momentum.scalar(), epsilon.scalar(), grad.flat()); - ctx->forward_ref_input_to_ref_output(0, 0); + MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: @@ -2312,10 +2471,14 @@ class ApplyCenteredRMSPropOp : public OpKernel { auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1, 2, 3}); - Tensor var = ctx->mutable_input(0, use_exclusive_lock_); - Tensor mg = ctx->mutable_input(1, use_exclusive_lock_); - Tensor ms = ctx->mutable_input(2, use_exclusive_lock_); - Tensor mom = ctx->mutable_input(3, use_exclusive_lock_); + Tensor var; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var)); + Tensor mg; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &mg)); + Tensor ms; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 2, use_exclusive_lock_, &ms)); + Tensor mom; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 3, use_exclusive_lock_, &mom)); OP_REQUIRES( ctx, var.IsInitialized(), @@ -2379,7 +2542,7 @@ class ApplyCenteredRMSPropOp : public OpKernel { device, var.flat(), mg.flat(), ms.flat(), mom.flat(), lr.scalar(), rho.scalar(), momentum.scalar(), epsilon.scalar(), grad.flat()); - ctx->forward_ref_input_to_ref_output(0, 0); + MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: @@ -2395,7 +2558,14 @@ using GPUDevice = Eigen::GpuDevice; ApplyRMSPropOp); \ REGISTER_KERNEL_BUILDER( \ Name("ApplyCenteredRMSProp").Device(DEVICE_##D).TypeConstraint("T"), \ - ApplyCenteredRMSPropOp); + ApplyCenteredRMSPropOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("ResourceApplyRMSProp").Device(DEVICE_##D).TypeConstraint("T"), \ + ApplyRMSPropOp); \ + REGISTER_KERNEL_BUILDER(Name("ResourceApplyCenteredRMSProp") \ + .Device(DEVICE_##D) \ + .TypeConstraint("T"), \ + ApplyCenteredRMSPropOp); #define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T); TF_CALL_half(REGISTER_CPU_KERNELS); @@ -2449,9 +2619,12 @@ class SparseApplyRMSPropOp : public OpKernel { 
void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS { auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1, 2}); - Tensor var = ctx->mutable_input(0, use_exclusive_lock_); - Tensor ms = ctx->mutable_input(1, use_exclusive_lock_); - Tensor mom = ctx->mutable_input(2, use_exclusive_lock_); + Tensor var; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var)); + Tensor ms; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &ms)); + Tensor mom; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 2, use_exclusive_lock_, &mom)); OP_REQUIRES( ctx, var.IsInitialized(), @@ -2552,7 +2725,7 @@ class SparseApplyRMSPropOp : public OpKernel { } } - ctx->forward_ref_input_to_ref_output(0, 0); + MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: @@ -2572,10 +2745,14 @@ class SparseApplyCenteredRMSPropOp : public OpKernel { auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1, 2, 3}); - Tensor var = ctx->mutable_input(0, use_exclusive_lock_); - Tensor mg = ctx->mutable_input(1, use_exclusive_lock_); - Tensor ms = ctx->mutable_input(2, use_exclusive_lock_); - Tensor mom = ctx->mutable_input(3, use_exclusive_lock_); + Tensor var; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var)); + Tensor mg; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &mg)); + Tensor ms; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 2, use_exclusive_lock_, &ms)); + Tensor mom; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 3, use_exclusive_lock_, &mom)); OP_REQUIRES( ctx, var.IsInitialized(), @@ -2685,23 +2862,33 @@ class SparseApplyCenteredRMSPropOp : public OpKernel { } } - ctx->forward_ref_input_to_ref_output(0, 0); + MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: bool use_exclusive_lock_; }; -#define REGISTER_KERNELS(T, Tindices) \ - REGISTER_KERNEL_BUILDER(Name("SparseApplyRMSProp") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .TypeConstraint("Tindices"), \ - SparseApplyRMSPropOp); \ - REGISTER_KERNEL_BUILDER(Name("SparseApplyCenteredRMSProp") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .TypeConstraint("Tindices"), \ +#define REGISTER_KERNELS(T, Tindices) \ + REGISTER_KERNEL_BUILDER(Name("SparseApplyRMSProp") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + SparseApplyRMSPropOp); \ + REGISTER_KERNEL_BUILDER(Name("SparseApplyCenteredRMSProp") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + SparseApplyCenteredRMSPropOp); \ + REGISTER_KERNEL_BUILDER(Name("ResourceSparseApplyRMSProp") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + SparseApplyRMSPropOp); \ + REGISTER_KERNEL_BUILDER(Name("ResourceSparseApplyCenteredRMSProp") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ SparseApplyCenteredRMSPropOp); REGISTER_KERNELS(Eigen::half, int32); diff --git a/tensorflow/core/lib/core/arena.cc b/tensorflow/core/lib/core/arena.cc index a7148ed1c75..ef1b9a5468c 100644 --- a/tensorflow/core/lib/core/arena.cc +++ b/tensorflow/core/lib/core/arena.cc @@ -25,6 +25,7 @@ limitations under the License. 
#include +#include #include #include "tensorflow/core/platform/logging.h" @@ -48,7 +49,8 @@ Arena::Arena(const size_t block_size) overflow_blocks_(NULL) { assert(block_size > kDefaultAlignment); - first_blocks_[0].mem = reinterpret_cast(malloc(block_size_)); + first_blocks_[0].mem = + reinterpret_cast(port::AlignedMalloc(block_size_, sizeof(void*))); first_blocks_[0].size = block_size_; @@ -59,7 +61,9 @@ Arena::~Arena() { FreeBlocks(); assert(overflow_blocks_ == NULL); // FreeBlocks() should do that // The first X blocks stay allocated always by default. Delete them now. - for (size_t i = 0; i < blocks_alloced_; ++i) free(first_blocks_[i].mem); + for (size_t i = 0; i < blocks_alloced_; ++i) { + port::AlignedFree(first_blocks_[i].mem); + } } // Returns true iff it advances freestart_ to the first position @@ -162,8 +166,11 @@ Arena::AllocatedBlock* Arena::AllocNewBlock(const size_t block_size, // Must be a multiple of kDefaultAlignment, unless requested // alignment is 1, in which case we don't care at all. - const uint32 adjusted_alignment = + uint32 adjusted_alignment = (alignment > 1 ? LeastCommonMultiple(alignment, kDefaultAlignment) : 1); + // Required minimum alignment for port::AlignedMalloc(). + adjusted_alignment = + std::max(adjusted_alignment, static_cast(sizeof(void*))); CHECK_LE(adjusted_alignment, static_cast(1 << 20)) << "Alignment on boundaries greater than 1MB not supported."; @@ -171,16 +178,12 @@ Arena::AllocatedBlock* Arena::AllocNewBlock(const size_t block_size, // If block_size > alignment we force block_size to be a multiple // of alignment; if block_size < alignment we make no adjustment. size_t adjusted_block_size = block_size; - if (adjusted_alignment > 1) { - if (adjusted_block_size > adjusted_alignment) { - const uint32 excess = adjusted_block_size % adjusted_alignment; - adjusted_block_size += (excess > 0 ? adjusted_alignment - excess : 0); - } - block->mem = reinterpret_cast( - port::aligned_malloc(adjusted_block_size, adjusted_alignment)); - } else { - block->mem = reinterpret_cast(malloc(adjusted_block_size)); + if (adjusted_block_size > adjusted_alignment) { + const uint32 excess = adjusted_block_size % adjusted_alignment; + adjusted_block_size += (excess > 0 ? adjusted_alignment - excess : 0); } + block->mem = reinterpret_cast( + port::AlignedMalloc(adjusted_block_size, adjusted_alignment)); block->size = adjusted_block_size; CHECK(NULL != block->mem) << "block_size=" << block_size << " adjusted_block_size=" << adjusted_block_size @@ -242,7 +245,7 @@ void* Arena::GetMemoryFallback(const size_t size, const int alignment) { void Arena::FreeBlocks() { for (size_t i = 1; i < blocks_alloced_; ++i) { // keep first block alloced - free(first_blocks_[i].mem); + port::AlignedFree(first_blocks_[i].mem); first_blocks_[i].mem = NULL; first_blocks_[i].size = 0; } @@ -250,7 +253,7 @@ void Arena::FreeBlocks() { if (overflow_blocks_ != NULL) { std::vector::iterator it; for (it = overflow_blocks_->begin(); it != overflow_blocks_->end(); ++it) { - free(it->mem); + port::AlignedFree(it->mem); } delete overflow_blocks_; // These should be used very rarely overflow_blocks_ = NULL; diff --git a/tensorflow/core/lib/gtl/inlined_vector.h b/tensorflow/core/lib/gtl/inlined_vector.h index fc439f9eb66..d6e5d9effa7 100644 --- a/tensorflow/core/lib/gtl/inlined_vector.h +++ b/tensorflow/core/lib/gtl/inlined_vector.h @@ -45,6 +45,7 @@ limitations under the License. 
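One detail of the arena.cc change above that is easy to miss: port::AlignedMalloc-style allocators (posix_memalign, for instance) typically require the alignment to be a power of two no smaller than sizeof(void*), which is why AllocNewBlock now clamps adjusted_alignment. A standalone sketch of the size/alignment rounding it performs (illustrative; AdjustedBlockSize is not a name from the patch, and std::lcm stands in for the patch's LeastCommonMultiple helper):

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <numeric>  // std::lcm (C++17)

// Mirrors Arena::AllocNewBlock: satisfy both the caller's alignment and the
// arena default, clamp to the allocator's minimum, then round the block size
// up to a multiple of the chosen alignment (only when the size already
// exceeds the alignment, matching the code above).
size_t AdjustedBlockSize(size_t block_size, uint32_t alignment,
                         uint32_t default_alignment) {
  uint32_t adjusted =
      alignment > 1 ? std::lcm(alignment, default_alignment) : 1;
  adjusted = std::max<uint32_t>(adjusted, sizeof(void*));
  size_t adjusted_size = block_size;
  if (adjusted_size > adjusted) {
    const size_t excess = adjusted_size % adjusted;
    adjusted_size += excess > 0 ? adjusted - excess : 0;
  }
  return adjusted_size;
}

int main() {
  // e.g. lcm(16, 4) = 16, and 1000 rounds up to the next multiple, 1008.
  return AdjustedBlockSize(1000, 16, 4) == 1008 ? 0 : 1;
}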
#include "tensorflow/core/lib/gtl/manual_constructor.h" #include "tensorflow/core/platform/cpu_info.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/mem.h" #include "tensorflow/core/platform/types.h" #include // NOLINT(build/include_order) @@ -353,7 +354,7 @@ class InlinedVector { size_t n = size(); Destroy(base, n); if (!is_inline()) { - free(base); + port::Free(base); } } @@ -434,7 +435,7 @@ class InlinedVector { } T* src = data(); - T* dst = static_cast(malloc(target * sizeof(T))); + T* dst = static_cast(port::Malloc(target * sizeof(T))); // Need to copy elem before discarding src since it might alias src. InitType{}(dst + s, std::forward(args)...); diff --git a/tensorflow/core/lib/gtl/manual_constructor.h b/tensorflow/core/lib/gtl/manual_constructor.h index 8f041a13538..0a76e0962e6 100644 --- a/tensorflow/core/lib/gtl/manual_constructor.h +++ b/tensorflow/core/lib/gtl/manual_constructor.h @@ -30,7 +30,7 @@ limitations under the License. #include #include "tensorflow/core/platform/macros.h" -#include "tensorflow/core/platform/mem.h" // For aligned_malloc/aligned_free +#include "tensorflow/core/platform/mem.h" namespace tensorflow { namespace gtl { @@ -127,9 +127,9 @@ class ManualConstructor { // Support users creating arrays of ManualConstructor<>s. This ensures that // the array itself has the correct alignment. static void* operator new[](size_t size) { - return port::aligned_malloc(size, TF_LIB_GTL_ALIGN_OF(Type)); + return port::AlignedMalloc(size, TF_LIB_GTL_ALIGN_OF(Type)); } - static void operator delete[](void* mem) { port::aligned_free(mem); } + static void operator delete[](void* mem) { port::AlignedFree(mem); } inline Type* get() { return reinterpret_cast(space_); } inline const Type* get() const { diff --git a/tensorflow/core/ops/compat/ops_history.v0.pbtxt b/tensorflow/core/ops/compat/ops_history.v0.pbtxt index 1c4c52be975..b9589d1c6fa 100644 --- a/tensorflow/core/ops/compat/ops_history.v0.pbtxt +++ b/tensorflow/core/ops/compat/ops_history.v0.pbtxt @@ -24830,6 +24830,1318 @@ op { } } } +op { + name: "ResourceApplyAdadelta" + input_arg { + name: "var" + type: DT_RESOURCE + } + input_arg { + name: "accum" + type: DT_RESOURCE + } + input_arg { + name: "accum_update" + type: DT_RESOURCE + } + input_arg { + name: "lr" + type_attr: "T" + } + input_arg { + name: "rho" + type_attr: "T" + } + input_arg { + name: "epsilon" + type_attr: "T" + } + input_arg { + name: "grad" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } +} +op { + name: "ResourceApplyAdagrad" + input_arg { + name: "var" + type: DT_RESOURCE + } + input_arg { + name: "accum" + type: DT_RESOURCE + } + input_arg { + name: "lr" + type_attr: "T" + } + input_arg { + name: "grad" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } +} +op { + name: 
"ResourceApplyAdagradDA" + input_arg { + name: "var" + type: DT_RESOURCE + } + input_arg { + name: "gradient_accumulator" + type: DT_RESOURCE + } + input_arg { + name: "gradient_squared_accumulator" + type: DT_RESOURCE + } + input_arg { + name: "grad" + type_attr: "T" + } + input_arg { + name: "lr" + type_attr: "T" + } + input_arg { + name: "l1" + type_attr: "T" + } + input_arg { + name: "l2" + type_attr: "T" + } + input_arg { + name: "global_step" + type: DT_INT64 + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } +} +op { + name: "ResourceApplyAdam" + input_arg { + name: "var" + type: DT_RESOURCE + } + input_arg { + name: "m" + type: DT_RESOURCE + } + input_arg { + name: "v" + type: DT_RESOURCE + } + input_arg { + name: "beta1_power" + type_attr: "T" + } + input_arg { + name: "beta2_power" + type_attr: "T" + } + input_arg { + name: "lr" + type_attr: "T" + } + input_arg { + name: "beta1" + type_attr: "T" + } + input_arg { + name: "beta2" + type_attr: "T" + } + input_arg { + name: "epsilon" + type_attr: "T" + } + input_arg { + name: "grad" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } +} +op { + name: "ResourceApplyCenteredRMSProp" + input_arg { + name: "var" + type: DT_RESOURCE + } + input_arg { + name: "mg" + type: DT_RESOURCE + } + input_arg { + name: "ms" + type: DT_RESOURCE + } + input_arg { + name: "mom" + type: DT_RESOURCE + } + input_arg { + name: "lr" + type_attr: "T" + } + input_arg { + name: "rho" + type_attr: "T" + } + input_arg { + name: "momentum" + type_attr: "T" + } + input_arg { + name: "epsilon" + type_attr: "T" + } + input_arg { + name: "grad" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } +} +op { + name: "ResourceApplyFtrl" + input_arg { + name: "var" + type: DT_RESOURCE + } + input_arg { + name: "accum" + type: DT_RESOURCE + } + input_arg { + name: "linear" + type: DT_RESOURCE + } + input_arg { + name: "grad" + type_attr: "T" + } + input_arg { + name: "lr" + type_attr: "T" + } + input_arg { + name: "l1" + type_attr: "T" + } + input_arg { + name: "l2" + type_attr: "T" + } + input_arg { + name: "lr_power" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: 
"use_locking" + type: "bool" + default_value { + b: false + } + } +} +op { + name: "ResourceApplyGradientDescent" + input_arg { + name: "var" + type: DT_RESOURCE + } + input_arg { + name: "alpha" + type_attr: "T" + } + input_arg { + name: "delta" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } +} +op { + name: "ResourceApplyMomentum" + input_arg { + name: "var" + type: DT_RESOURCE + } + input_arg { + name: "accum" + type: DT_RESOURCE + } + input_arg { + name: "lr" + type_attr: "T" + } + input_arg { + name: "grad" + type_attr: "T" + } + input_arg { + name: "momentum" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } + attr { + name: "use_nesterov" + type: "bool" + default_value { + b: false + } + } +} +op { + name: "ResourceApplyProximalAdagrad" + input_arg { + name: "var" + type: DT_RESOURCE + } + input_arg { + name: "accum" + type: DT_RESOURCE + } + input_arg { + name: "lr" + type_attr: "T" + } + input_arg { + name: "l1" + type_attr: "T" + } + input_arg { + name: "l2" + type_attr: "T" + } + input_arg { + name: "grad" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } +} +op { + name: "ResourceApplyProximalGradientDescent" + input_arg { + name: "var" + type: DT_RESOURCE + } + input_arg { + name: "alpha" + type_attr: "T" + } + input_arg { + name: "l1" + type_attr: "T" + } + input_arg { + name: "l2" + type_attr: "T" + } + input_arg { + name: "delta" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } +} +op { + name: "ResourceApplyRMSProp" + input_arg { + name: "var" + type: DT_RESOURCE + } + input_arg { + name: "ms" + type: DT_RESOURCE + } + input_arg { + name: "mom" + type: DT_RESOURCE + } + input_arg { + name: "lr" + type_attr: "T" + } + input_arg { + name: "rho" + type_attr: "T" + } + input_arg { + name: "momentum" + type_attr: "T" + } + input_arg { + name: "epsilon" + type_attr: "T" + } + input_arg { + name: "grad" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 
+ type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } +} +op { + name: "ResourceSparseApplyAdadelta" + input_arg { + name: "var" + type: DT_RESOURCE + } + input_arg { + name: "accum" + type: DT_RESOURCE + } + input_arg { + name: "accum_update" + type: DT_RESOURCE + } + input_arg { + name: "lr" + type_attr: "T" + } + input_arg { + name: "rho" + type_attr: "T" + } + input_arg { + name: "epsilon" + type_attr: "T" + } + input_arg { + name: "grad" + type_attr: "T" + } + input_arg { + name: "indices" + type_attr: "Tindices" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "Tindices" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } +} +op { + name: "ResourceSparseApplyAdagrad" + input_arg { + name: "var" + type: DT_RESOURCE + } + input_arg { + name: "accum" + type: DT_RESOURCE + } + input_arg { + name: "lr" + type_attr: "T" + } + input_arg { + name: "grad" + type_attr: "T" + } + input_arg { + name: "indices" + type_attr: "Tindices" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "Tindices" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } +} +op { + name: "ResourceSparseApplyAdagradDA" + input_arg { + name: "var" + type: DT_RESOURCE + } + input_arg { + name: "gradient_accumulator" + type: DT_RESOURCE + } + input_arg { + name: "gradient_squared_accumulator" + type: DT_RESOURCE + } + input_arg { + name: "grad" + type_attr: "T" + } + input_arg { + name: "indices" + type_attr: "Tindices" + } + input_arg { + name: "lr" + type_attr: "T" + } + input_arg { + name: "l1" + type_attr: "T" + } + input_arg { + name: "l2" + type_attr: "T" + } + input_arg { + name: "global_step" + type: DT_INT64 + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "Tindices" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } +} +op { + name: "ResourceSparseApplyCenteredRMSProp" + input_arg { + name: "var" + type: DT_RESOURCE + } + input_arg { + name: "mg" + type: DT_RESOURCE + } + input_arg { + name: "ms" + type: DT_RESOURCE + } + input_arg { + name: "mom" + type: DT_RESOURCE + } + input_arg { + name: "lr" + type_attr: "T" + } + input_arg { + name: "rho" + type_attr: "T" + } + input_arg { + name: "momentum" + 
type_attr: "T" + } + input_arg { + name: "epsilon" + type_attr: "T" + } + input_arg { + name: "grad" + type_attr: "T" + } + input_arg { + name: "indices" + type_attr: "Tindices" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "Tindices" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } +} +op { + name: "ResourceSparseApplyFtrl" + input_arg { + name: "var" + type: DT_RESOURCE + } + input_arg { + name: "accum" + type: DT_RESOURCE + } + input_arg { + name: "linear" + type: DT_RESOURCE + } + input_arg { + name: "grad" + type_attr: "T" + } + input_arg { + name: "indices" + type_attr: "Tindices" + } + input_arg { + name: "lr" + type_attr: "T" + } + input_arg { + name: "l1" + type_attr: "T" + } + input_arg { + name: "l2" + type_attr: "T" + } + input_arg { + name: "lr_power" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "Tindices" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } +} +op { + name: "ResourceSparseApplyMomentum" + input_arg { + name: "var" + type: DT_RESOURCE + } + input_arg { + name: "accum" + type: DT_RESOURCE + } + input_arg { + name: "lr" + type_attr: "T" + } + input_arg { + name: "grad" + type_attr: "T" + } + input_arg { + name: "indices" + type_attr: "Tindices" + } + input_arg { + name: "momentum" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "Tindices" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } + attr { + name: "use_nesterov" + type: "bool" + default_value { + b: false + } + } +} +op { + name: "ResourceSparseApplyProximalAdagrad" + input_arg { + name: "var" + type: DT_RESOURCE + } + input_arg { + name: "accum" + type: DT_RESOURCE + } + input_arg { + name: "lr" + type_attr: "T" + } + input_arg { + name: "l1" + type_attr: "T" + } + input_arg { + name: "l2" + type_attr: "T" + } + input_arg { + name: "grad" + type_attr: "T" + } + input_arg { + name: "indices" + type_attr: "Tindices" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "Tindices" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: 
DT_INT64 + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } +} +op { + name: "ResourceSparseApplyProximalGradientDescent" + input_arg { + name: "var" + type: DT_RESOURCE + } + input_arg { + name: "alpha" + type_attr: "T" + } + input_arg { + name: "l1" + type_attr: "T" + } + input_arg { + name: "l2" + type_attr: "T" + } + input_arg { + name: "grad" + type_attr: "T" + } + input_arg { + name: "indices" + type_attr: "Tindices" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "Tindices" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } +} +op { + name: "ResourceSparseApplyRMSProp" + input_arg { + name: "var" + type: DT_RESOURCE + } + input_arg { + name: "ms" + type: DT_RESOURCE + } + input_arg { + name: "mom" + type: DT_RESOURCE + } + input_arg { + name: "lr" + type_attr: "T" + } + input_arg { + name: "rho" + type_attr: "T" + } + input_arg { + name: "momentum" + type_attr: "T" + } + input_arg { + name: "epsilon" + type_attr: "T" + } + input_arg { + name: "grad" + type_attr: "T" + } + input_arg { + name: "indices" + type_attr: "Tindices" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "Tindices" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } +} op { name: "Restore" input_arg { @@ -32556,6 +33868,34 @@ op { } } } +op { + name: "Stage" + input_arg { + name: "values" + type_list_attr: "dtypes" + } + attr { + name: "dtypes" + type: "list(type)" + has_minimum: true + minimum: 1 + } + attr { + name: "container" + type: "string" + default_value { + s: "" + } + } + attr { + name: "shared_name" + type: "string" + default_value { + s: "" + } + } + is_stateful: true +} op { name: "StopGradient" input_arg { @@ -37032,6 +38372,34 @@ op { } } } +op { + name: "Unstage" + output_arg { + name: "values" + type_list_attr: "dtypes" + } + attr { + name: "dtypes" + type: "list(type)" + has_minimum: true + minimum: 1 + } + attr { + name: "container" + type: "string" + default_value { + s: "" + } + } + attr { + name: "shared_name" + type: "string" + default_value { + s: "" + } + } + is_stateful: true +} op { name: "Variable" output_arg { diff --git a/tensorflow/core/ops/data_flow_ops.cc b/tensorflow/core/ops/data_flow_ops.cc index ea24a0a16f2..54e766e8e9c 100644 --- a/tensorflow/core/ops/data_flow_ops.cc +++ b/tensorflow/core/ops/data_flow_ops.cc @@ -2180,4 +2180,35 @@ Delete the tensor specified by its handle in the session. handle: The handle for a tensor stored in the session state. 
 )doc");
 
+REGISTER_OP("Stage")
+    .Input("values: dtypes")
+    .Attr("dtypes: list(type)")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .SetShapeFn(shape_inference::UnknownShape)
+    .SetIsStateful()
+    .Doc(R"doc(
+Stage values similar to a lightweight Enqueue. The basic functionality of this
+Op is similar to a queue with many fewer capabilities and options. This Op is
+optimized for performance.
+
+values: a list of tensors
+container: If non-empty, this queue is placed in the given container. Otherwise,
+  a default container is used.
+shared_name: It is necessary to match this name to the matching Unstage Op.
+)doc");
+
+REGISTER_OP("Unstage")
+    .Output("values: dtypes")
+    .Attr("dtypes: list(type)")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .SetShapeFn(shape_inference::UnknownShape)
+    .SetIsStateful()
+    .Doc(R"doc(
+Op is similar to a lightweight Dequeue. The basic functionality is similar to
+dequeue with many fewer capabilities and options. This Op is optimized for
+performance.
+)doc");
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index dd26b30d32f..65ad47e7b73 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -15933,6 +15933,1513 @@ op {
     }
   }
   summary: "Computes the gradient of nearest neighbor interpolation."
 }
+op {
+  name: "ResourceApplyAdadelta"
+  input_arg {
+    name: "var"
+    description: "Should be from a Variable()."
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    description: "Should be from a Variable()."
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum_update"
+    description: "Should be from a Variable()."
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    description: "Scaling factor. Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    description: "Decay factor. Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    description: "Constant factor. Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    description: "The gradient."
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+    description: "If True, updating of the var, accum and update_accum tensors will be protected by\na lock; otherwise the behavior is undefined, but may exhibit less contention."
+  }
+  summary: "Update \'*var\' according to the adadelta scheme."
+  description: "accum = rho() * accum + (1 - rho()) * grad.square();\nupdate = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad;\nupdate_accum = rho() * update_accum + (1 - rho()) * update.square();\nvar -= update;"
+}
+op {
+  name: "ResourceApplyAdagrad"
+  input_arg {
+    name: "var"
+    description: "Should be from a Variable()."
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    description: "Should be from a Variable()."
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    description: "Scaling factor. Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    description: "The gradient."
+ type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + description: "If `True`, updating of the var and accum tensors will be protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention." + } + summary: "Update \'*var\' according to the adagrad scheme." + description: "accum += grad * grad\nvar -= lr * grad * (1 / sqrt(accum))" +} +op { + name: "ResourceApplyAdagradDA" + input_arg { + name: "var" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "gradient_accumulator" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "gradient_squared_accumulator" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "grad" + description: "The gradient." + type_attr: "T" + } + input_arg { + name: "lr" + description: "Scaling factor. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "l1" + description: "L1 regularization. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "l2" + description: "L2 regularization. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "global_step" + description: "Training step number. Must be a scalar." + type: DT_INT64 + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + description: "If True, updating of the var and accum tensors will be protected by\na lock; otherwise the behavior is undefined, but may exhibit less contention." + } + summary: "Update \'*var\' according to the proximal adagrad scheme." +} +op { + name: "ResourceApplyAdam" + input_arg { + name: "var" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "m" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "v" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "beta1_power" + description: "Must be a scalar." + type_attr: "T" + } + input_arg { + name: "beta2_power" + description: "Must be a scalar." + type_attr: "T" + } + input_arg { + name: "lr" + description: "Scaling factor. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "beta1" + description: "Momentum factor. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "beta2" + description: "Momentum factor. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "epsilon" + description: "Ridge term. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "grad" + description: "The gradient." 
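The ResourceApplyAdagrad entry a little earlier in this hunk documents the update as accum += grad * grad followed by var -= lr * grad * (1 / sqrt(accum)). A minimal NumPy sketch of that dense step (the helper name and in-place convention are mine; the real kernel operates on resource handles and honors use_locking):

```python
import numpy as np

def apply_adagrad(var, accum, lr, grad):
    # accum += grad * grad; var -= lr * grad * (1 / sqrt(accum))
    accum += grad * grad
    var -= lr * grad * (1.0 / np.sqrt(accum))
    return var, accum

var = np.array([1.0, 2.0])
accum = np.array([0.1, 0.1])
print(apply_adagrad(var, accum, lr=0.1, grad=np.array([0.5, -0.3])))
```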
+ type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + description: "If `True`, updating of the var, m, and v tensors will be protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention." + } + summary: "Update \'*var\' according to the Adam algorithm." + description: "lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)\nm_t <- beta1 * m_{t-1} + (1 - beta1) * g_t\nv_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t\nvariable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)" +} +op { + name: "ResourceApplyCenteredRMSProp" + input_arg { + name: "var" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "mg" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "ms" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "mom" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "lr" + description: "Scaling factor. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "rho" + description: "Decay rate. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "momentum" + type_attr: "T" + } + input_arg { + name: "epsilon" + description: "Ridge term. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "grad" + description: "The gradient." + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + description: "If `True`, updating of the var, mg, ms, and mom tensors is\nprotected by a lock; otherwise the behavior is undefined, but may exhibit less\ncontention." + } + summary: "Update \'*var\' according to the centered RMSProp algorithm." + description: "The centered RMSProp algorithm uses an estimate of the centered second moment\n(i.e., the variance) for normalization, as opposed to regular RMSProp, which\nuses the (uncentered) second moment. This often helps with training, but is\nslightly more expensive in terms of computation and memory.\n\nNote that in dense implementation of this algorithm, mg, ms, and mom will\nupdate even if the grad is zero, but in this sparse implementation, mg, ms,\nand mom will not update in iterations during which the grad is zero.\n\nmean_square = decay * mean_square + (1-decay) * gradient ** 2\nmean_grad = decay * mean_grad + (1-decay) * gradient\n\nDelta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)\n\nmg <- rho * mg_{t-1} + (1-rho) * grad\nms <- rho * ms_{t-1} + (1-rho) * grad * grad\nmom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon)\nvar <- var - mom" +} +op { + name: "ResourceApplyFtrl" + input_arg { + name: "var" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "accum" + description: "Should be from a Variable()." 
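The ResourceApplyAdam entry above carries the four Adam formulas, with beta1_power and beta2_power (the betas raised to the step count) supplied as scalar inputs rather than computed internally. A NumPy sketch that follows those formulas literally (illustrative only, no locking or resource handling):

```python
import numpy as np

def apply_adam(var, m, v, beta1_power, beta2_power, lr,
               beta1, beta2, epsilon, grad):
    # lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
    lr_t = lr * np.sqrt(1 - beta2_power) / (1 - beta1_power)
    m[:] = beta1 * m + (1 - beta1) * grad          # m_t
    v[:] = beta2 * v + (1 - beta2) * grad * grad   # v_t
    var -= lr_t * m / (np.sqrt(v) + epsilon)
    return var

var = np.ones(3); m = np.zeros(3); v = np.zeros(3)
apply_adam(var, m, v, beta1_power=0.9, beta2_power=0.999, lr=0.001,
           beta1=0.9, beta2=0.999, epsilon=1e-8,
           grad=np.array([0.1, -0.2, 0.3]))
```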
+ type: DT_RESOURCE + } + input_arg { + name: "linear" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "grad" + description: "The gradient." + type_attr: "T" + } + input_arg { + name: "lr" + description: "Scaling factor. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "l1" + description: "L1 regularization. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "l2" + description: "L2 regularization. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "lr_power" + description: "Scaling factor. Must be a scalar." + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + description: "If `True`, updating of the var and accum tensors will be protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention." + } + summary: "Update \'*var\' according to the Ftrl-proximal scheme." + description: "accum_new = accum + grad * grad\nlinear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var\nquadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2\nvar = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0\naccum = accum_new" +} +op { + name: "ResourceApplyGradientDescent" + input_arg { + name: "var" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "alpha" + description: "Scaling factor. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "delta" + description: "The change." + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + description: "If `True`, the subtraction will be protected by a lock;\notherwise the behavior is undefined, but may exhibit less contention." + } + summary: "Update \'*var\' by subtracting \'alpha\' * \'delta\' from it." +} +op { + name: "ResourceApplyMomentum" + input_arg { + name: "var" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "accum" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "lr" + description: "Scaling factor. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "grad" + description: "The gradient." + type_attr: "T" + } + input_arg { + name: "momentum" + description: "Momentum. Must be a scalar."
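The Ftrl-proximal entry above spells out its update in five lines of pseudo-code. A NumPy transcription of exactly those lines, assuming strictly positive accumulators and the usual negative lr_power (e.g. -0.5); this is a sketch of the documented math, not the kernel:

```python
import numpy as np

def apply_ftrl(var, accum, linear, grad, lr, l1, l2, lr_power):
    accum_new = accum + grad * grad
    linear += grad + (accum_new**(-lr_power) - accum**(-lr_power)) / lr * var
    quadratic = 1.0 / (accum_new**lr_power * lr) + 2 * l2
    # var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
    var[:] = np.where(np.abs(linear) > l1,
                      (np.sign(linear) * l1 - linear) / quadratic, 0.0)
    accum[:] = accum_new
    return var, accum, linear

var = np.zeros(2); accum = np.full(2, 0.1); linear = np.zeros(2)
apply_ftrl(var, accum, linear, grad=np.array([0.4, -0.6]),
           lr=0.1, l1=0.01, l2=0.1, lr_power=-0.5)
```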
+ type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + description: "If `True`, updating of the var and accum tensors will be protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention." + } + attr { + name: "use_nesterov" + type: "bool" + default_value { + b: false + } + description: "If `True`, the tensor passed to compute grad will be\nvar - lr * momentum * accum, so in the end, the var you get is actually\nvar - lr * momentum * accum." + } + summary: "Update \'*var\' according to the momentum scheme." + description: "Set use_nesterov = True if you want to use Nesterov momentum.\n\naccum = accum * momentum + grad\nvar -= lr * accum" +} +op { + name: "ResourceApplyProximalAdagrad" + input_arg { + name: "var" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "accum" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "lr" + description: "Scaling factor. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "l1" + description: "L1 regularization. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "l2" + description: "L2 regularization. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "grad" + description: "The gradient." + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + description: "If True, updating of the var and accum tensors will be protected by\na lock; otherwise the behavior is undefined, but may exhibit less contention." + } + summary: "Update \'*var\' and \'*accum\' according to FOBOS with Adagrad learning rate." + description: "accum += grad * grad\nprox_v = var - lr * grad * (1 / sqrt(accum))\nvar = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}" +} +op { + name: "ResourceApplyProximalGradientDescent" + input_arg { + name: "var" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "alpha" + description: "Scaling factor. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "l1" + description: "L1 regularization. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "l2" + description: "L2 regularization. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "delta" + description: "The change."
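The proximal-adagrad (FOBOS with Adagrad learning rate) entry above combines the adagrad accumulator with a soft-threshold step. A NumPy sketch of the documented formulas (helper name is mine; the max{., 0} clamp is what implements the L1 shrinkage):

```python
import numpy as np

def apply_proximal_adagrad(var, accum, lr, l1, l2, grad):
    accum += grad * grad
    prox_v = var - lr * grad * (1.0 / np.sqrt(accum))
    # var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1, 0}
    var[:] = (np.sign(prox_v) / (1 + lr * l2)
              * np.maximum(np.abs(prox_v) - lr * l1, 0.0))
    return var, accum
```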
+ type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + description: "If True, the subtraction will be protected by a lock;\notherwise the behavior is undefined, but may exhibit less contention." + } + summary: "Update \'*var\' as FOBOS algorithm with fixed learning rate." + description: "prox_v = var - alpha * delta\nvar = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}" +} +op { + name: "ResourceApplyRMSProp" + input_arg { + name: "var" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "ms" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "mom" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "lr" + description: "Scaling factor. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "rho" + description: "Decay rate. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "momentum" + type_attr: "T" + } + input_arg { + name: "epsilon" + description: "Ridge term. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "grad" + description: "The gradient." + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + description: "If `True`, updating of the var, ms, and mom tensors is protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention." + } + summary: "Update \'*var\' according to the RMSProp algorithm." + description: "Note that in dense implementation of this algorithm, ms and mom will\nupdate even if the grad is zero, but in this sparse implementation, ms\nand mom will not update in iterations during which the grad is zero.\n\nmean_square = decay * mean_square + (1-decay) * gradient ** 2\nDelta = learning_rate * gradient / sqrt(mean_square + epsilon)\n\nms <- rho * ms_{t-1} + (1-rho) * grad * grad\nmom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)\nvar <- var - mom" +} +op { + name: "ResourceSparseApplyAdadelta" + input_arg { + name: "var" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "accum" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "accum_update" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "lr" + description: "Learning rate. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "rho" + description: "Decay factor. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "epsilon" + description: "Constant factor. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "grad" + description: "The gradient." + type_attr: "T" + } + input_arg { + name: "indices" + description: "A vector of indices into the first dimension of var and accum."
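The RMSProp entry above gives the uncentered update in three lines. A NumPy sketch following those lines (sketch only; the kernel additionally supports locking and resource inputs):

```python
import numpy as np

def apply_rms_prop(var, ms, mom, lr, rho, momentum, epsilon, grad):
    # ms <- rho * ms_{t-1} + (1-rho) * grad * grad
    ms[:] = rho * ms + (1 - rho) * grad * grad
    # mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
    mom[:] = momentum * mom + lr * grad / np.sqrt(ms + epsilon)
    var -= mom  # var <- var - mom
    return var
```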
+ type_attr: "Tindices" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "Tindices" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + description: "If True, updating of the var and accum tensors will be protected by\na lock; otherwise the behavior is undefined, but may exhibit less contention." + } + summary: "Update relevant entries in \'*var\' and \'*accum\' according to the adadelta scheme." +} +op { + name: "ResourceSparseApplyAdagrad" + input_arg { + name: "var" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "accum" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "lr" + description: "Learning rate. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "grad" + description: "The gradient." + type_attr: "T" + } + input_arg { + name: "indices" + description: "A vector of indices into the first dimension of var and accum." + type_attr: "Tindices" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "Tindices" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + description: "If `True`, updating of the var and accum tensors will be protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention." + } + summary: "Update relevant entries in \'*var\' and \'*accum\' according to the adagrad scheme." + description: "That is for rows we have grad for, we update var and accum as follows:\naccum += grad * grad\nvar -= lr * grad * (1 / sqrt(accum))" +} +op { + name: "ResourceSparseApplyAdagradDA" + input_arg { + name: "var" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "gradient_accumulator" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "gradient_squared_accumulator" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "grad" + description: "The gradient." + type_attr: "T" + } + input_arg { + name: "indices" + description: "A vector of indices into the first dimension of var and accum." + type_attr: "Tindices" + } + input_arg { + name: "lr" + description: "Learning rate. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "l1" + description: "L1 regularization. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "l2" + description: "L2 regularization. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "global_step" + description: "Training step number. Must be a scalar."
+ type: DT_INT64 + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "Tindices" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + description: "If True, updating of the var and accum tensors will be protected by\na lock; otherwise the behavior is undefined, but may exhibit less contention." + } + summary: "Update entries in \'*var\' and \'*accum\' according to the proximal adagrad scheme." +} +op { + name: "ResourceSparseApplyCenteredRMSProp" + input_arg { + name: "var" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "mg" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "ms" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "mom" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "lr" + description: "Scaling factor. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "rho" + description: "Decay rate. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "momentum" + type_attr: "T" + } + input_arg { + name: "epsilon" + description: "Ridge term. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "grad" + description: "The gradient." + type_attr: "T" + } + input_arg { + name: "indices" + description: "A vector of indices into the first dimension of var, ms and mom." + type_attr: "Tindices" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "Tindices" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + description: "If `True`, updating of the var, mg, ms, and mom tensors is\nprotected by a lock; otherwise the behavior is undefined, but may exhibit less\ncontention." + } + summary: "Update \'*var\' according to the centered RMSProp algorithm." + description: "The centered RMSProp algorithm uses an estimate of the centered second moment\n(i.e., the variance) for normalization, as opposed to regular RMSProp, which\nuses the (uncentered) second moment. 
This often helps with training, but is\nslightly more expensive in terms of computation and memory.\n\nNote that in dense implementation of this algorithm, mg, ms, and mom will\nupdate even if the grad is zero, but in this sparse implementation, mg, ms,\nand mom will not update in iterations during which the grad is zero.\n\nmean_square = decay * mean_square + (1-decay) * gradient ** 2\nmean_grad = decay * mean_grad + (1-decay) * gradient\nDelta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)\n\nmg <- rho * mg_{t-1} + (1-rho) * grad\nms <- rho * ms_{t-1} + (1-rho) * grad * grad\nmom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon)\nvar <- var - mom" +} +op { + name: "ResourceSparseApplyFtrl" + input_arg { + name: "var" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "accum" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "linear" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "grad" + description: "The gradient." + type_attr: "T" + } + input_arg { + name: "indices" + description: "A vector of indices into the first dimension of var and accum." + type_attr: "Tindices" + } + input_arg { + name: "lr" + description: "Scaling factor. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "l1" + description: "L1 regularization. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "l2" + description: "L2 regularization. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "lr_power" + description: "Scaling factor. Must be a scalar." + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "Tindices" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + description: "If `True`, updating of the var and accum tensors will be protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention." + } + summary: "Update relevant entries in \'*var\' according to the Ftrl-proximal scheme." + description: "That is for rows we have grad for, we update var, accum and linear as follows:\naccum_new = accum + grad * grad\nlinear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var\nquadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2\nvar = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0\naccum = accum_new" +} +op { + name: "ResourceSparseApplyMomentum" + input_arg { + name: "var" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "accum" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "lr" + description: "Learning rate. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "grad" + description: "The gradient." + type_attr: "T" + } + input_arg { + name: "indices" + description: "A vector of indices into the first dimension of var and accum." + type_attr: "Tindices" + } + input_arg { + name: "momentum" + description: "Momentum. Must be a scalar."
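All of the sparse variants in this hunk share one pattern, captured by the recurring phrase "for rows we have grad for": grad is a dense block of rows, and row k of grad applies only to row indices[k] of the variables; unindexed rows are left untouched. A NumPy sketch of that pattern, using the adagrad update as the per-row rule (the loop and names are illustrative, not the kernel):

```python
import numpy as np

def sparse_apply_adagrad(var, accum, lr, grad, indices):
    # grad has shape [len(indices), ...]; grad[k] updates var[indices[k]].
    for k, i in enumerate(indices):
        g = grad[k]
        accum[i] += g * g
        var[i] -= lr * g * (1.0 / np.sqrt(accum[i]))
    return var, accum

var = np.ones((4, 2)); accum = np.full((4, 2), 0.1)
sparse_apply_adagrad(var, accum, lr=0.1,
                     grad=np.array([[0.5, -0.3]]), indices=[2])
```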
+ type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "Tindices" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + description: "If `True`, updating of the var and accum tensors will be protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention." + } + attr { + name: "use_nesterov" + type: "bool" + default_value { + b: false + } + description: "If `True`, the tensor passed to compute grad will be\nvar - lr * momentum * accum, so in the end, the var you get is actually\nvar - lr * momentum * accum." + } + summary: "Update relevant entries in \'*var\' and \'*accum\' according to the momentum scheme." + description: "Set use_nesterov = True if you want to use Nesterov momentum.\n\nThat is for rows we have grad for, we update var and accum as follows:\n\naccum = accum * momentum + grad\nvar -= lr * accum" +} +op { + name: "ResourceSparseApplyProximalAdagrad" + input_arg { + name: "var" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "accum" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "lr" + description: "Learning rate. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "l1" + description: "L1 regularization. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "l2" + description: "L2 regularization. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "grad" + description: "The gradient." + type_attr: "T" + } + input_arg { + name: "indices" + description: "A vector of indices into the first dimension of var and accum." + type_attr: "Tindices" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "Tindices" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + description: "If True, updating of the var and accum tensors will be protected by\na lock; otherwise the behavior is undefined, but may exhibit less contention." + } + summary: "Sparse update entries in \'*var\' and \'*accum\' according to FOBOS algorithm." + description: "That is for rows we have grad for, we update var and accum as follows:\naccum += grad * grad\nprox_v = var\nprox_v -= lr * grad * (1 / sqrt(accum))\nvar = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}" +} +op { + name: "ResourceSparseApplyProximalGradientDescent" + input_arg { + name: "var" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "alpha" + description: "Scaling factor. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "l1" + description: "L1 regularization. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "l2" + description: "L2 regularization. Must be a scalar." 
+ type_attr: "T" + } + input_arg { + name: "grad" + description: "The gradient." + type_attr: "T" + } + input_arg { + name: "indices" + description: "A vector of indices into the first dimension of var and accum." + type_attr: "Tindices" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "Tindices" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + description: "If True, the subtraction will be protected by a lock;\notherwise the behavior is undefined, but may exhibit less contention." + } + summary: "Sparse update \'*var\' as FOBOS algorithm with fixed learning rate." + description: "That is for rows we have grad for, we update var as follows:\nprox_v = var - alpha * grad\nvar = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}" +} +op { + name: "ResourceSparseApplyRMSProp" + input_arg { + name: "var" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "ms" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "mom" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "lr" + description: "Scaling factor. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "rho" + description: "Decay rate. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "momentum" + type_attr: "T" + } + input_arg { + name: "epsilon" + description: "Ridge term. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "grad" + description: "The gradient." + type_attr: "T" + } + input_arg { + name: "indices" + description: "A vector of indices into the first dimension of var, ms and mom." + type_attr: "Tindices" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "Tindices" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + description: "If `True`, updating of the var, ms, and mom tensors is protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention." + } + summary: "Update \'*var\' according to the RMSProp algorithm." + description: "Note that in dense implementation of this algorithm, ms and mom will\nupdate even if the grad is zero, but in this sparse implementation, ms\nand mom will not update in iterations during which the grad is zero.\n\nmean_square = decay * mean_square + (1-decay) * gradient ** 2\nDelta = learning_rate * gradient / sqrt(mean_square + epsilon)\n\nms <- rho * ms_{t-1} + (1-rho) * grad * grad\nmom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)\nvar <- var - mom" +} op { name: "Restore" input_arg { @@ -21028,6 +22535,39 @@ op { } summary: "Push an element onto the stack." 
} +op { + name: "Stage" + input_arg { + name: "values" + description: "a list of tensors" + type_list_attr: "dtypes" + } + attr { + name: "dtypes" + type: "list(type)" + has_minimum: true + minimum: 1 + } + attr { + name: "container" + type: "string" + default_value { + s: "" + } + description: "If non-empty, this queue is placed in the given container. Otherwise,\na default container is used." + } + attr { + name: "shared_name" + type: "string" + default_value { + s: "" + } + description: "It is necessary to match this name to the matching Unstage Op." + } + summary: "Stage values similar to a lightweight Enqueue." + description: "The basic functionality of this Op is similar to a queue with many fewer\ncapabilities and options. This Op is optimized for performance." + is_stateful: true +} op { name: "StopGradient" input_arg { @@ -23604,6 +25144,36 @@ op { summary: "Computes the sum along segments of a tensor." description: "Read [the section on\nSegmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation\nof segments.\n\nComputes a tensor such that\n`(output[i] = sum_{j...} data[j...]` where the sum is over tuples `j...` such\nthat `segment_ids[j...] == i`. Unlike `SegmentSum`, `segment_ids`\nneed not be sorted and need not cover all values in the full\nrange of valid values.\n\nIf the sum is empty for a given segment ID `i`, `output[i] = 0`.\n\n`num_segments` should equal the number of distinct segment IDs.\n\n
<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"../../images/UnsortedSegmentSum.png\" alt>\n</div>
" } +op { + name: "Unstage" + output_arg { + name: "values" + type_list_attr: "dtypes" + } + attr { + name: "dtypes" + type: "list(type)" + has_minimum: true + minimum: 1 + } + attr { + name: "container" + type: "string" + default_value { + s: "" + } + } + attr { + name: "shared_name" + type: "string" + default_value { + s: "" + } + } + summary: "Op is similar to a lightweight Dequeue. The basic funtionality is similar to" + description: "dequeue with many fewer capabilities and options. This Op is optimized for\nperformance." + is_stateful: true +} op { name: "Variable" output_arg { diff --git a/tensorflow/core/ops/set_ops.cc b/tensorflow/core/ops/set_ops.cc index 3da83ddae24..fad70072071 100644 --- a/tensorflow/core/ops/set_ops.cc +++ b/tensorflow/core/ops/set_ops.cc @@ -64,24 +64,20 @@ REGISTER_OP("DenseToDenseSetOperation") } // The following should stay in sync with `ComputeDenseToDense` shape // assertions in kernels/set_kernels.cc. - // Dimension n contains the set values to be compared, so ranks and the - // first n-1 dimensions of inputs and output must match. + // Dimension n contains the set values to be compared, so ranks must be + // >= 2, and the first n-1 dimensions of inputs and output must be + // compatible. DimensionHandle output_rank; ShapeHandle input0_shape = c->input(0); + TF_RETURN_IF_ERROR(c->WithRankAtLeast(input0_shape, 2, &input0_shape)); if (c->RankKnown(input0_shape)) { const int32 input0_rank = c->Rank(input0_shape); - if (input0_rank < 2) { - return errors::InvalidArgument("Input 0, expected rank >= 2, got ", - input0_rank, "."); - } ShapeHandle input1_shape = c->input(1); + TF_RETURN_IF_ERROR( + c->WithRank(input1_shape, input0_rank, &input1_shape)); if (c->RankKnown(input1_shape)) { + // If both ranks are specified, the first n-1 dims must be compatible. const int32 rank = c->Rank(input1_shape); - if (input0_rank != rank) { - return errors::InvalidArgument("Ranks do not match: input 0 ", - input0_rank, ", input 1 ", rank, - "."); - } ShapeHandle group0_shape; TF_RETURN_IF_ERROR( c->Subshape(input0_shape, 0, rank - 1, &group0_shape)); @@ -95,28 +91,16 @@ REGISTER_OP("DenseToDenseSetOperation") output_rank = c->MakeDim(input0_rank); } else { ShapeHandle input1_shape = c->input(1); + TF_RETURN_IF_ERROR(c->WithRankAtLeast(input1_shape, 2, &input1_shape)); if (c->RankKnown(input1_shape)) { - const int32 input1_rank = c->Rank(input1_shape); - if (input1_rank < 2) { - return errors::InvalidArgument("Input 0, expected rank >= 2, got ", - input1_rank, "."); - } - output_rank = c->MakeDim(input1_rank); + output_rank = c->MakeDim(c->Rank(input1_shape)); } else { output_rank = c->UnknownDim(); } } - DimensionHandle output_num_elements = c->Dim(input0_shape, 0); - if (!c->ValueKnown(output_num_elements)) { - ShapeHandle input1_shape = c->input(1); - output_num_elements = c->Dim(input1_shape, 0); - if (!c->ValueKnown(output_num_elements)) { - output_num_elements = c->UnknownDim(); - } - } - c->set_output(0, c->Matrix(output_num_elements, output_rank)); - c->set_output(1, c->Vector(output_num_elements)); + c->set_output(0, c->Matrix(c->UnknownDim(), output_rank)); + c->set_output(1, c->Vector(c->UnknownDim())); c->set_output(2, c->Vector(output_rank)); return Status::OK(); }) @@ -159,30 +143,30 @@ REGISTER_OP("DenseToSparseSetOperation") } // The following should stay in sync with `ComputeDenseToSparse` shape // assertions in kernels/set_kernels.cc. 
- // Dimension n contains the set values to be compared, so ranks and the - // first n-1 dimensions of inputs and output must match. - DimensionHandle output_rank; + // Ranks must be compatible, and be >= 2. + ShapeHandle input1_shape_shape = c->input(3); + TF_RETURN_IF_ERROR(shape_inference::ValidateSparseTensor( + c, c->input(1), c->input(2), input1_shape_shape)); + + DimensionHandle input1_rank_dim = c->Dim(input1_shape_shape, 0); + + DimensionHandle output_rank_dim; ShapeHandle input0_shape = c->input(0); + TF_RETURN_IF_ERROR(c->WithRankAtLeast(input0_shape, 2, &input0_shape)); if (c->RankKnown(input0_shape)) { const int32 input0_rank = c->Rank(input0_shape); - if (input0_rank < 2) { - return errors::InvalidArgument("Input 0, expected rank >= 2, got ", - input0_rank, "."); - } - output_rank = c->MakeDim(input0_rank); + TF_RETURN_IF_ERROR( + c->WithValue(input1_rank_dim, input0_rank, &input1_rank_dim)); + output_rank_dim = c->MakeDim(input0_rank); + } else if (c->ValueKnown(input1_rank_dim)) { + output_rank_dim = input1_rank_dim; } else { - output_rank = c->UnknownDim(); - } - TF_RETURN_IF_ERROR(shape_inference::ValidateSparseTensor( - c, c->input(1), c->input(2), c->input(3))); - DimensionHandle output_num_elements = c->Dim(input0_shape, 0); - if (!c->ValueKnown(output_num_elements)) { - output_num_elements = c->UnknownDim(); + output_rank_dim = c->UnknownDim(); } - c->set_output(0, c->Matrix(output_num_elements, output_rank)); - c->set_output(1, c->Vector(output_num_elements)); - c->set_output(2, c->Vector(output_rank)); + c->set_output(0, c->Matrix(c->UnknownDim(), output_rank_dim)); + c->set_output(1, c->Vector(c->UnknownDim())); + c->set_output(2, c->Vector(output_rank_dim)); return Status::OK(); }) .Doc(R"doc( @@ -239,13 +223,40 @@ REGISTER_OP("SparseToSparseSetOperation") } // The following should stay in sync with `ComputeSparseToSparse` shape // assertions in kernels/set_kernels.cc. + // Ranks must be compatible, and be >= 2. 
+ ShapeHandle input0_shape_shape = c->input(2); + ShapeHandle input1_shape_shape = c->input(5); TF_RETURN_IF_ERROR(shape_inference::ValidateSparseTensor( - c, c->input(0), c->input(1), c->input(2))); + c, c->input(0), c->input(1), input0_shape_shape)); TF_RETURN_IF_ERROR(shape_inference::ValidateSparseTensor( - c, c->input(3), c->input(4), c->input(5))); - c->set_output(0, c->Matrix(c->UnknownDim(), c->UnknownDim())); + c, c->input(3), c->input(4), input1_shape_shape)); + + DimensionHandle input0_rank_dim = c->Dim(input0_shape_shape, 0); + DimensionHandle input1_rank_dim = c->Dim(input1_shape_shape, 0); + DimensionHandle output_rank_dim; + if (c->ValueKnown(input0_rank_dim)) { + const int32 input0_rank = c->Value(input0_rank_dim); + if (input0_rank < 2) { + return errors::InvalidArgument("Input 0, expected rank >= 2, got ", + input0_rank, "."); + } + TF_RETURN_IF_ERROR( + c->WithValue(input1_rank_dim, input0_rank, &input1_rank_dim)); + output_rank_dim = input0_rank_dim; + } else if (c->ValueKnown(input1_rank_dim)) { + const int32 input1_rank = c->Value(input1_rank_dim); + if (input1_rank < 2) { + return errors::InvalidArgument("Input 1, expected rank >= 2, got ", + input1_rank, "."); + } + output_rank_dim = input1_rank_dim; + } else { + output_rank_dim = c->UnknownDim(); + } + + c->set_output(0, c->Matrix(c->UnknownDim(), output_rank_dim)); c->set_output(1, c->Vector(c->UnknownDim())); - c->set_output(2, c->Vector(c->UnknownDim())); + c->set_output(2, c->Vector(output_rank_dim)); return Status::OK(); }) .Doc(R"doc( diff --git a/tensorflow/core/ops/set_ops_test.cc b/tensorflow/core/ops/set_ops_test.cc index 75da599767e..3a58eb974ff 100644 --- a/tensorflow/core/ops/set_ops_test.cc +++ b/tensorflow/core/ops/set_ops_test.cc @@ -34,16 +34,16 @@ TEST(SetOpsTest, DenseToDenseShape) { INFER_OK(op, "?;?", "[?,?];[?];[?]"); // Invalid rank. - INFER_ERROR("expected rank >= 2", op, "[?];?"); - INFER_ERROR("expected rank >= 2", op, "?;[?]"); - INFER_ERROR("expected rank >= 2", op, "[2];?"); - INFER_ERROR("expected rank >= 2", op, "?;[2]"); + INFER_ERROR("Shape must be at least rank 2 but is rank 1", op, "[?];?"); + INFER_ERROR("Shape must be at least rank 2 but is rank 1", op, "?;[?]"); + INFER_ERROR("Shape must be at least rank 2 but is rank 1", op, "[2];?"); + INFER_ERROR("Shape must be at least rank 2 but is rank 1", op, "?;[2]"); // Mismatched ranks. - INFER_ERROR("Ranks do not match", op, "[?,?];[?,?,?]"); - INFER_ERROR("Ranks do not match", op, "[?,?,?];[?,?]"); - INFER_ERROR("Ranks do not match", op, "[2,1];[2,1,2]"); - INFER_ERROR("Ranks do not match", op, "[2,1,2];[2,1]"); + INFER_ERROR("Shape must be rank 2 but is rank 3", op, "[?,?];[?,?,?]"); + INFER_ERROR("Shape must be rank 3 but is rank 2", op, "[?,?,?];[?,?]"); + INFER_ERROR("Shape must be rank 2 but is rank 3", op, "[2,1];[2,1,2]"); + INFER_ERROR("Shape must be rank 3 but is rank 2", op, "[2,1,2];[2,1]"); // Rank 2, unknown dims. INFER_OK(op, "[?,?];?", "[?,2];[?];[2]"); @@ -55,26 +55,26 @@ TEST(SetOpsTest, DenseToDenseShape) { INFER_OK(op, "?;[?,?,?,?]", "[?,4];[?];[4]"); INFER_OK(op, "[?,?,?,?];[?,?,?,?]", "[?,4];[?];[4]"); - // Known dimension 0. - INFER_OK(op, "[4,?,?,?];?", "[d0_0,4];[d0_0];[4]"); - INFER_OK(op, "?;[4,?,?,?]", "[d1_0,4];[d1_0];[4]"); - INFER_OK(op, "[4,?,?,?];[?,?,?,?]", "[d0_0,4];[d0_0];[4]"); - INFER_OK(op, "[?,?,?,?];[4,?,?,?]", "[d1_0,4];[d1_0];[4]"); - INFER_OK(op, "[4,?,?,?];[4,?,?,?]", "[d0_0,4];[d0_0];[4]"); + // Known rank for 1 input. 
+ INFER_OK(op, "[5,3,2,1];?", "[?,4];[?];[4]"); + INFER_OK(op, "?;[5,3,2,1]", "[?,4];[?];[4]"); + INFER_OK(op, "[5,3,2,1];[?,?,?,?]", "[?,4];[?];[4]"); + INFER_OK(op, "[?,?,?,?];[5,3,2,1]", "[?,4];[?];[4]"); + INFER_OK(op, "[5,3,2,1];[?,?,?,?]", "[?,4];[?];[4]"); - // Mismatched known n-1 dims. + // Mismatched n-1 dims. INFER_ERROR("Dimension 0 in both shapes must be equal", op, "[4,?,2,?];[3,1,?,5]"); INFER_ERROR("Dimension 2 in both shapes must be equal", op, "[4,3,2,1];[4,3,3,1]"); - // Matched known n-1 dims. - INFER_OK(op, "[4,5,6,7];[?,?,?,?]", "[d0_0,4];[d0_0];[4]"); - INFER_OK(op, "[4,5,6,7];[?,?,?,4]", "[d0_0,4];[d0_0];[4]"); - INFER_OK(op, "[?,?,?,?];[4,5,6,7]", "[d1_0,4];[d1_0];[4]"); - INFER_OK(op, "[4,?,2,?];[?,1,?,5]", "[d0_0,4];[d0_0];[4]"); - INFER_OK(op, "[4,5,6,7];[4,?,6,?]", "[d0_0,4];[d0_0];[4]"); - INFER_OK(op, "[4,5,6,7];[4,5,6,4]", "[d0_0,4];[d0_0];[4]"); + // Matched n-1 dims. + INFER_OK(op, "[4,5,6,7];[?,?,?,?]", "[?,4];[?];[4]"); + INFER_OK(op, "[4,5,6,7];[?,?,?,4]", "[?,4];[?];[4]"); + INFER_OK(op, "[?,?,?,?];[4,5,6,7]", "[?,4];[?];[4]"); + INFER_OK(op, "[4,?,2,?];[?,1,?,5]", "[?,4];[?];[4]"); + INFER_OK(op, "[4,5,6,7];[4,?,6,?]", "[?,4];[?];[4]"); + INFER_OK(op, "[4,5,6,7];[4,5,6,4]", "[?,4];[?];[4]"); } TEST(SetOpsTest, DenseToSparseShape_InvalidNumberOfInputs) { @@ -89,35 +89,37 @@ TEST(SetOpsTest, DenseToSparseShape) { // Unknown shapes. INFER_OK(op, "?;?;?;?", "[?,?];[?];[?]"); + INFER_OK(op, "?;[?,?];[?];[?]", "[?,?];[?];[?]"); // Invalid rank. - INFER_ERROR("expected rank >= 2", op, "[?];?;?;?"); - INFER_ERROR("expected rank >= 2", op, "[?];[?,?];[?];[?]"); - INFER_ERROR("expected rank >= 2", op, "[?];[5,3];[5];[3]"); - INFER_ERROR("expected rank >= 2", op, "[2];?;?;?"); - INFER_ERROR("expected rank >= 2", op, "[2];[?,?];[?];[?]"); - INFER_ERROR("expected rank >= 2", op, "[2];[5,3];[5];[3]"); + INFER_ERROR("Shape must be at least rank 2 but is rank 1", op, "[?];?;?;?"); + INFER_ERROR("Shape must be at least rank 2 but is rank 1", op, + "[?];[?,?];[?];[?]"); + INFER_ERROR("Shape must be at least rank 2 but is rank 1", op, + "[?];[5,3];[5];[3]"); + INFER_ERROR("Shape must be at least rank 2 but is rank 1", op, "[2];?;?;?"); + INFER_ERROR("Shape must be at least rank 2 but is rank 1", op, + "[2];[?,?];[?];[?]"); + INFER_ERROR("Shape must be at least rank 2 but is rank 1", op, + "[2];[5,3];[5];[3]"); - // Rank 2, unknown dims. + // Unknown sparse rank. INFER_OK(op, "[?,?];?;?;?", "[?,2];[?];[2]"); INFER_OK(op, "[?,?];[?,?];[?];[?]", "[?,2];[?];[2]"); - INFER_OK(op, "[?,?];[5,3];[5];[3]", "[?,2];[?];[2]"); - // Rank 4, unknown dims. - INFER_OK(op, "[?,?,?,?];?;?;?", "[?,4];[?];[4]"); - INFER_OK(op, "[?,?,?,?];[?,?];[?];[?]", "[?,4];[?];[4]"); - INFER_OK(op, "[?,?,?,?];[5,3];[5];[3]", "[?,4];[?];[4]"); + // Unknown dense rank. + INFER_OK(op, "?;[?,2];[?];[2]", "[?,d3_0];[?];[d3_0]"); + INFER_OK(op, "?;[5,2];[5];[2]", "[?,d3_0];[?];[d3_0]"); - // Known dimension 0. - INFER_OK(op, "[4,?,?,?];?;?;?", "[d0_0,4];[d0_0];[4]"); - INFER_OK(op, "[4,?,?,?];[?,?];[?];[?]", "[d0_0,4];[d0_0];[4]"); - INFER_OK(op, "[4,?,?,?];[5,3];[5];[3]", "[d0_0,4];[d0_0];[4]"); + // Known both ranks. + INFER_OK(op, "[?,?];[5,2];[5];[2]", "[?,2];[?];[2]"); + INFER_OK(op, "[4,3];[5,2];[5];[2]", "[?,2];[?];[2]"); // Invalid input sparse tensor. 
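The updated expectations in these tests can be summarized compactly: once either operand's rank r is known, the op's outputs are a [?, r] indices matrix, a [?] values vector, and an [r] shape vector; the row count is no longer propagated from dimension 0. A toy Python model of the relaxed DenseToDense inference, assuming each shape is either fully known (a list) or fully unknown (None); the real shape functions also track partially known dimensions:

```python
def dense_to_dense_shapes(shape0, shape1):
    rank = None
    for s in (shape0, shape1):
        if s is not None:
            if len(s) < 2:
                raise ValueError("expected rank >= 2, got %d" % len(s))
            if rank is not None and len(s) != rank:
                raise ValueError("ranks %d and %d do not match" % (rank, len(s)))
            rank = len(s)
    r = rank if rank is not None else "?"
    return [["?", r], ["?"], [r]]  # result_indices, result_values, result_shape

print(dense_to_dense_shapes([5, 3, 2, 1], None))  # [['?', 4], ['?'], [4]]
```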
INFER_ERROR("elements in index (5) and values (6) do not match", op, - "[?,?];[5,3];[6];[3]"); + "?;[5,3];[6];[3]"); INFER_ERROR("rank (3) and shape rank (4) do not match", op, - "[?,?];[5,3];[5];[4]"); + "?;[5,3];[5];[4]"); } TEST(SetOpsTest, SparseToSparseShape_InvalidNumberOfInputs) { @@ -128,7 +130,21 @@ TEST(SetOpsTest, SparseToSparseShape_InvalidNumberOfInputs) { TEST(SetOpsTest, SparseToSparseShape) { ShapeInferenceTestOp op("SparseToSparseSetOperation"); + + // Unknown. INFER_OK(op, "?;?;?;?;?;?", "[?,?];[?];[?]"); + INFER_OK(op, "[?,?];[?];[?];[?,?];[?];[?]", "[?,?];[?];[?]"); + INFER_OK(op, "?;?;?;[?,?];[?];[?]", "[?,?];[?];[?]"); + INFER_OK(op, "[?,?];[?];[?];?;?;?", "[?,?];[?];[?]"); + + // Known rank for 1 input. + INFER_OK(op, "[?,2];[?];[2];?;?;?", "[?,d2_0];[?];[d2_0]"); + INFER_OK(op, "?;?;?;[?,2];[?];[2]", "[?,d5_0];[?];[d5_0]"); + INFER_OK(op, "[?,2];[?];[2];[?,?];[?];[?]", "[?,d2_0];[?];[d2_0]"); + INFER_OK(op, "[?,?];[?];[?];[?,2];[?];[2]", "[?,d5_0];[?];[d5_0]"); + + // Known rank for both inputs. + INFER_OK(op, "[?,2];[?];[2];[?,2];[?];[2]", "[?,d2_0];[?];[d2_0]"); } } // end namespace tensorflow diff --git a/tensorflow/core/ops/training_ops.cc b/tensorflow/core/ops/training_ops.cc index a2615b9f9f2..2027bf4603d 100644 --- a/tensorflow/core/ops/training_ops.cc +++ b/tensorflow/core/ops/training_ops.cc @@ -22,12 +22,20 @@ using shape_inference::DimensionHandle; using shape_inference::InferenceContext; using shape_inference::ShapeHandle; +static ShapeHandle ShapeOrHandleShape(InferenceContext* c, int input) { + auto h_dtype = c->input_handle_dtype(input); + if (h_dtype == DT_INVALID) { + return c->input(input); + } + return c->input_handle_shape(input); +} + // Handle the gradient and, if , indices inputs. // is an input+output parameter, containing the current known input shape to // the gradient. static Status HandleGradAndIndicesInputs(InferenceContext* c, bool sparse, int grad_idx, ShapeHandle* s) { - ShapeHandle grad = c->input(grad_idx); + ShapeHandle grad = ShapeOrHandleShape(c, grad_idx); if (!sparse) { TF_RETURN_IF_ERROR(c->Merge(*s, grad, s)); return Status::OK(); @@ -49,10 +57,12 @@ static Status HandleGradAndIndicesInputs(InferenceContext* c, bool sparse, static Status ApplyGradientDescentShapeFn(InferenceContext* c) { ShapeHandle unused; - ShapeHandle s = c->input(0); // var + ShapeHandle s = ShapeOrHandleShape(c, 0); // var TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); // alpha TF_RETURN_IF_ERROR(c->Merge(s, c->input(2), &s)); // delta - c->set_output(0, s); + if (c->num_outputs() > 0) { + c->set_output(0, s); + } return Status::OK(); } @@ -75,16 +85,35 @@ use_locking: If `True`, the subtraction will be protected by a lock; otherwise the behavior is undefined, but may exhibit less contention. )doc"); +REGISTER_OP("ResourceApplyGradientDescent") + .Input("var: resource") + .Input("alpha: T") + .Input("delta: T") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn(ApplyGradientDescentShapeFn) + .Doc(R"doc( +Update '*var' by subtracting 'alpha' * 'delta' from it. + +var: Should be from a Variable(). +alpha: Scaling factor. Must be a scalar. +delta: The change. +use_locking: If `True`, the subtraction will be protected by a lock; + otherwise the behavior is undefined, but may exhibit less contention. 
+)doc"); + static Status ApplyProximalGradientDescentShapeFn(InferenceContext* c, bool sparse) { ShapeHandle unused; - ShapeHandle s = c->input(0); // var + ShapeHandle s = ShapeOrHandleShape(c, 0); // var TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); // alpha TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); // l1 TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); // l2 TF_RETURN_IF_ERROR( HandleGradAndIndicesInputs(c, sparse, 4 /* grad_idx */, &s)); - c->set_output(0, s); + if (c->num_outputs() > 0) { + c->set_output(0, s); + } return Status::OK(); } @@ -146,17 +175,76 @@ out: Same as "var". use_locking: If True, the subtraction will be protected by a lock; otherwise the behavior is undefined, but may exhibit less contention. )doc"); + +REGISTER_OP("ResourceApplyProximalGradientDescent") + .Input("var: resource") + .Input("alpha: T") + .Input("l1: T") + .Input("l2: T") + .Input("delta: T") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyProximalGradientDescentShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' as FOBOS algorithm with fixed learning rate. +prox_v = var - alpha * delta +var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0} + +var: Should be from a Variable(). +alpha: Scaling factor. Must be a scalar. +l1: L1 regularization. Must be a scalar. +l2: L2 regularization. Must be a scalar. +delta: The change. +use_locking: If True, the subtraction will be protected by a lock; + otherwise the behavior is undefined, but may exhibit less contention. +)doc"); + +REGISTER_OP("ResourceSparseApplyProximalGradientDescent") + .Input("var: resource") + .Input("alpha: T") + .Input("l1: T") + .Input("l2: T") + .Input("grad: T") + .Input("indices: Tindices") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyProximalGradientDescentShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Sparse update '*var' as FOBOS algorithm with fixed learning rate. + +That is for rows we have grad for, we update var as follows: +prox_v = var - alpha * grad +var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0} + +var: Should be from a Variable(). +alpha: Scaling factor. Must be a scalar. +l1: L1 regularization. Must be a scalar. +l2: L2 regularization. Must be a scalar. +grad: The gradient. +indices: A vector of indices into the first dimension of var and accum. +use_locking: If True, the subtraction will be protected by a lock; + otherwise the behavior is undefined, but may exhibit less contention. 
+)doc"); + static Status ApplyAdadeltaShapeFn(InferenceContext* c, bool sparse) { ShapeHandle unused; - ShapeHandle s = c->input(0); // var - TF_RETURN_IF_ERROR(c->Merge(s, c->input(1), &s)); // accum - TF_RETURN_IF_ERROR(c->Merge(s, c->input(2), &s)); // accum update + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // accum + TF_RETURN_IF_ERROR( + c->Merge(s, ShapeOrHandleShape(c, 2), &s)); // accum update TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); // lr TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); // rho TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused)); // epsilon TF_RETURN_IF_ERROR( HandleGradAndIndicesInputs(c, sparse, 6 /* grad_idx */, &s)); - c->set_output(0, s); + if (c->num_outputs() > 0) { + c->set_output(0, s); + } return Status::OK(); } @@ -224,14 +312,76 @@ use_locking: If True, updating of the var and accum tensors will be protected by a lock; otherwise the behavior is undefined, but may exhibit less contention. )doc"); +REGISTER_OP("ResourceApplyAdadelta") + .Input("var: resource") + .Input("accum: resource") + .Input("accum_update: resource") + .Input("lr: T") + .Input("rho: T") + .Input("epsilon: T") + .Input("grad: T") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyAdadeltaShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the adadelta scheme. + +accum = rho() * accum + (1 - rho()) * grad.square(); +update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad; +update_accum = rho() * update_accum + (1 - rho()) * update.square(); +var -= update; + +var: Should be from a Variable(). +accum: Should be from a Variable(). +accum_update: Should be from a Variable(). +lr: Scaling factor. Must be a scalar. +rho: Decay factor. Must be a scalar. +epsilon: Constant factor. Must be a scalar. +grad: The gradient. +use_locking: If True, updating of the var, accum and update_accum tensors will be protected by +a lock; otherwise the behavior is undefined, but may exhibit less contention. +)doc"); + +REGISTER_OP("ResourceSparseApplyAdadelta") + .Input("var: resource") + .Input("accum: resource") + .Input("accum_update: resource") + .Input("lr: T") + .Input("rho: T") + .Input("epsilon: T") + .Input("grad: T") + .Input("indices: Tindices") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyAdadeltaShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +var: Should be from a Variable(). +accum: Should be from a Variable(). +accum_update:: Should be from a Variable(). +lr: Learning rate. Must be a scalar. +rho: Decay factor. Must be a scalar. +epsilon: Constant factor. Must be a scalar. +grad: The gradient. +indices: A vector of indices into the first dimension of var and accum. +use_locking: If True, updating of the var and accum tensors will be protected by +a lock; otherwise the behavior is undefined, but may exhibit less contention. 
+)doc"); + static Status ApplyAdagradShapeFn(InferenceContext* c, bool sparse) { ShapeHandle unused; - ShapeHandle s = c->input(0); // var - TF_RETURN_IF_ERROR(c->Merge(s, c->input(1), &s)); // accum + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // accum TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); // lr TF_RETURN_IF_ERROR( HandleGradAndIndicesInputs(c, sparse, 3 /* grad_idx */, &s)); - c->set_output(0, s); + if (c->num_outputs() > 0) { + c->set_output(0, s); + } return Status::OK(); } @@ -261,16 +411,44 @@ use_locking: If `True`, updating of the var and accum tensors will be protected by a lock; otherwise the behavior is undefined, but may exhibit less contention. )doc"); + +REGISTER_OP("ResourceApplyAdagrad") + .Input("var: resource") + .Input("accum: resource") + .Input("lr: T") + .Input("grad: T") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyAdagradShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the adagrad scheme. + +accum += grad * grad +var -= lr * grad * (1 / sqrt(accum)) + +var: Should be from a Variable(). +accum: Should be from a Variable(). +lr: Scaling factor. Must be a scalar. +grad: The gradient. +use_locking: If `True`, updating of the var and accum tensors will be protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +)doc"); + static Status ApplyProximalAdagradShapeFn(InferenceContext* c, bool sparse) { ShapeHandle unused; - ShapeHandle s = c->input(0); // var - TF_RETURN_IF_ERROR(c->Merge(s, c->input(1), &s)); // accum + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // accum TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); // lr TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); // l1 TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); // l2 TF_RETURN_IF_ERROR( HandleGradAndIndicesInputs(c, sparse, 5 /* grad_idx */, &s)); - c->set_output(0, s); + if (c->num_outputs() > 0) { + c->set_output(0, s); + } return Status::OK(); } @@ -304,6 +482,34 @@ use_locking: If True, updating of the var and accum tensors will be protected by a lock; otherwise the behavior is undefined, but may exhibit less contention. )doc"); +REGISTER_OP("ResourceApplyProximalAdagrad") + .Input("var: resource") + .Input("accum: resource") + .Input("lr: T") + .Input("l1: T") + .Input("l2: T") + .Input("grad: T") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyProximalAdagradShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' and '*accum' according to FOBOS with Adagrad learning rate. +accum += grad * grad +prox_v = var - lr * grad * (1 / sqrt(accum)) +var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0} + +var: Should be from a Variable(). +accum: Should be from a Variable(). +grad: The gradient. +lr: Scaling factor. Must be a scalar. +l1: L1 regularization. Must be a scalar. +l2: L2 regularization. Must be a scalar. +use_locking: If True, updating of the var and accum tensors will be protected by +a lock; otherwise the behavior is undefined, but may exhibit less contention. +)doc"); + REGISTER_OP("SparseApplyAdagrad") .Input("var: Ref(T)") .Input("accum: Ref(T)") @@ -335,12 +541,42 @@ use_locking: If `True`, updating of the var and accum tensors will be protected contention. 
)doc"); +REGISTER_OP("ResourceSparseApplyAdagrad") + .Input("var: resource") + .Input("accum: resource") + .Input("lr: T") + .Input("grad: T") + .Input("indices: Tindices") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyAdagradShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Update relevant entries in '*var' and '*accum' according to the adagrad scheme. + +That is for rows we have grad for, we update var and accum as follows: +accum += grad * grad +var -= lr * grad * (1 / sqrt(accum)) + +var: Should be from a Variable(). +accum: Should be from a Variable(). +lr: Learning rate. Must be a scalar. +grad: The gradient. +indices: A vector of indices into the first dimension of var and accum. +use_locking: If `True`, updating of the var and accum tensors will be protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +)doc"); + static Status ApplyAdagradDAShapeFn(InferenceContext* c, bool sparse) { ShapeHandle unused; - ShapeHandle s = c->input(0); // var - TF_RETURN_IF_ERROR(c->Merge(s, c->input(1), &s)); // grad_accumulator + ShapeHandle s = ShapeOrHandleShape(c, 0); // var TF_RETURN_IF_ERROR( - c->Merge(s, c->input(2), &s)); // gradient_squared_accumulator + c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // grad_accumulator + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 2), + &s)); // gradient_squared_accumulator TF_RETURN_IF_ERROR( HandleGradAndIndicesInputs(c, sparse, 3 /* grad_idx */, &s)); int idx = sparse ? 5 : 4; @@ -348,7 +584,9 @@ static Status ApplyAdagradDAShapeFn(InferenceContext* c, bool sparse) { TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // l1 TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // l2 TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // global step - c->set_output(0, s); + if (c->num_outputs() > 0) { + c->set_output(0, s); + } return Status::OK(); } @@ -453,11 +691,106 @@ use_locking: If True, updating of the var and accum tensors will be protected by a lock; otherwise the behavior is undefined, but may exhibit less contention. )doc"); +REGISTER_OP("ResourceApplyAdagradDA") + .Input("var: resource") + .Input("gradient_accumulator: resource") + .Input("gradient_squared_accumulator: resource") + .Input("grad: T") + .Input("lr: T") + .Input("l1: T") + .Input("l2: T") + .Input("global_step: int64") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyAdagradDAShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the proximal adagrad scheme. + +var: Should be from a Variable(). +gradient_accumulator: Should be from a Variable(). +gradient_squared_accumulator: Should be from a Variable(). +grad: The gradient. +lr: Scaling factor. Must be a scalar. +l1: L1 regularization. Must be a scalar. +l2: L2 regularization. Must be a scalar. +global_step: Training step number. Must be a scalar. +use_locking: If True, updating of the var and accum tensors will be protected by +a lock; otherwise the behavior is undefined, but may exhibit less contention. 
+)doc"); + +REGISTER_OP("ResourceSparseApplyAdagradDA") + .Input("var: resource") + .Input("gradient_accumulator: resource") + .Input("gradient_squared_accumulator: resource") + .Input("grad: T") + .Input("indices: Tindices") + .Input("lr: T") + .Input("l1: T") + .Input("l2: T") + .Input("global_step: int64") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyAdagradDAShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Update entries in '*var' and '*accum' according to the proximal adagrad scheme. + +var: Should be from a Variable(). +gradient_accumulator: Should be from a Variable(). +gradient_squared_accumulator: Should be from a Variable(). +grad: The gradient. +indices: A vector of indices into the first dimension of var and accum. +lr: Learning rate. Must be a scalar. +l1: L1 regularization. Must be a scalar. +l2: L2 regularization. Must be a scalar. +global_step: Training step number. Must be a scalar. +use_locking: If True, updating of the var and accum tensors will be protected by +a lock; otherwise the behavior is undefined, but may exhibit less contention. +)doc"); + +REGISTER_OP("ResourceSparseApplyProximalAdagrad") + .Input("var: resource") + .Input("accum: resource") + .Input("lr: T") + .Input("l1: T") + .Input("l2: T") + .Input("grad: T") + .Input("indices: Tindices") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyProximalAdagradShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Sparse update entries in '*var' and '*accum' according to FOBOS algorithm. + +That is for rows we have grad for, we update var and accum as follows: +accum += grad * grad +prox_v = var +prox_v -= lr * grad * (1 / sqrt(accum)) +var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0} + +var: Should be from a Variable(). +accum: Should be from a Variable(). +lr: Learning rate. Must be a scalar. +l1: L1 regularization. Must be a scalar. +l2: L2 regularization. Must be a scalar. +grad: The gradient. +indices: A vector of indices into the first dimension of var and accum. +use_locking: If True, updating of the var and accum tensors will be protected by +a lock; otherwise the behavior is undefined, but may exhibit less contention. +)doc"); + static Status ApplyFtrlShapeFn(InferenceContext* c, bool sparse) { ShapeHandle unused; - ShapeHandle s = c->input(0); // var - TF_RETURN_IF_ERROR(c->Merge(s, c->input(1), &s)); // accum - TF_RETURN_IF_ERROR(c->Merge(s, c->input(2), &s)); // linear + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // accum + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 2), &s)); // linear TF_RETURN_IF_ERROR( HandleGradAndIndicesInputs(c, sparse, 3 /* grad_idx */, &s)); int idx = sparse ? 5 : 4; @@ -465,7 +798,9 @@ static Status ApplyFtrlShapeFn(InferenceContext* c, bool sparse) { TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // l1 TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // l2 TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // lr_power - c->set_output(0, s); + if (c->num_outputs() > 0) { + c->set_output(0, s); + } return Status::OK(); } @@ -549,16 +884,94 @@ use_locking: If `True`, updating of the var and accum tensors will be protected contention. 
)doc"); +REGISTER_OP("ResourceApplyFtrl") + .Input("var: resource") + .Input("accum: resource") + .Input("linear: resource") + .Input("grad: T") + .Input("lr: T") + .Input("l1: T") + .Input("l2: T") + .Input("lr_power: T") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyFtrlShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the Ftrl-proximal scheme. + +accum_new = accum + grad * grad +linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var +quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2 +var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0 +accum = accum_new + +var: Should be from a Variable(). +accum: Should be from a Variable(). +linear: Should be from a Variable(). +grad: The gradient. +lr: Scaling factor. Must be a scalar. +l1: L1 regulariation. Must be a scalar. +l2: L2 regulariation. Must be a scalar. +lr_power: Scaling factor. Must be a scalar. +use_locking: If `True`, updating of the var and accum tensors will be protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +)doc"); + +REGISTER_OP("ResourceSparseApplyFtrl") + .Input("var: resource") + .Input("accum: resource") + .Input("linear: resource") + .Input("grad: T") + .Input("indices: Tindices") + .Input("lr: T") + .Input("l1: T") + .Input("l2: T") + .Input("lr_power: T") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyFtrlShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Update relevant entries in '*var' according to the Ftrl-proximal scheme. + +That is for rows we have grad for, we update var, accum and linear as follows: +accum_new = accum + grad * grad +linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var +quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2 +var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0 +accum = accum_new + +var: Should be from a Variable(). +accum: Should be from a Variable(). +linear: Should be from a Variable(). +grad: The gradient. +indices: A vector of indices into the first dimension of var and accum. +lr: Scaling factor. Must be a scalar. +l1: L1 regularization. Must be a scalar. +l2: L2 regularization. Must be a scalar. +lr_power: Scaling factor. Must be a scalar. +use_locking: If `True`, updating of the var and accum tensors will be protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +)doc"); + static Status ApplyMomentumShapeFn(InferenceContext* c, bool sparse) { ShapeHandle unused; - ShapeHandle s = c->input(0); // var - TF_RETURN_IF_ERROR(c->Merge(s, c->input(1), &s)); // accum + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // accum TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); // lr TF_RETURN_IF_ERROR( HandleGradAndIndicesInputs(c, sparse, 3 /* grad_idx */, &s)); int idx = sparse ? 5 : 4; TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // momentum - c->set_output(0, s); + if (c->num_outputs() > 0) { + c->set_output(0, s); + } return Status::OK(); } @@ -635,11 +1048,80 @@ var - lr * momentum * accum, so in the end, the var you get is actually var - lr * momentum * accum. 
)doc"); +REGISTER_OP("ResourceApplyMomentum") + .Input("var: resource") + .Input("accum: resource") + .Input("lr: T") + .Input("grad: T") + .Input("momentum: T") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .Attr("use_nesterov: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyMomentumShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the momentum scheme. Set use_nesterov = True if you +want to use Nesterov momentum. + +accum = accum * momentum + grad +var -= lr * accum + +var: Should be from a Variable(). +accum: Should be from a Variable(). +lr: Scaling factor. Must be a scalar. +grad: The gradient. +momentum: Momentum. Must be a scalar. +use_locking: If `True`, updating of the var and accum tensors will be protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +use_nesterov: If `True`, the tensor passed to compute grad will be +var - lr * momentum * accum, so in the end, the var you get is actually +var - lr * momentum * accum. +)doc"); + +REGISTER_OP("ResourceSparseApplyMomentum") + .Input("var: resource") + .Input("accum: resource") + .Input("lr: T") + .Input("grad: T") + .Input("indices: Tindices") + .Input("momentum: T") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .Attr("use_nesterov: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyMomentumShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Update relevant entries in '*var' and '*accum' according to the momentum scheme. +Set use_nesterov = True if you want to use Nesterov momentum. + +That is for rows we have grad for, we update var and accum as follows: + +accum = accum * momentum + grad +var -= lr * accum + +var: Should be from a Variable(). +accum: Should be from a Variable(). +lr: Learning rate. Must be a scalar. +grad: The gradient. +indices: A vector of indices into the first dimension of var and accum. +momentum: Momentum. Must be a scalar. +use_locking: If `True`, updating of the var and accum tensors will be protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +use_nesterov: If `True`, the tensor passed to compute grad will be +var - lr * momentum * accum, so in the end, the var you get is actually +var - lr * momentum * accum. +)doc"); + static Status ApplyAdamShapeFn(InferenceContext* c, bool sparse) { ShapeHandle unused; - ShapeHandle s = c->input(0); // var - TF_RETURN_IF_ERROR(c->Merge(s, c->input(1), &s)); // m - TF_RETURN_IF_ERROR(c->Merge(s, c->input(2), &s)); // v + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // m + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 2), &s)); // v TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); // beta1_power TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); // beta2_power TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused)); // lr @@ -648,7 +1130,9 @@ static Status ApplyAdamShapeFn(InferenceContext* c, bool sparse) { TF_RETURN_IF_ERROR(c->WithRank(c->input(8), 0, &unused)); // epsilon TF_RETURN_IF_ERROR( HandleGradAndIndicesInputs(c, sparse, 9 /* grad_idx */, &s)); - c->set_output(0, s); + if (c->num_outputs() > 0) { + c->set_output(0, s); + } return Status::OK(); } @@ -693,34 +1177,77 @@ use_locking: If `True`, updating of the var, m, and v tensors will be protected contention. 
)doc"); +REGISTER_OP("ResourceApplyAdam") + .Input("var: resource") + .Input("m: resource") + .Input("v: resource") + .Input("beta1_power: T") + .Input("beta2_power: T") + .Input("lr: T") + .Input("beta1: T") + .Input("beta2: T") + .Input("epsilon: T") + .Input("grad: T") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyAdamShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the Adam algorithm. + +lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t) +m_t <- beta1 * m_{t-1} + (1 - beta1) * g_t +v_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t +variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon) + +var: Should be from a Variable(). +m: Should be from a Variable(). +v: Should be from a Variable(). +beta1_power: Must be a scalar. +beta2_power: Must be a scalar. +lr: Scaling factor. Must be a scalar. +beta1: Momentum factor. Must be a scalar. +beta2: Momentum factor. Must be a scalar. +epsilon: Ridge term. Must be a scalar. +grad: The gradient. +use_locking: If `True`, updating of the var, m, and v tensors will be protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +)doc"); + static Status ApplyRMSPropShapeFn(InferenceContext* c, bool sparse) { ShapeHandle unused; - ShapeHandle s = c->input(0); // var - TF_RETURN_IF_ERROR(c->Merge(s, c->input(1), &s)); // ms - TF_RETURN_IF_ERROR(c->Merge(s, c->input(2), &s)); // mom + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // ms + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 2), &s)); // mom TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); // lr TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); // rho TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused)); // momentum TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused)); // epsilon TF_RETURN_IF_ERROR( HandleGradAndIndicesInputs(c, sparse, 7 /* grad_idx */, &s)); - c->set_output(0, s); + if (c->num_outputs() > 0) { + c->set_output(0, s); + } return Status::OK(); } static Status ApplyCenteredRMSPropShapeFn(InferenceContext* c, bool sparse) { ShapeHandle unused; - ShapeHandle s = c->input(0); // var - TF_RETURN_IF_ERROR(c->Merge(s, c->input(1), &s)); // ms - TF_RETURN_IF_ERROR(c->Merge(s, c->input(2), &s)); // mg - TF_RETURN_IF_ERROR(c->Merge(s, c->input(3), &s)); // mom + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // ms + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 2), &s)); // mg + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 3), &s)); // mom TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); // lr TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused)); // rho TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused)); // momentum TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 0, &unused)); // epsilon TF_RETURN_IF_ERROR( HandleGradAndIndicesInputs(c, sparse, 8 /* grad_idx */, &s)); - c->set_output(0, s); + if (c->num_outputs() > 0) { + c->set_output(0, s); + } return Status::OK(); } @@ -912,4 +1439,184 @@ use_locking: If `True`, updating of the var, mg, ms, and mom tensors is contention. 
)doc"); +REGISTER_OP("ResourceApplyRMSProp") + .Input("var: resource") + .Input("ms: resource") + .Input("mom: resource") + .Input("lr: T") + .Input("rho: T") + .Input("momentum: T") + .Input("epsilon: T") + .Input("grad: T") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyRMSPropShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the RMSProp algorithm. +Note that in dense implementation of this algorithm, ms and mom will +update even if the grad is zero, but in this sparse implementation, ms +and mom will not update in iterations during which the grad is zero. + +mean_square = decay * mean_square + (1-decay) * gradient ** 2 +Delta = learning_rate * gradient / sqrt(mean_square + epsilon) + +ms <- rho * ms_{t-1} + (1-rho) * grad * grad +mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon) +var <- var - mom + +var: Should be from a Variable(). +ms: Should be from a Variable(). +mom: Should be from a Variable(). +lr: Scaling factor. Must be a scalar. +epsilon: Ridge term. Must be a scalar. +rho: Decay rate. Must be a scalar. +grad: The gradient. +use_locking: If `True`, updating of the var, ms, and mom tensors is protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +)doc"); + +REGISTER_OP("ResourceApplyCenteredRMSProp") + .Input("var: resource") + .Input("mg: resource") + .Input("ms: resource") + .Input("mom: resource") + .Input("lr: T") + .Input("rho: T") + .Input("momentum: T") + .Input("epsilon: T") + .Input("grad: T") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyCenteredRMSPropShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the centered RMSProp algorithm. +The centered RMSProp algorithm uses an estimate of the centered second moment +(i.e., the variance) for normalization, as opposed to regular RMSProp, which +uses the (uncentered) second moment. This often helps with training, but is +slightly more expensive in terms of computation and memory. + +Note that in dense implementation of this algorithm, mg, ms, and mom will +update even if the grad is zero, but in this sparse implementation, mg, ms, +and mom will not update in iterations during which the grad is zero. + +mean_square = decay * mean_square + (1-decay) * gradient ** 2 +mean_grad = decay * mean_grad + (1-decay) * gradient + +Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2) + +mg <- rho * mg_{t-1} + (1-rho) * grad +ms <- rho * ms_{t-1} + (1-rho) * grad * grad +mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon) +var <- var - mom + +var: Should be from a Variable(). +mg: Should be from a Variable(). +ms: Should be from a Variable(). +mom: Should be from a Variable(). +lr: Scaling factor. Must be a scalar. +epsilon: Ridge term. Must be a scalar. +rho: Decay rate. Must be a scalar. +grad: The gradient. +use_locking: If `True`, updating of the var, mg, ms, and mom tensors is + protected by a lock; otherwise the behavior is undefined, but may exhibit less + contention. 
+)doc"); + +REGISTER_OP("ResourceSparseApplyRMSProp") + .Input("var: resource") + .Input("ms: resource") + .Input("mom: resource") + .Input("lr: T") + .Input("rho: T") + .Input("momentum: T") + .Input("epsilon: T") + .Input("grad: T") + .Input("indices: Tindices") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyRMSPropShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the RMSProp algorithm. +Note that in dense implementation of this algorithm, ms and mom will +update even if the grad is zero, but in this sparse implementation, ms +and mom will not update in iterations during which the grad is zero. + +mean_square = decay * mean_square + (1-decay) * gradient ** 2 +Delta = learning_rate * gradient / sqrt(mean_square + epsilon) + +ms <- rho * ms_{t-1} + (1-rho) * grad * grad +mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon) +var <- var - mom + +var: Should be from a Variable(). +ms: Should be from a Variable(). +mom: Should be from a Variable(). +lr: Scaling factor. Must be a scalar. +epsilon: Ridge term. Must be a scalar. +rho: Decay rate. Must be a scalar. +grad: The gradient. +indices: A vector of indices into the first dimension of var, ms and mom. +use_locking: If `True`, updating of the var, ms, and mom tensors is protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +)doc"); + +REGISTER_OP("ResourceSparseApplyCenteredRMSProp") + .Input("var: resource") + .Input("mg: resource") + .Input("ms: resource") + .Input("mom: resource") + .Input("lr: T") + .Input("rho: T") + .Input("momentum: T") + .Input("epsilon: T") + .Input("grad: T") + .Input("indices: Tindices") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyCenteredRMSPropShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the centered RMSProp algorithm. +The centered RMSProp algorithm uses an estimate of the centered second moment +(i.e., the variance) for normalization, as opposed to regular RMSProp, which +uses the (uncentered) second moment. This often helps with training, but is +slightly more expensive in terms of computation and memory. + +Note that in dense implementation of this algorithm, mg, ms, and mom will +update even if the grad is zero, but in this sparse implementation, mg, ms, +and mom will not update in iterations during which the grad is zero. + +mean_square = decay * mean_square + (1-decay) * gradient ** 2 +mean_grad = decay * mean_grad + (1-decay) * gradient +Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2) + +ms <- rho * ms_{t-1} + (1-rho) * grad * grad +mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon) +var <- var - mom + +var: Should be from a Variable(). +mg: Should be from a Variable(). +ms: Should be from a Variable(). +mom: Should be from a Variable(). +lr: Scaling factor. Must be a scalar. +epsilon: Ridge term. Must be a scalar. +rho: Decay rate. Must be a scalar. +grad: The gradient. +indices: A vector of indices into the first dimension of var, ms and mom. +use_locking: If `True`, updating of the var, mg, ms, and mom tensors is + protected by a lock; otherwise the behavior is undefined, but may exhibit less + contention. 
+)doc"); + } // namespace tensorflow diff --git a/tensorflow/core/platform/cloud/gcs_file_system.cc b/tensorflow/core/platform/cloud/gcs_file_system.cc index 3998324047c..ab56ad09d11 100644 --- a/tensorflow/core/platform/cloud/gcs_file_system.cc +++ b/tensorflow/core/platform/cloud/gcs_file_system.cc @@ -307,6 +307,7 @@ class GcsWritableFile : public WritableFile { object_(object), auth_provider_(auth_provider), http_request_factory_(http_request_factory), + sync_needed_(true), max_upload_attempts_(max_upload_attempts) { if (GetTmpFilename(&tmp_content_filename_).ok()) { outfile_.open(tmp_content_filename_, @@ -328,6 +329,7 @@ class GcsWritableFile : public WritableFile { object_(object), auth_provider_(auth_provider), http_request_factory_(http_request_factory), + sync_needed_(true), max_upload_attempts_(max_upload_attempts) { tmp_content_filename_ = tmp_content_filename; outfile_.open(tmp_content_filename_, @@ -338,6 +340,7 @@ class GcsWritableFile : public WritableFile { Status Append(const StringPiece& data) override { TF_RETURN_IF_ERROR(CheckWritable()); + sync_needed_ = true; outfile_ << data; if (!outfile_.good()) { return errors::Internal( @@ -357,14 +360,26 @@ class GcsWritableFile : public WritableFile { Status Flush() override { return Sync(); } + Status Sync() override { + TF_RETURN_IF_ERROR(CheckWritable()); + if (!sync_needed_) { + return Status::OK(); + } + Status status = SyncImpl(); + if (status.ok()) { + sync_needed_ = false; + } + return status; + } + + private: /// Copies the current version of the file to GCS. /// - /// This Sync() uploads the object to GCS. + /// This SyncImpl() uploads the object to GCS. /// In case of a failure, it resumes failed uploads as recommended by the GCS /// resumable API documentation. When the whole upload needs to be /// restarted, Sync() returns UNAVAILABLE and relies on RetryingFileSystem. - Status Sync() override { - TF_RETURN_IF_ERROR(CheckWritable()); + Status SyncImpl() { outfile_.flush(); if (!outfile_.good()) { return errors::Internal( @@ -410,7 +425,6 @@ class GcsWritableFile : public WritableFile { return errors::Aborted("Upload gs://", bucket_, "/", object_, " failed."); } - private: Status CheckWritable() const { if (!outfile_.is_open()) { return errors::FailedPrecondition( @@ -556,6 +570,7 @@ class GcsWritableFile : public WritableFile { string tmp_content_filename_; std::ofstream outfile_; HttpRequest::Factory* http_request_factory_; + bool sync_needed_; // whether there is buffered data that needs to be synced int32 max_upload_attempts_; }; diff --git a/tensorflow/core/platform/cloud/gcs_file_system_test.cc b/tensorflow/core/platform/cloud/gcs_file_system_test.cc index 5f5f868a5c7..84f219616a9 100644 --- a/tensorflow/core/platform/cloud/gcs_file_system_test.cc +++ b/tensorflow/core/platform/cloud/gcs_file_system_test.cc @@ -228,6 +228,11 @@ TEST(GcsFileSystemTest, NewWritableFile) { TF_EXPECT_OK(file->Append("content1,")); TF_EXPECT_OK(file->Append("content2")); + TF_EXPECT_OK(file->Flush()); + // The calls to flush, sync, and close below should not cause uploads because + // the file is not dirty. + TF_EXPECT_OK(file->Flush()); + TF_EXPECT_OK(file->Sync()); TF_EXPECT_OK(file->Close()); } diff --git a/tensorflow/core/platform/cloud/http_request_test.cc b/tensorflow/core/platform/cloud/http_request_test.cc index 93c4ec51d95..31ba3e337f9 100644 --- a/tensorflow/core/platform/cloud/http_request_test.cc +++ b/tensorflow/core/platform/cloud/http_request_test.cc @@ -17,6 +17,7 @@ limitations under the License. 
#include #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/platform/mem.h" #include "tensorflow/core/platform/test.h" namespace tensorflow { @@ -172,7 +173,8 @@ class FakeLibCurl : public LibCurl { temp_str.replace(n, victim.size(), encoded); n += encoded.size(); } - char* out_char_str = (char*)malloc(sizeof(char) * temp_str.size() + 1); + char* out_char_str = + (char*)port::Malloc(sizeof(char) * temp_str.size() + 1); std::copy(temp_str.begin(), temp_str.end(), out_char_str); out_char_str[temp_str.size()] = '\0'; return out_char_str; @@ -180,7 +182,7 @@ class FakeLibCurl : public LibCurl { void curl_slist_free_all(curl_slist* list) override { delete reinterpret_cast*>(list); } - void curl_free(void* p) override { free(p); } + void curl_free(void* p) override { port::Free(p); } // Variables defining the behavior of this fake. string response_content; diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl index 80c23b1df15..168f9df2e84 100644 --- a/tensorflow/core/platform/default/build_config.bzl +++ b/tensorflow/core/platform/default/build_config.bzl @@ -3,10 +3,11 @@ load("@protobuf//:protobuf.bzl", "cc_proto_library") load("@protobuf//:protobuf.bzl", "py_proto_library") -# configure may change the following lines to True +# configure may change the following lines WITH_GCP_SUPPORT = False WITH_HDFS_SUPPORT = False WITH_XLA_SUPPORT = False +WITH_JEMALLOC = True # Appends a suffix to a list of deps. def tf_deps(deps, suffix): @@ -176,7 +177,29 @@ def tf_additional_test_srcs(): def tf_kernel_tests_linkstatic(): return 0 +# jemalloc only enabled on Linux for now. +# TODO(jhseu): Enable on other platforms. +def tf_additional_lib_defines(): + defines = [] + if WITH_JEMALLOC: + defines += select({ + "//tensorflow:linux_x86_64": [ + "TENSORFLOW_USE_JEMALLOC" + ], + "//conditions:default": [], + }) + return defines + def tf_additional_lib_deps(): + deps = [] + if WITH_JEMALLOC: + deps += select({ + "//tensorflow:linux_x86_64": ["@jemalloc"], + "//conditions:default": [], + }) + return deps + +def tf_additional_core_deps(): deps = [] if WITH_GCP_SUPPORT: deps.append("//tensorflow/core/platform/cloud:gcs_file_system") diff --git a/tensorflow/core/platform/hexagon/soc_interface.h b/tensorflow/core/platform/hexagon/soc_interface.h index 61567de3276..f4a3cdf4bda 100644 --- a/tensorflow/core/platform/hexagon/soc_interface.h +++ b/tensorflow/core/platform/hexagon/soc_interface.h @@ -48,7 +48,7 @@ bool soc_interface_ReadOutputNodeFloat(const char* const node_name, uint8_t** buf, uint64_t* buf_size); // Setup graph // TODO(satok): Remove and use runtime version -bool soc_interface_SetupGraphDummy(int version); +bool soc_interface_setupDummyGraph(int version); // Allocate memory for params of node inputs and node outputs bool soc_interface_AllocateNodeInputAndNodeOutputArray(int total_input_count, diff --git a/tensorflow/core/platform/mem.h b/tensorflow/core/platform/mem.h index 6618145c3d1..dc389a87415 100644 --- a/tensorflow/core/platform/mem.h +++ b/tensorflow/core/platform/mem.h @@ -24,9 +24,14 @@ limitations under the License. namespace tensorflow { namespace port { -// Aligned allocation/deallocation -void* aligned_malloc(size_t size, int minimum_alignment); -void aligned_free(void* aligned_memory); +// Aligned allocation/deallocation. `minimum_alignment` must be a power of 2 +// and a multiple of sizeof(void*). 
+void* AlignedMalloc(size_t size, int minimum_alignment); +void AlignedFree(void* aligned_memory); + +void* Malloc(size_t size); +void* Realloc(void* ptr, size_t size); +void Free(void* ptr); // Tries to release num_bytes of free memory back to the operating // system for reuse. Use this routine with caution -- to get this diff --git a/tensorflow/core/platform/port_test.cc b/tensorflow/core/platform/port_test.cc index 8d98eb25a20..8930e49ff84 100644 --- a/tensorflow/core/platform/port_test.cc +++ b/tensorflow/core/platform/port_test.cc @@ -25,11 +25,11 @@ namespace port { TEST(Port, AlignedMalloc) { for (size_t alignment = 1; alignment <= 1 << 20; alignment <<= 1) { - void* p = aligned_malloc(1, alignment); - ASSERT_TRUE(p != NULL) << "aligned_malloc(1, " << alignment << ")"; + void* p = AlignedMalloc(1, alignment); + ASSERT_TRUE(p != NULL) << "AlignedMalloc(1, " << alignment << ")"; uintptr_t pval = reinterpret_cast(p); EXPECT_EQ(pval % alignment, 0); - aligned_free(p); + AlignedFree(p); } } diff --git a/tensorflow/core/platform/posix/port.cc b/tensorflow/core/platform/posix/port.cc index 84bc9492b57..91d612f2339 100644 --- a/tensorflow/core/platform/posix/port.cc +++ b/tensorflow/core/platform/posix/port.cc @@ -13,8 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#ifdef TENSORFLOW_USE_JEMALLOC +#include "jemalloc/jemalloc.h" +#endif + #include "tensorflow/core/platform/cpu_info.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/mem.h" #include "tensorflow/core/platform/types.h" #if defined(__linux__) && !defined(__ANDROID__) #include @@ -33,7 +38,7 @@ limitations under the License. namespace tensorflow { namespace port { -void InitMain(const char *usage, int *argc, char ***argv) {} +void InitMain(const char* usage, int* argc, char*** argv) {} string Hostname() { char hostname[1024]; @@ -60,36 +65,66 @@ int NumSchedulableCPUs() { return kDefaultCores; } -void *aligned_malloc(size_t size, int minimum_alignment) { +void* AlignedMalloc(size_t size, int minimum_alignment) { #if defined(__ANDROID__) return memalign(minimum_alignment, size); #else // !defined(__ANDROID__) - void *ptr = NULL; + void* ptr = NULL; // posix_memalign requires that the requested alignment be at least // sizeof(void*). In this case, fall back on malloc which should return // memory aligned to at least the size of a pointer. 
- const int required_alignment = sizeof(void *); - if (minimum_alignment < required_alignment) return malloc(size); - if (posix_memalign(&ptr, minimum_alignment, size) != 0) + const int required_alignment = sizeof(void*); + if (minimum_alignment < required_alignment) return Malloc(size); +#ifdef TENSORFLOW_USE_JEMALLOC + int err = jemalloc_posix_memalign(&ptr, minimum_alignment, size); +#else + int err = posix_memalign(&ptr, minimum_alignment, size); +#endif + if (err != 0) { return NULL; - else + } else { return ptr; + } #endif } -void aligned_free(void *aligned_memory) { free(aligned_memory); } +void AlignedFree(void* aligned_memory) { Free(aligned_memory); } + +void* Malloc(size_t size) { +#ifdef TENSORFLOW_USE_JEMALLOC + return jemalloc_malloc(size); +#else + return malloc(size); +#endif +} + +void* Realloc(void* ptr, size_t size) { +#ifdef TENSORFLOW_USE_JEMALLOC + return jemalloc_realloc(ptr, size); +#else + return realloc(ptr, size); +#endif +} + +void Free(void* ptr) { +#ifdef TENSORFLOW_USE_JEMALLOC + jemalloc_free(ptr); +#else + free(ptr); +#endif +} void MallocExtension_ReleaseToSystem(std::size_t num_bytes) { // No-op. } -std::size_t MallocExtension_GetAllocatedSize(const void *p) { return 0; } +std::size_t MallocExtension_GetAllocatedSize(const void* p) { return 0; } -void AdjustFilenameForLogging(string *filename) { +void AdjustFilenameForLogging(string* filename) { // Nothing to do } -bool Snappy_Compress(const char *input, size_t length, string *output) { +bool Snappy_Compress(const char* input, size_t length, string* output) { #ifdef SNAPPY output->resize(snappy::MaxCompressedLength(length)); size_t outlen; @@ -101,8 +136,8 @@ bool Snappy_Compress(const char *input, size_t length, string *output) { #endif } -bool Snappy_GetUncompressedLength(const char *input, size_t length, - size_t *result) { +bool Snappy_GetUncompressedLength(const char* input, size_t length, + size_t* result) { #ifdef SNAPPY return snappy::GetUncompressedLength(input, length, result); #else @@ -110,7 +145,7 @@ bool Snappy_GetUncompressedLength(const char *input, size_t length, #endif } -bool Snappy_Uncompress(const char *input, size_t length, char *output) { +bool Snappy_Uncompress(const char* input, size_t length, char* output) { #ifdef SNAPPY return snappy::RawUncompress(input, length, output); #else @@ -118,7 +153,7 @@ bool Snappy_Uncompress(const char *input, size_t length, char *output) { #endif } -string Demangle(const char *mangled) { return mangled; } +string Demangle(const char* mangled) { return mangled; } } // namespace port } // namespace tensorflow diff --git a/tensorflow/core/platform/windows/port.cc b/tensorflow/core/platform/windows/port.cc index ee5be221cd6..b2167081a69 100644 --- a/tensorflow/core/platform/windows/port.cc +++ b/tensorflow/core/platform/windows/port.cc @@ -52,11 +52,17 @@ int NumSchedulableCPUs() { return system_info.dwNumberOfProcessors; } -void* aligned_malloc(size_t size, int minimum_alignment) { +void* AlignedMalloc(size_t size, int minimum_alignment) { return _aligned_malloc(size, minimum_alignment); } -void aligned_free(void* aligned_memory) { _aligned_free(aligned_memory); } +void AlignedFree(void* aligned_memory) { _aligned_free(aligned_memory); } + +void* Malloc(size_t size) { return ::malloc(size); } + +void* Realloc(void* ptr, size_t size) { return ::realloc(ptr, size); } + +void Free(void* ptr) { ::free(ptr); } void MallocExtension_ReleaseToSystem(std::size_t num_bytes) { // No-op. 
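With the jemalloc plumbing above, tensorflow::port::Malloc/Realloc/Free (plus the renamed AlignedMalloc/AlignedFree) become the single choke point for memory that may cross the allocator boundary, which is why c_api.cc's buffers were switched over earlier in this patch: memory obtained from jemalloc must not be handed to libc's free. A small usage sketch:

```c++
#include "tensorflow/core/platform/mem.h"

// Allocate, grow, and release a buffer through the port:: layer. When
// configure enables jemalloc on linux_x86_64 these route to jemalloc_*;
// elsewhere they fall through to the platform allocator.
void PortAllocDemo() {
  char* buf = static_cast<char*>(tensorflow::port::Malloc(64));
  buf = static_cast<char*>(tensorflow::port::Realloc(buf, 128));
  tensorflow::port::Free(buf);

  // AlignedMalloc's contract is now documented: the alignment must be a
  // power of two and a multiple of sizeof(void*).
  void* aligned = tensorflow::port::AlignedMalloc(/*size=*/256,
                                                  /*minimum_alignment=*/64);
  tensorflow::port::AlignedFree(aligned);
}
```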
diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index d960b8dd42f..a2b333aad1b 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -79,10 +79,12 @@ limitations under the License.
 //     used for tf.split, ReverseV2 is now used by tf.reverse, ConcatV2 is
 //     now used by tf.concat_v2 (and soon tf.concat). Graphs use flooring
 //     division and mod semantics. TensorArrayV3. (12dec2016)
+// 21. Dropped FunctionDef.Node support, switched to node_def introduced
+//     in version 12. (11jan2017)
 #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
-#define TF_GRAPH_DEF_VERSION 20
+#define TF_GRAPH_DEF_VERSION 21
 // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
 //
diff --git a/tensorflow/examples/android/AndroidManifest.xml b/tensorflow/examples/android/AndroidManifest.xml
index e388734564b..9f229d8b9d4 100644
--- a/tensorflow/examples/android/AndroidManifest.xml
+++ b/tensorflow/examples/android/AndroidManifest.xml
@@ -41,7 +41,7 @@
-
+
@@ -50,6 +50,15 @@
+
+
+
+
+
+
+
diff --git a/tensorflow/examples/android/BUILD b/tensorflow/examples/android/BUILD
index 3ba3a494aba..0c1cea5fc35 100644
--- a/tensorflow/examples/android/BUILD
+++ b/tensorflow/examples/android/BUILD
@@ -66,6 +66,7 @@ android_binary(
         "//tensorflow/examples/android/assets:asset_files",
         "@inception5h//:model_files",
         "@mobile_multibox//:model_files",
+        "@stylize//:model_files",
     ],
     assets_dir = "",
     custom_package = "org.tensorflow.demo",
diff --git a/tensorflow/examples/android/README.md b/tensorflow/examples/android/README.md
index 79f543fb748..fbbe9f276bc 100644
--- a/tensorflow/examples/android/README.md
+++ b/tensorflow/examples/android/README.md
@@ -22,6 +22,10 @@ existing application.
    Demonstrates a model based on [Scalable Object Detection
    using Deep Neural Networks](https://arxiv.org/abs/1312.2249) to
    localize and track people in the camera preview in real-time.
+3. [TF Stylize](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/android/src/org/tensorflow/demo/StylizeActivity.java):
+   Uses a model based on [A Learned Representation For Artistic
+   Style](https://arxiv.org/abs/1610.07629) to restyle the camera preview
+   image in the style of a number of different artists.
## Prebuilt APK: diff --git a/tensorflow/examples/android/bin/AndroidManifest.xml b/tensorflow/examples/android/bin/AndroidManifest.xml new file mode 100644 index 00000000000..d4792bc4823 --- /dev/null +++ b/tensorflow/examples/android/bin/AndroidManifest.xml @@ -0,0 +1,64 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tensorflow/examples/android/res/layout/camera_connection_fragment_stylize.xml b/tensorflow/examples/android/res/layout/camera_connection_fragment_stylize.xml new file mode 100644 index 00000000000..1cdb24cab03 --- /dev/null +++ b/tensorflow/examples/android/res/layout/camera_connection_fragment_stylize.xml @@ -0,0 +1,51 @@ + + + + + + + + + + + + diff --git a/tensorflow/examples/android/res/values/base-strings.xml b/tensorflow/examples/android/res/values/base-strings.xml index f6c57d5030b..56edb55def7 100644 --- a/tensorflow/examples/android/res/values/base-strings.xml +++ b/tensorflow/examples/android/res/values/base-strings.xml @@ -19,4 +19,5 @@ TensorFlow Demo TF Classify TF Detect + TF Stylize diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/ClassifierActivity.java b/tensorflow/examples/android/src/org/tensorflow/demo/ClassifierActivity.java index 853dae4b83d..387bd3f8faa 100644 --- a/tensorflow/examples/android/src/org/tensorflow/demo/ClassifierActivity.java +++ b/tensorflow/examples/android/src/org/tensorflow/demo/ClassifierActivity.java @@ -73,7 +73,7 @@ public class ClassifierActivity extends CameraActivity implements OnImageAvailab private static final boolean MAINTAIN_ASPECT = true; - private TensorFlowImageClassifier classifier; + private Classifier classifier; private Integer sensorOrientation; @@ -88,7 +88,6 @@ public class ClassifierActivity extends CameraActivity implements OnImageAvailab private boolean computing = false; - private Matrix frameToCropTransform; private Matrix cropToFrameTransform; @@ -112,17 +111,15 @@ public class ClassifierActivity extends CameraActivity implements OnImageAvailab @Override public void onPreviewSizeChosen(final Size size, final int rotation) { - final float textSizePx = TypedValue.applyDimension( - TypedValue.COMPLEX_UNIT_DIP, TEXT_SIZE_DIP, - getResources().getDisplayMetrics()); + final float textSizePx = + TypedValue.applyDimension( + TypedValue.COMPLEX_UNIT_DIP, TEXT_SIZE_DIP, getResources().getDisplayMetrics()); borderedText = new BorderedText(textSizePx); borderedText.setTypeface(Typeface.MONOSPACE); - classifier = new TensorFlowImageClassifier(); - try { - final int initStatus = - classifier.initializeTensorFlow( + classifier = + TensorFlowImageClassifier.create( getAssets(), MODEL_FILE, LABEL_FILE, @@ -132,10 +129,6 @@ public class ClassifierActivity extends CameraActivity implements OnImageAvailab IMAGE_STD, INPUT_NAME, OUTPUT_NAME); - if (initStatus != 0) { - LOGGER.e("TF init status != 0: %d", initStatus); - throw new RuntimeException(); - } } catch (final Exception e) { throw new RuntimeException("Error initializing TensorFlow!", e); } @@ -147,8 +140,7 @@ public class ClassifierActivity extends CameraActivity implements OnImageAvailab final Display display = getWindowManager().getDefaultDisplay(); final int screenOrientation = display.getRotation(); - LOGGER.i("Sensor orientation: %d, Screen orientation: %d", - rotation, screenOrientation); + LOGGER.i("Sensor orientation: %d, Screen orientation: %d", rotation, screenOrientation); sensorOrientation = rotation + screenOrientation; @@ -157,22 +149,24 @@ public class ClassifierActivity extends CameraActivity 
implements OnImageAvailab rgbFrameBitmap = Bitmap.createBitmap(previewWidth, previewHeight, Config.ARGB_8888); croppedBitmap = Bitmap.createBitmap(INPUT_SIZE, INPUT_SIZE, Config.ARGB_8888); - frameToCropTransform = ImageUtils.getTransformationMatrix( - previewWidth, previewHeight, - INPUT_SIZE, INPUT_SIZE, - sensorOrientation, MAINTAIN_ASPECT); + frameToCropTransform = + ImageUtils.getTransformationMatrix( + previewWidth, previewHeight, + INPUT_SIZE, INPUT_SIZE, + sensorOrientation, MAINTAIN_ASPECT); cropToFrameTransform = new Matrix(); frameToCropTransform.invert(cropToFrameTransform); yuvBytes = new byte[3][]; - addCallback(new DrawCallback() { - @Override - public void drawCallback(final Canvas canvas) { - renderDebug(canvas); - } - }); + addCallback( + new DrawCallback() { + @Override + public void drawCallback(final Canvas canvas) { + renderDebug(canvas); + } + }); } @Override diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/DetectorActivity.java b/tensorflow/examples/android/src/org/tensorflow/demo/DetectorActivity.java index c8aeb8ae25c..9ab5a7108ab 100644 --- a/tensorflow/examples/android/src/org/tensorflow/demo/DetectorActivity.java +++ b/tensorflow/examples/android/src/org/tensorflow/demo/DetectorActivity.java @@ -124,30 +124,19 @@ public class DetectorActivity extends CameraActivity implements OnImageAvailable tracker = new MultiBoxTracker(getResources().getDisplayMetrics()); - if (USE_YOLO) { - final TensorFlowYoloDetector yoloDetector = new TensorFlowYoloDetector(); - try { - final int initStatus = - yoloDetector.initializeTensorFlow( + try { + if (USE_YOLO) { + detector = + TensorFlowYoloDetector.create( getAssets(), YOLO_MODEL_FILE, YOLO_INPUT_SIZE, YOLO_INPUT_NAME, YOLO_OUTPUT_NAMES, YOLO_BLOCK_SIZE); - if (initStatus != 0) { - LOGGER.e("TF init status != 0: %d", initStatus); - throw new RuntimeException(); - } - } catch (final Exception e) { - throw new RuntimeException("Error initializing TensorFlow!", e); - } - detector = yoloDetector; - } else { - final TensorFlowMultiBoxDetector multiBoxDetector = new TensorFlowMultiBoxDetector(); - try { - final int initStatus = - multiBoxDetector.initializeTensorFlow( + } else { + detector = + TensorFlowMultiBoxDetector.create( getAssets(), MB_MODEL_FILE, MB_LOCATION_FILE, @@ -157,14 +146,9 @@ public class DetectorActivity extends CameraActivity implements OnImageAvailable MB_IMAGE_STD, MB_INPUT_NAME, MB_OUTPUT_NAMES); - if (initStatus != 0) { - LOGGER.e("TF init status != 0: %d", initStatus); - throw new RuntimeException(); - } - } catch (final Exception e) { - throw new RuntimeException("Error initializing TensorFlow!", e); } - detector = multiBoxDetector; + } catch (final Exception e) { + throw new RuntimeException("Error initializing TensorFlow!", e); } previewWidth = size.getWidth(); @@ -249,6 +233,7 @@ public class DetectorActivity extends CameraActivity implements OnImageAvailable } OverlayView trackingOverlay; + @Override public void onImageAvailable(final ImageReader reader) { Image image = null; diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/StylizeActivity.java b/tensorflow/examples/android/src/org/tensorflow/demo/StylizeActivity.java new file mode 100644 index 00000000000..8a3c7a4ef92 --- /dev/null +++ b/tensorflow/examples/android/src/org/tensorflow/demo/StylizeActivity.java @@ -0,0 +1,662 @@ +/* + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.tensorflow.demo; + +import android.content.Context; +import android.content.res.AssetManager; +import android.graphics.Bitmap; +import android.graphics.Bitmap.Config; +import android.graphics.BitmapFactory; +import android.graphics.Canvas; +import android.graphics.Color; +import android.graphics.Matrix; +import android.graphics.Paint; +import android.graphics.Paint.Style; +import android.graphics.Rect; +import android.graphics.Typeface; +import android.media.Image; +import android.media.Image.Plane; +import android.media.ImageReader; +import android.media.ImageReader.OnImageAvailableListener; +import android.os.Bundle; +import android.os.SystemClock; +import android.os.Trace; +import android.util.Size; +import android.util.TypedValue; +import android.view.Display; +import android.view.MotionEvent; +import android.view.View; +import android.view.View.OnClickListener; +import android.view.View.OnTouchListener; +import android.view.ViewGroup; +import android.widget.BaseAdapter; +import android.widget.Button; +import android.widget.GridView; +import android.widget.ImageView; +import android.widget.Toast; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Vector; +import org.tensorflow.contrib.android.TensorFlowInferenceInterface; +import org.tensorflow.demo.OverlayView.DrawCallback; +import org.tensorflow.demo.env.BorderedText; +import org.tensorflow.demo.env.ImageUtils; +import org.tensorflow.demo.env.Logger; +import org.tensorflow.demo.R; + +/** + * Sample activity that stylizes the camera preview according to "A Learned Representation For + * Artistic Style" (https://arxiv.org/abs/1610.07629) + */ +public class StylizeActivity extends CameraActivity implements OnImageAvailableListener { + static { + System.loadLibrary("tensorflow_demo"); + } + + private static final Logger LOGGER = new Logger(); + + private static final String MODEL_FILE = "file:///android_asset/stylize_quantized.pb"; + private static final String INPUT_NODE = "input:0"; + private static final String STYLE_NODE = "style_num:0"; + private static final String OUTPUT_NODE = "transformer/expand/conv3/conv/Sigmoid"; + private static final int NUM_STYLES = 26; + + private static final boolean SAVE_PREVIEW_BITMAP = false; + + // Whether to actively manipulate non-selected sliders so that sum of activations always appears + // to be 1.0. The actual style input tensor will be normalized to sum to 1.0 regardless. + private static final boolean NORMALIZE_SLIDERS = true; + + private static final float TEXT_SIZE_DIP = 12; + + private static final boolean DEBUG_MODEL = false; + + private static final int[] SIZES = {32, 48, 64, 96, 128, 192, 256, 384, 512, 768, 1024}; + + // Start at a medium size, but let the user step up through smaller sizes so they don't get + // immediately stuck processing a large image. 
+ private int desiredSizeIndex = -1; + private int desiredSize = 256; + private int initializedSize = 0; + + private Integer sensorOrientation; + + private int previewWidth = 0; + private int previewHeight = 0; + private byte[][] yuvBytes; + private int[] rgbBytes = null; + private Bitmap rgbFrameBitmap = null; + private Bitmap croppedBitmap = null; + + private final float[] styleVals = new float[NUM_STYLES]; + private int[] intValues; + private float[] floatValues; + + private int frameNum = 0; + + private Bitmap cropCopyBitmap; + private Bitmap textureCopyBitmap; + + private boolean computing = false; + + private Matrix frameToCropTransform; + private Matrix cropToFrameTransform; + + private BorderedText borderedText; + + private long lastProcessingTimeMs; + + private TensorFlowInferenceInterface inferenceInterface; + + private int lastOtherStyle = 1; + + private boolean allZero = false; + + private ImageGridAdapter adapter; + private GridView grid; + + private final OnTouchListener gridTouchAdapter = + new OnTouchListener() { + ImageSlider slider = null; + + @Override + public boolean onTouch(final View v, final MotionEvent event) { + switch (event.getActionMasked()) { + case MotionEvent.ACTION_DOWN: + for (int i = 0; i < NUM_STYLES; ++i) { + final ImageSlider child = adapter.items[i]; + final Rect rect = new Rect(); + child.getHitRect(rect); + if (rect.contains((int) event.getX(), (int) event.getY())) { + slider = child; + slider.setHilighted(true); + } + } + break; + + case MotionEvent.ACTION_MOVE: + if (slider != null) { + final Rect rect = new Rect(); + slider.getHitRect(rect); + + final float newSliderVal = + (float) + Math.min( + 1.0, + Math.max( + 0.0, 1.0 - (event.getY() - slider.getTop()) / slider.getHeight())); + + setStyle(slider, newSliderVal); + } + break; + + case MotionEvent.ACTION_UP: + if (slider != null) { + slider.setHilighted(false); + slider = null; + } + break; + } + return true; + } + }; + + @Override + public void onCreate(final Bundle savedInstanceState) { + super.onCreate(savedInstanceState); + } + + @Override + protected int getLayoutId() { + return R.layout.camera_connection_fragment_stylize; + } + + @Override + protected int getDesiredPreviewFrameSize() { + return SIZES[SIZES.length - 1]; + } + + public static Bitmap getBitmapFromAsset(final Context context, final String filePath) { + final AssetManager assetManager = context.getAssets(); + + Bitmap bitmap = null; + try { + final InputStream inputStream = assetManager.open(filePath); + bitmap = BitmapFactory.decodeStream(inputStream); + } catch (final IOException e) { + LOGGER.e("Error opening bitmap!", e); + } + + return bitmap; + } + + private class ImageSlider extends ImageView { + private float value = 0.0f; + private boolean hilighted = false; + + private final Paint boxPaint; + private final Paint linePaint; + + public ImageSlider(final Context context) { + super(context); + value = 0.0f; + + boxPaint = new Paint(); + boxPaint.setColor(Color.BLACK); + boxPaint.setAlpha(128); + + linePaint = new Paint(); + linePaint.setColor(Color.WHITE); + linePaint.setStrokeWidth(10.0f); + linePaint.setStyle(Style.STROKE); + } + + @Override + public void onDraw(final Canvas canvas) { + super.onDraw(canvas); + final float y = (1.0f - value) * canvas.getHeight(); + + // If all sliders are zero, don't bother shading anything. 
+ if (!allZero) { + canvas.drawRect(0, 0, canvas.getWidth(), y, boxPaint); + } + + if (value > 0.0f) { + canvas.drawLine(0, y, canvas.getWidth(), y, linePaint); + } + + if (hilighted) { + canvas.drawRect(0, 0, getWidth(), getHeight(), linePaint); + } + } + + @Override + protected void onMeasure(final int widthMeasureSpec, final int heightMeasureSpec) { + super.onMeasure(widthMeasureSpec, heightMeasureSpec); + setMeasuredDimension(getMeasuredWidth(), getMeasuredWidth()); + } + + public void setValue(final float value) { + this.value = value; + postInvalidate(); + } + + public void setHilighted(final boolean highlighted) { + this.hilighted = highlighted; + this.postInvalidate(); + } + } + + private class ImageGridAdapter extends BaseAdapter { + final ImageSlider[] items = new ImageSlider[NUM_STYLES]; + final ArrayList