diff --git a/WORKSPACE b/WORKSPACE index d3e01b76371..4ec77c790b9 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -46,7 +46,7 @@ new_git_repository( new_git_repository( name = "font_roboto", build_file = "bower.BUILD", - remote = "https://github.com/PolymerElements/font-roboto.git", + remote = "https://github.com/polymerelements/font-roboto.git", tag = "v1.0.1", ) @@ -60,49 +60,49 @@ new_git_repository( new_git_repository( name = "iron_a11y_announcer", build_file = "bower.BUILD", - remote = "https://github.com/PolymerElements/iron-a11y-announcer.git", + remote = "https://github.com/polymerelements/iron-a11y-announcer.git", tag = "v1.0.4", ) new_git_repository( name = "iron_a11y_keys_behavior", build_file = "bower.BUILD", - remote = "https://github.com/PolymerElements/iron-a11y-keys-behavior.git", + remote = "https://github.com/polymerelements/iron-a11y-keys-behavior.git", tag = "v1.1.2", ) new_git_repository( name = "iron_ajax", build_file = "bower.BUILD", - remote = "https://github.com/PolymerElements/iron-ajax.git", + remote = "https://github.com/polymerelements/iron-ajax.git", tag = "v1.1.1", ) new_git_repository( name = "iron_autogrow_textarea", build_file = "bower.BUILD", - remote = "https://github.com/PolymerElements/iron-autogrow-textarea.git", + remote = "https://github.com/polymerelements/iron-autogrow-textarea.git", tag = "v1.0.12", ) new_git_repository( name = "iron_behaviors", build_file = "bower.BUILD", - remote = "https://github.com/PolymerElements/iron-behaviors.git", + remote = "https://github.com/polymerelements/iron-behaviors.git", tag = "v1.0.13", ) new_git_repository( name = "iron_checked_element_behavior", build_file = "bower.BUILD", - remote = "https://github.com/PolymerElements/iron-checked-element-behavior.git", + remote = "https://github.com/polymerelements/iron-checked-element-behavior.git", tag = "v1.0.4", ) new_git_repository( name = "iron_collapse", build_file = "bower.BUILD", - remote = "https://github.com/PolymerElements/iron-collapse.git", + remote = "https://github.com/polymerelements/iron-collapse.git", tag = "v1.0.6", ) @@ -116,7 +116,7 @@ new_git_repository( new_git_repository( name = "iron_fit_behavior", build_file = "bower.BUILD", - remote = "https://github.com/PolymerElements/iron-fit-behavior.git", + remote = "https://github.com/polymerelements/iron-fit-behavior.git", tag = "v1.0.6", ) @@ -130,7 +130,7 @@ new_git_repository( new_git_repository( name = "iron_form_element_behavior", build_file = "bower.BUILD", - remote = "https://github.com/PolymerElements/iron-form-element-behavior.git", + remote = "https://github.com/polymerelements/iron-form-element-behavior.git", tag = "v1.0.6", ) @@ -151,28 +151,28 @@ new_git_repository( new_git_repository( name = "iron_iconset_svg", build_file = "bower.BUILD", - remote = "https://github.com/PolymerElements/iron-iconset-svg.git", + remote = "https://github.com/polymerelements/iron-iconset-svg.git", tag = "v1.0.9", ) new_git_repository( name = "iron_input", build_file = "bower.BUILD", - remote = "https://github.com/PolymerElements/iron-input.git", + remote = "https://github.com/polymerelements/iron-input.git", tag = "v1.0.9", ) new_git_repository( name = "iron_list", build_file = "bower.BUILD", - remote = "https://github.com/PolymerElements/iron-list.git", + remote = "https://github.com/polymerelements/iron-list.git", tag = "v1.1.7", ) new_git_repository( name = "iron_menu_behavior", build_file = "bower.BUILD", - remote = "https://github.com/PolymerElements/iron-menu-behavior.git", + remote = 
"https://github.com/polymerelements/iron-menu-behavior.git", tag = "v1.1.5", ) @@ -187,13 +187,13 @@ new_git_repository( name = "iron_overlay_behavior", build_file = "bower.BUILD", remote = "https://github.com/polymerelements/iron-overlay-behavior.git", - tag = "v1.6.1", + tag = "v1.6.2", ) new_git_repository( name = "iron_range_behavior", build_file = "bower.BUILD", - remote = "https://github.com/PolymerElements/iron-range-behavior.git", + remote = "https://github.com/polymerelements/iron-range-behavior.git", tag = "v1.0.4", ) @@ -207,14 +207,14 @@ new_git_repository( new_git_repository( name = "iron_selector", build_file = "bower.BUILD", - remote = "https://github.com/PolymerElements/iron-selector.git", + remote = "https://github.com/polymerelements/iron-selector.git", tag = "v1.2.4", ) new_git_repository( name = "iron_validatable_behavior", build_file = "bower.BUILD", - remote = "https://github.com/PolymerElements/iron-validatable-behavior.git", + remote = "https://github.com/polymerelements/iron-validatable-behavior.git", tag = "v1.0.5", ) @@ -235,56 +235,56 @@ new_git_repository( new_git_repository( name = "paper_behaviors", build_file = "bower.BUILD", - remote = "https://github.com/PolymerElements/paper-behaviors.git", + remote = "https://github.com/polymerelements/paper-behaviors.git", tag = "v1.0.11", ) new_git_repository( name = "paper_button", build_file = "bower.BUILD", - remote = "https://github.com/PolymerElements/paper-button.git", + remote = "https://github.com/polymerelements/paper-button.git", tag = "v1.0.11", ) new_git_repository( name = "paper_checkbox", build_file = "bower.BUILD", - remote = "https://github.com/PolymerElements/paper-checkbox.git", + remote = "https://github.com/polymerelements/paper-checkbox.git", tag = "v1.1.3", ) new_git_repository( name = "paper_dropdown_menu", build_file = "bower.BUILD", - remote = "https://github.com/PolymerElements/paper-dropdown-menu.git", + remote = "https://github.com/polymerelements/paper-dropdown-menu.git", tag = "v1.1.3", ) new_git_repository( name = "paper_header_panel", build_file = "bower.BUILD", - remote = "https://github.com/PolymerElements/paper-header-panel.git", + remote = "https://github.com/polymerelements/paper-header-panel.git", tag = "v1.1.4", ) new_git_repository( name = "paper_icon_button", build_file = "bower.BUILD", - remote = "https://github.com/PolymerElements/paper-icon-button.git", + remote = "https://github.com/polymerelements/paper-icon-button.git", tag = "v1.0.6", ) new_git_repository( name = "paper_input", build_file = "bower.BUILD", - remote = "https://github.com/PolymerElements/paper-input.git", + remote = "https://github.com/polymerelements/paper-input.git", tag = "v1.1.5", ) new_git_repository( name = "paper_item", build_file = "bower.BUILD", - remote = "https://github.com/PolymerElements/paper-item.git", + remote = "https://github.com/polymerelements/paper-item.git", tag = "v1.1.4", ) @@ -298,7 +298,7 @@ new_git_repository( new_git_repository( name = "paper_menu", build_file = "bower.BUILD", - remote = "https://github.com/PolymerElements/paper-menu.git", + remote = "https://github.com/polymerelements/paper-menu.git", tag = "v1.2.2", ) @@ -306,27 +306,27 @@ new_git_repository( name = "paper_menu_button", build_file = "bower.BUILD", remote = "https://github.com/polymerelements/paper-menu-button.git", - tag = "v1.0.4", + tag = "v1.1.0", ) new_git_repository( name = "paper_progress", build_file = "bower.BUILD", - remote = "https://github.com/PolymerElements/paper-progress.git", - tag = "v1.0.8", + 
remote = "https://github.com/polymerelements/paper-progress.git", + tag = "v1.0.9", ) new_git_repository( name = "paper_radio_button", build_file = "bower.BUILD", - remote = "https://github.com/PolymerElements/paper-radio-button.git", + remote = "https://github.com/polymerelements/paper-radio-button.git", tag = "v1.1.1", ) new_git_repository( name = "paper_radio_group", build_file = "bower.BUILD", - remote = "https://github.com/PolymerElements/paper-radio-group.git", + remote = "https://github.com/polymerelements/paper-radio-group.git", tag = "v1.0.9", ) @@ -340,35 +340,35 @@ new_git_repository( new_git_repository( name = "paper_slider", build_file = "bower.BUILD", - remote = "https://github.com/PolymerElements/paper-slider.git", + remote = "https://github.com/polymerelements/paper-slider.git", tag = "v1.0.8", ) new_git_repository( name = "paper_styles", build_file = "bower.BUILD", - remote = "https://github.com/PolymerElements/paper-styles.git", + remote = "https://github.com/polymerelements/paper-styles.git", tag = "v1.1.1", ) new_git_repository( name = "paper_tabs", build_file = "bower.BUILD", - remote = "https://github.com/PolymerElements/paper-tabs.git", + remote = "https://github.com/polymerelements/paper-tabs.git", tag = "v1.2.4", ) new_git_repository( name = "paper_toggle_button", build_file = "bower.BUILD", - remote = "https://github.com/PolymerElements/paper-toggle-button.git", + remote = "https://github.com/polymerelements/paper-toggle-button.git", tag = "v1.0.12", ) new_git_repository( name = "paper_toolbar", build_file = "bower.BUILD", - remote = "https://github.com/PolymerElements/paper-toolbar.git", + remote = "https://github.com/polymerelements/paper-toolbar.git", tag = "v1.1.2", ) @@ -382,7 +382,7 @@ new_git_repository( new_git_repository( name = "polymer", build_file = "bower.BUILD", - remote = "https://github.com/Polymer/polymer.git", + remote = "https://github.com/polymer/polymer.git", tag = "v1.4.0", ) @@ -403,6 +403,6 @@ new_git_repository( new_git_repository( name = "webcomponentsjs", build_file = "bower.BUILD", - remote = "https://github.com/Polymer/webcomponentsjs.git", + remote = "https://github.com/polymer/webcomponentsjs.git", tag = "v0.7.21", ) diff --git a/eigen.BUILD b/eigen.BUILD index b58c541e3e7..44f5e5bea22 100644 --- a/eigen.BUILD +++ b/eigen.BUILD @@ -1,6 +1,6 @@ package(default_visibility = ["//visibility:public"]) -archive_dir = "eigen-eigen-6e521c802bf5" +archive_dir = "eigen-eigen-3f653ace7d28" cc_library( name = "eigen", diff --git a/tensorflow/BUILD b/tensorflow/BUILD index 9d51c03b7cf..45d4ab1a42d 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -20,6 +20,15 @@ config_setting( visibility = ["//visibility:public"], ) +config_setting( + name = "android_arm", + values = { + "crosstool_top": "//external:android/crosstool", + "android_cpu": "armeabi-v7a", + }, + visibility = ["//visibility:public"], +) + config_setting( name = "darwin", values = {"cpu": "darwin"}, diff --git a/tensorflow/contrib/cmake/external/eigen.cmake b/tensorflow/contrib/cmake/external/eigen.cmake index 377ea8b5e6a..d8ced6b2dd8 100644 --- a/tensorflow/contrib/cmake/external/eigen.cmake +++ b/tensorflow/contrib/cmake/external/eigen.cmake @@ -7,7 +7,7 @@ include (ExternalProject) -set(eigen_archive_hash "6e521c802bf5") +set(eigen_archive_hash "3f653ace7d28") set(eigen_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR} @@ -16,7 +16,7 @@ set(eigen_INCLUDE_DIRS ${tensorflow_source_dir}/third_party/eigen3 ) set(eigen_URL https://bitbucket.org/eigen/eigen/get/${eigen_archive_hash}.tar.gz) 
-set(eigen_HASH SHA256=f1b4b4401d08d0d44128ab80ebe76633363dab20c29b1bf2370aed8b4893cc5e) +set(eigen_HASH SHA256=b49502f423deda55cea33bc503f84409cca92157f3b536d17113b81138f86715) set(eigen_BUILD ${CMAKE_CURRENT_BINARY_DIR}/eigen/src/eigen) set(eigen_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/eigen/install) diff --git a/tensorflow/contrib/framework/python/framework/tensor_util_test.py b/tensorflow/contrib/framework/python/framework/tensor_util_test.py index fa475c1fbb4..8c3cf3ac92a 100644 --- a/tensorflow/contrib/framework/python/framework/tensor_util_test.py +++ b/tensorflow/contrib/framework/python/framework/tensor_util_test.py @@ -752,7 +752,7 @@ class WithShapeTest(tf.test.TestCase): for incompatible_shape in [[0], [1]]: self.assertRaisesRegexp( - ValueError, "must have the same rank", + ValueError, r"Shapes \(\?, 2\) and \([01],\) are not compatible", tf.contrib.framework.with_shape, incompatible_shape, tensor_partial_shape) for incompatible_shape in [[1, 2, 1]]: @@ -761,7 +761,7 @@ class WithShapeTest(tf.test.TestCase): incompatible_shape, tensor_partial_shape) for incompatible_shape in [[2, 1]]: self.assertRaisesRegexp( - ValueError, "Dimensions.*are not compatible", + ValueError, r"Shapes \(\?, 2\) and \(2, 1\) are not compatible", tf.contrib.framework.with_shape, incompatible_shape, tensor_partial_shape) diff --git a/tensorflow/contrib/linear_optimizer/kernels/resources_test.cc b/tensorflow/contrib/linear_optimizer/kernels/resources_test.cc index 4b4c2f5fd7e..060d29daabb 100644 --- a/tensorflow/contrib/linear_optimizer/kernels/resources_test.cc +++ b/tensorflow/contrib/linear_optimizer/kernels/resources_test.cc @@ -164,7 +164,6 @@ TEST_F(DataByExampleTest, VisitUnavailable) { signal(&updated_data); }); wait(&completed_visit); - EXPECT_FALSE(thread_pool.HasPendingClosures()); EXPECT_TRUE(errors::IsUnavailable(status)); } diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index e7d9ab13a0b..0af7f54a164 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -245,6 +245,7 @@ tf_cuda_library( "framework/register_types.h", "framework/resource_mgr.h", "framework/selective_registration.h", + "framework/session_state.h", "framework/tensor.h", "framework/tensor_shape.h", "framework/tensor_slice.h", @@ -267,6 +268,7 @@ tf_cuda_library( "util/saved_tensor_slice_util.h", "util/sparse/group_iterator.h", "util/sparse/sparse_tensor.h", + "util/stat_summarizer.h", "util/tensor_format.h", "util/tensor_slice_reader.h", "util/tensor_slice_reader_cache.h", @@ -856,6 +858,7 @@ filegroup( "framework/partial_tensor_shape.h", "framework/rendezvous.h", "framework/selective_registration.h", + "framework/session_state.h", "framework/tensor.h", "framework/tensor_reference.h", "framework/tensor_shape.h", @@ -1268,6 +1271,7 @@ tf_cc_test( "//tensorflow/core/kernels:matmul_op", "//tensorflow/core/kernels:ops_util", "//tensorflow/core/kernels:queue_ops", + "//tensorflow/core/kernels:session_ops", "//tensorflow/core/kernels:variable_ops", "//third_party/eigen3", ], diff --git a/tensorflow/core/common_runtime/constant_folding.cc b/tensorflow/core/common_runtime/constant_folding.cc index 07f08c55771..71dbd6d6809 100644 --- a/tensorflow/core/common_runtime/constant_folding.cc +++ b/tensorflow/core/common_runtime/constant_folding.cc @@ -50,6 +50,11 @@ bool IsConstantFoldable(const Node* n, if (n->IsControlFlow() || n->IsSend() || n->IsRecv()) { return false; } + // TODO(yuanbyu): For now disable these session handle operations. 
+  if (n->IsGetSessionHandle() || n->IsGetSessionTensor() ||
+      n->IsDeleteSessionTensor()) {
+    return false;
+  }
   if (n->IsSource()) {
     return false;
   }
diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc
index 00f1edd0bad..67605e23e53 100644
--- a/tensorflow/core/common_runtime/direct_session.cc
+++ b/tensorflow/core/common_runtime/direct_session.cc
@@ -313,6 +313,8 @@ Status DirectSession::Run(const RunOptions& run_options,
   args.rendezvous = run_state.rendez;
   args.cancellation_manager = cancellation_manager_;
   args.runner = [this](Executor::Args::Closure c) { SchedClosure(c); };
+  args.session_state = &session_state_;
+  args.tensor_store = &run_state.tensor_store;
   if (LogMemory::IsEnabled()) {
     LogMemory::RecordStep(args.step_id, run_state_args.handle);
   }
@@ -340,6 +342,11 @@ Status DirectSession::Run(const RunOptions& run_options,
   // Receive outputs.
   TF_RETURN_IF_ERROR(
       RecvOutputs(output_names, executors_and_keys, &run_state, outputs));
+
+  // Save the output tensors of this run that we choose to keep.
+  TF_RETURN_IF_ERROR(
+      run_state.tensor_store.SaveTensors(output_names, &session_state_));
+
   return Status::OK();
 }
@@ -369,9 +376,8 @@ Status DirectSession::PRunSetup(const std::vector<string>& input_names,
   {
     mutex_lock l(executor_lock_);
     if (!partial_runs_.insert({run_state_args.handle, run_state}).second) {
-      return errors::Internal("The handle ", run_state_args.handle,
-                              " created for this partial"
-                              " run is not unique.");
+      return errors::Internal("The handle '", run_state_args.handle,
+                              "' created for this partial run is not unique.");
     }
   }
@@ -390,13 +396,12 @@ Status DirectSession::PRunSetup(const std::vector<string>& input_names,
   });
 
   Executor::Args args;
-  {
-    mutex_lock l(mu_);
-    args.step_id = name_counter_++;
-  }
+  args.step_id = step_id_counter_.fetch_add(1);
   args.rendezvous = run_state->rendez;
   args.cancellation_manager = cancellation_manager_;
   args.runner = [this](Executor::Args::Closure c) { SchedClosure(c); };
+  args.session_state = &session_state_;
+  args.tensor_store = &run_state->tensor_store;
   if (LogMemory::IsEnabled()) {
     LogMemory::RecordStep(args.step_id, run_state_args.handle);
   }
@@ -470,9 +475,14 @@ Status DirectSession::PRun(const string& handle, const NamedTensorList& inputs,
     s = RecvOutputs(output_names, executors_and_keys, run_state, outputs);
   }
 
-  // Delete the run state if there is an error or all fetches are done.
+  // Save the output tensors of this run that we choose to keep.
+  if (s.ok()) {
+    s = run_state->tensor_store.SaveTensors(output_names, &session_state_);
+  }
+
   {
     mutex_lock l(executor_lock_);
+    // Delete the run state if there is an error or all fetches are done.
     bool done = true;
     if (s.ok()) {
       {
@@ -911,7 +921,7 @@ Status DirectSession::CreateGraphs(gtl::ArraySlice<string> feeds,
   // allow.
   device_opts.allow_internal_ops = true;
   device_opts.expect_device_spec = true;
-  Status s = ConvertGraphDefToGraph(device_opts, *graph_def, device_graph);
+  s = ConvertGraphDefToGraph(device_opts, *graph_def, device_graph);
   if (!s.ok()) {
     delete device_graph;
     break;
diff --git a/tensorflow/core/common_runtime/direct_session.h b/tensorflow/core/common_runtime/direct_session.h
index 15c3b2625ac..a35036ecd81 100644
--- a/tensorflow/core/common_runtime/direct_session.h
+++ b/tensorflow/core/common_runtime/direct_session.h
@@ -29,6 +29,7 @@ limitations under the License.
#include "tensorflow/core/common_runtime/rendezvous_mgr.h" #include "tensorflow/core/framework/cancellation.h" #include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/session_state.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" @@ -78,6 +79,7 @@ class DirectSession : public Session { ::tensorflow::Status PRun(const string& handle, const NamedTensorList& inputs, const std::vector& output_names, std::vector* outputs) override; + ::tensorflow::Status Close() override; // NOTE: This is a temporary api that is only meant to enable testing. @@ -135,6 +137,7 @@ class DirectSession : public Session { Notification executors_done; std::unordered_set pending_inputs; std::unordered_set pending_outputs; + TensorStore tensor_store; RunState(const std::vector& input_names, const std::vector& output_names) { @@ -146,6 +149,7 @@ class DirectSession : public Session { pending_outputs.emplace(name); } } + ~RunState(); }; @@ -228,6 +232,9 @@ class DirectSession : public Session { std::unordered_map partial_runs_ GUARDED_BY(executor_lock_); + // This holds all the tensors that are currently alive in the session. + SessionState session_state_; + CancellationManager* cancellation_manager_; // Saves and restores device placements for stateful nodes. diff --git a/tensorflow/core/common_runtime/direct_session_test.cc b/tensorflow/core/common_runtime/direct_session_test.cc index b0255fe1180..75a1235f0b0 100644 --- a/tensorflow/core/common_runtime/direct_session_test.cc +++ b/tensorflow/core/common_runtime/direct_session_test.cc @@ -564,6 +564,77 @@ TEST(DirectSessionTest, PartialRunMultiOutputFeed) { ASSERT_EQ(true, outputs[0].flat()(0)); } +TEST(DirectSessionTest, RunHandleTest) { + GraphDef def; + Graph g(OpRegistry::Global()); + + Tensor value0(DT_FLOAT, TensorShape({})); + value0.scalar()() = 1.0; + Node* const0 = test::graph::Constant(&g, value0); + Node* identity0 = test::graph::Identity(&g, const0); + + Tensor value1(DT_FLOAT, TensorShape({})); + value1.scalar()() = 2.0; + Node* const1 = test::graph::Constant(&g, value1); + Node* node3 = test::graph::Add(&g, identity0, const1); + Node* node4 = test::graph::Unary(&g, "GetSessionHandle", node3); + + Tensor value2(DT_STRING, TensorShape({})); + Node* const2 = test::graph::Constant(&g, value2); + Node* node5 = test::graph::GetSessionTensor(&g, const2); + Node* node6 = test::graph::Add(&g, node5, const1); + + Node* node7 = test::graph::Unary(&g, "DeleteSessionTensor", const2); + + test::graph::ToGraphDef(&g, &def); + + std::unique_ptr session(CreateSession()); + ASSERT_TRUE(session != nullptr); + TF_ASSERT_OK(session->Create(def)); + + // First run call: Create a handle. + std::vector outputs; + Status s = session->Run({}, {node4->name() + ":0"}, {}, &outputs); + ASSERT_TRUE(s.ok()); + ASSERT_EQ(1, outputs.size()); + + // Second run call: Use a handle. + std::vector outputs1; + s = session->Run({{const2->name(), outputs[0]}}, {node6->name() + ":0"}, {}, + &outputs1); + ASSERT_TRUE(s.ok()); + ASSERT_EQ(1, outputs1.size()); + ASSERT_EQ(5.0, outputs1[0].flat()(0)); + + // Third run call: Delete a handle. 
+  std::vector<Tensor> outputs2;
+  s = session->Run({{const2->name(), outputs[0]}}, {}, {node7->name()},
+                   &outputs2);
+  ASSERT_TRUE(s.ok());
+}
+
+TEST(DirectSessionTest, CreateGraphFailsWhenAssigningAFedVar) {
+  Graph graph(OpRegistry::Global());
+
+  Node* a = test::graph::Var(&graph, DT_FLOAT, {});
+  Node* b = test::graph::Constant(&graph, {});
+
+  Tensor zero(DT_FLOAT, {});
+  test::FillValues<float>(&zero, {0});
+
+  // a = b
+  Node* assign = test::graph::Assign(&graph, a, b);
+
+  std::unique_ptr<Session> session(CreateSession());
+  ASSERT_TRUE(session != nullptr);
+
+  // The graph is invalid because the variable 'a' is both fed and assigned
+  // to in the same run. The returned Status of session->Run should flag
+  // this as an invalid argument.
+  std::vector<Tensor> outputs;
+  Status s = session->Run({{a->name(), zero}}, {assign->name()}, {}, &outputs);
+  ASSERT_TRUE(errors::IsInvalidArgument(s));
+}
+
 TEST(DirectSessionTest, TimeoutSession) {
   GraphDef graph;
   // Creates a graph with one FIFOQueue and one dequeue op.
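The RunHandleTest above doubles as the clearest picture of the intended client-side flow. As a quick reference, the three steps look roughly like this (a sketch only, not part of this patch; the node names are illustrative):

    // Run 1: fetch a GetSessionHandle node; the result is a string handle
    // naming a tensor now kept in the session state, not the value itself.
    std::vector<Tensor> handle_out;
    TF_CHECK_OK(session->Run({}, {"get_handle:0"}, {}, &handle_out));

    // Run 2: feed the handle into a GetSessionTensor node so a later step
    // can use the stored value.
    std::vector<Tensor> value_out;
    TF_CHECK_OK(session->Run({{"handle_in", handle_out[0]}}, {"use_value:0"},
                             {}, &value_out));

    // Run 3: run a DeleteSessionTensor node to release the stored tensor.
    std::vector<Tensor> ignored;
    TF_CHECK_OK(session->Run({{"handle_in", handle_out[0]}}, {},
                             {"delete_value"}, &ignored));

diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc
index 1051fe71932..87868462bba 100644
--- a/tensorflow/core/common_runtime/executor.cc
+++ b/tensorflow/core/common_runtime/executor.cc
@@ -645,6 +645,8 @@ class ExecutorState {
   int64 step_id_;
   // Not owned.
   Rendezvous* rendezvous_;
+  SessionState* session_state_;
+  TensorStore* tensor_store_;
   StepStatsCollector* stats_collector_;
   // QUESTION: Make it a checkpoint::TensorSliceReaderCacheWrapper
   // instead of a pointer? (avoids having to delete).
@@ -793,6 +795,8 @@ class ExecutorState {
 ExecutorState::ExecutorState(const Executor::Args& args, ExecutorImpl* impl)
     : step_id_(args.step_id),
       rendezvous_(args.rendezvous),
+      session_state_(args.session_state),
+      tensor_store_(args.tensor_store),
       stats_collector_(args.stats_collector),
       slice_reader_cache_(new checkpoint::TensorSliceReaderCacheWrapper),
       call_frame_(args.call_frame),
@@ -938,6 +942,8 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_usec) {
     // track allocations if and only if we are collecting statistics
     params.track_allocations = (stats_collector_ != nullptr);
     params.rendezvous = rendezvous_;
+    params.session_state = session_state_;
+    params.tensor_store = tensor_store_;
     params.cancellation_manager = cancellation_manager_;
     params.call_frame = call_frame_;
     params.function_library = impl_->params_.function_library;
diff --git a/tensorflow/core/common_runtime/executor.h b/tensorflow/core/common_runtime/executor.h
index 1d4972d04df..b013927980c 100644
--- a/tensorflow/core/common_runtime/executor.h
+++ b/tensorflow/core/common_runtime/executor.h
@@ -18,6 +18,7 @@ limitations under the License.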
#include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/framework/rendezvous.h" +#include "tensorflow/core/framework/session_state.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/graph/graph.h" #include "tensorflow/core/lib/core/notification.h" @@ -85,6 +86,8 @@ class Executor { StepStatsCollector* stats_collector = nullptr; FunctionCallFrame* call_frame = nullptr; CancellationManager* cancellation_manager = nullptr; + SessionState* session_state = nullptr; + TensorStore* tensor_store = nullptr; typedef std::function Closure; typedef std::function Runner; diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc index eac00fc757a..2fa707eacc2 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc @@ -21,6 +21,7 @@ limitations under the License. #include #include "tensorflow/core/common_runtime/gpu/gpu_init.h" +#include "tensorflow/core/lib/core/threadpool.h" #include "tensorflow/core/lib/gtl/inlined_vector.h" #include "tensorflow/core/lib/random/simple_philox.h" #include "tensorflow/core/platform/logging.h" diff --git a/tensorflow/core/common_runtime/session_state.cc b/tensorflow/core/common_runtime/session_state.cc new file mode 100644 index 00000000000..10e614cce58 --- /dev/null +++ b/tensorflow/core/common_runtime/session_state.cc @@ -0,0 +1,83 @@ +/* Copyright 2015 Google Inc. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+
+#include "tensorflow/core/framework/session_state.h"
+#include "tensorflow/core/graph/tensor_id.h"
+
+namespace tensorflow {
+
+Status SessionState::GetTensor(const string& handle, Tensor* tensor) {
+  mutex_lock l(state_lock_);
+  auto it = tensors_.find(handle);
+  if (it == tensors_.end()) {
+    return errors::InvalidArgument("The tensor with handle '", handle,
+                                   "' is not in the session store.");
+  }
+  *tensor = it->second;
+  return Status::OK();
+}
+
+Status SessionState::AddTensor(const string& handle, const Tensor& tensor) {
+  mutex_lock l(state_lock_);
+  if (!tensors_.insert({handle, tensor}).second) {
+    return errors::InvalidArgument("Failed to add a tensor with handle '",
+                                   handle, "' to the session store.");
+  }
+  return Status::OK();
+}
+
+Status SessionState::DeleteTensor(const string& handle) {
+  mutex_lock l(state_lock_);
+  if (tensors_.erase(handle) == 0) {
+    return errors::InvalidArgument("Failed to delete a tensor with handle '",
+                                   handle, "' in the session store.");
+  }
+  return Status::OK();
+}
+
+int64 SessionState::GetNewId() {
+  mutex_lock l(state_lock_);
+  return tensor_id_++;
+}
+
+Status TensorStore::AddTensor(const string& name, const TensorAndKey& tk) {
+  mutex_lock l(lock_);
+  if (!tensors_.insert({name, tk}).second) {
+    return errors::InvalidArgument("Failed to add a tensor with name '", name,
+                                   "' to the tensor store.");
+  }
+  return Status::OK();
+}
+
+Status TensorStore::SaveTensors(const std::vector<string>& output_names,
+                                SessionState* session_state) {
+  mutex_lock l(lock_);
+  if (tensors_.size() != 0) {
+    // Save only the tensors in output_names in the session.
+    for (const string& name : output_names) {
+      TensorId id(ParseTensorName(name));
+      const string& op_name = id.first.ToString();
+      auto it = tensors_.find(op_name);
+      if (it != tensors_.end()) {
+        // Save the tensor to the session state.
+        string key = it->second.GetHandle(op_name);
+        TF_RETURN_IF_ERROR(session_state->AddTensor(key, it->second.tensor));
+      }
+    }
+  }
+  return Status::OK();
+}
+
+} // namespace tensorflow
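How a kernel is expected to feed this machinery, as a rough sketch (the kernel body below is hypothetical; the real producer is the GetSessionHandle kernel in kernels/session_ops.cc added by this change, and the device-name accessor is an assumption):

    // Hypothetical kernel body: stash this op's input in the per-run store.
    void Compute(OpKernelContext* ctx) override {
      TensorStore::TensorAndKey tk;
      tk.tensor = ctx->input(0);
      tk.id = ctx->session_state()->GetNewId();  // unique id in the session
      tk.device_name = def().device();           // assumed source of the name
      OP_REQUIRES_OK(ctx, ctx->tensor_store()->AddTensor(name(), tk));
      // The kernel then emits tk.GetHandle(name()) as a DT_STRING scalar,
      // i.e. "<op_name>;<id>;<device_name>". DirectSession::Run later calls
      // TensorStore::SaveTensors, which persists the entry into the
      // long-lived SessionState only if the op is among the fetches.
    }

diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
index 3f2fb3f3a0d..2ac3bd77375 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
@@ -103,10 +103,14 @@ Status GrpcServer::Init() {
     return errors::InvalidArgument("Task ", server_def_.task_index(),
                                    " was not defined in job \"",
                                    server_def_.job_name(), "\"");
-  } else if (!strings::safe_strto32(str_util::Split(iter->second, ':')[1],
-                                    &requested_port_)) {
-    return errors::Internal("Could not parse port for local server from \"",
-                            iter->second, "\"");
+  }
+  const std::vector<string> hostname_port =
+      str_util::Split(iter->second, ':');
+  if (hostname_port.size() != 2 ||
+      !strings::safe_strto32(hostname_port[1], &requested_port_)) {
+    return errors::InvalidArgument(
+        "Could not parse port for local server from \"", iter->second,
+        "\"");
   } else {
     break;
   }
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
index 876677ce542..b56421970fa 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
@@ -89,12 +89,12 @@ class GrpcServer : public ServerInterface {
   // Implementation of a TensorFlow master, and RPC polling thread.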
   MasterEnv master_env_;
-  AsyncServiceInterface* master_service_;
+  AsyncServiceInterface* master_service_ = nullptr;
   std::unique_ptr<Thread> master_thread_ GUARDED_BY(mu_);
 
   // Implementation of a TensorFlow worker, and RPC polling thread.
   WorkerEnv worker_env_;
-  AsyncServiceInterface* worker_service_;
+  AsyncServiceInterface* worker_service_ = nullptr;
   std::unique_ptr<Thread> worker_thread_ GUARDED_BY(mu_);
 
   std::unique_ptr<::grpc::Server> server_ GUARDED_BY(mu_);
diff --git a/tensorflow/core/framework/function.cc b/tensorflow/core/framework/function.cc
index 14ea365f54d..4c289c6469d 100644
--- a/tensorflow/core/framework/function.cc
+++ b/tensorflow/core/framework/function.cc
@@ -678,8 +678,8 @@ Status FunctionCallFrame::GetRetvals(std::vector<Tensor>* rets) const {
 
 Status FunctionCallFrame::GetArg(int index, Tensor* val) const {
   if (index < 0 || static_cast<size_t>(index) >= args_.size()) {
-    return errors::OutOfRange("GetArg ", index, " is not within [0, ",
-                              args_.size(), ")");
+    return errors::InvalidArgument("GetArg ", index, " is not within [0, ",
+                                   args_.size(), ")");
   }
   *val = args_[index];
   return Status::OK();
@@ -687,8 +687,8 @@ Status FunctionCallFrame::SetRetval(int index, const Tensor& val) {
   if (index < 0 || static_cast<size_t>(index) >= rets_.size()) {
-    return errors::OutOfRange("SetRetval ", index, " is not within [0, ",
-                              rets_.size(), ")");
+    return errors::InvalidArgument("SetRetval ", index, " is not within [0, ",
+                                   rets_.size(), ")");
   }
   if (val.dtype() != ret_types_[index]) {
     return errors::InvalidArgument(
diff --git a/tensorflow/core/framework/function_test.cc b/tensorflow/core/framework/function_test.cc
index 8489c94a3c2..9d084837444 100644
--- a/tensorflow/core/framework/function_test.cc
+++ b/tensorflow/core/framework/function_test.cc
@@ -563,8 +563,8 @@ TEST(FunctionCallFrame, Void_Void) {
   auto a = test::AsTensor<float>({100});
   HasError(frame.SetArgs({a}), "Invalid argument");
   Tensor v;
-  HasError(frame.GetArg(0, &v), "Out of range");
-  HasError(frame.SetRetval(0, v), "Out of range");
+  HasError(frame.GetArg(0, &v), "Invalid argument");
+  HasError(frame.SetRetval(0, v), "Invalid argument");
   std::vector<Tensor> rets;
   TF_EXPECT_OK(frame.GetRetvals(&rets));
   EXPECT_EQ(rets.size(), 0);
@@ -581,16 +581,16 @@ TEST(FunctionCallFrame, Float_Float_Float) {
   TF_EXPECT_OK(frame.SetArgs({a, b}));
 
   Tensor v;
-  HasError(frame.GetArg(-1, &v), "Out of range");
-  HasError(frame.GetArg(2, &v), "Out of range");
+  HasError(frame.GetArg(-1, &v), "Invalid argument");
+  HasError(frame.GetArg(2, &v), "Invalid argument");
   TF_EXPECT_OK(frame.GetArg(0, &v));
   test::ExpectTensorEqual<float>(a, v);
   TF_EXPECT_OK(frame.GetArg(1, &v));
   test::ExpectTensorEqual<float>(b, v);
 
   v = test::AsTensor<float>({-100});
-  HasError(frame.SetRetval(-1, v), "Out of range");
-  HasError(frame.SetRetval(1, v), "Out of range");
+  HasError(frame.SetRetval(-1, v), "Invalid argument");
+  HasError(frame.SetRetval(1, v), "Invalid argument");
   HasError(frame.SetRetval(0, test::AsTensor<int64>({-100})),
            "Invalid argument: Expects ret[0] to be float");
diff --git a/tensorflow/core/framework/numeric_op.h b/tensorflow/core/framework/numeric_op.h
index 6690ed3f622..ccfe2f0d921 100644
--- a/tensorflow/core/framework/numeric_op.h
+++ b/tensorflow/core/framework/numeric_op.h
@@ -99,7 +99,7 @@ class BinaryElementWiseOp : public BinaryOp<T> {
 #undef NDIM_CASE
 
       default:
-        context->SetStatus(errors::OutOfRange(
+        context->SetStatus(errors::InvalidArgument(
             "We only handle Tensor::dims() up to 8, not ", a.dims()));
         break;
     }
diff --git
a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h index 61d15edf7b1..ca017aa1899 100644 --- a/tensorflow/core/framework/op_kernel.h +++ b/tensorflow/core/framework/op_kernel.h @@ -31,6 +31,7 @@ limitations under the License. #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/rendezvous.h" #include "tensorflow/core/framework/selective_registration.h" +#include "tensorflow/core/framework/session_state.h" #include "tensorflow/core/framework/step_stats.pb.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" @@ -502,6 +503,12 @@ class OpKernelContext { // computations running on other devices. Rendezvous* rendezvous = nullptr; + // The session state for this op. + SessionState* session_state = nullptr; + + // The tensor store for this op. + TensorStore* tensor_store = nullptr; + // Mechanism used by this op kernel invocation to register a callback // for its cancellation. CancellationManager* cancellation_manager = nullptr; @@ -841,6 +848,12 @@ class OpKernelContext { // Rendezvous Send() and Recv(). Rendezvous* rendezvous() const { return params_->rendezvous; } + // An op kernel can access the session state it belongs to. + SessionState* session_state() const { return params_->session_state; } + + // An op kernel can access the tensor store of the run it belongs to. + TensorStore* tensor_store() const { return params_->tensor_store; } + // Function call support. // // If this kernel invocation is within a function execution, @@ -1031,15 +1044,16 @@ typedef ::tensorflow::KernelDefBuilder Name; #define REGISTER_KERNEL_BUILDER_UNIQ_HELPER(ctr, kernel_builder, ...) \ REGISTER_KERNEL_BUILDER_UNIQ(ctr, kernel_builder, __VA_ARGS__) -#define REGISTER_KERNEL_BUILDER_UNIQ(ctr, kernel_builder, ...) \ - static ::tensorflow::kernel_factory::OpKernelRegistrar \ - registrar__body__##ctr##__object( \ - SHOULD_REGISTER_OP_KERNEL(#__VA_ARGS__) \ - ? ::tensorflow::register_kernel::kernel_builder.Build() \ - : nullptr, \ - #__VA_ARGS__, \ - [](::tensorflow::OpKernelConstruction* context) \ - -> ::tensorflow::OpKernel* { return new __VA_ARGS__(context); }) +#define REGISTER_KERNEL_BUILDER_UNIQ(ctr, kernel_builder, ...) \ + static ::tensorflow::kernel_factory::OpKernelRegistrar \ + registrar__body__##ctr##__object( \ + SHOULD_REGISTER_OP_KERNEL(#__VA_ARGS__) \ + ? 
::tensorflow::register_kernel::kernel_builder.Build() \ + : nullptr, \ + #__VA_ARGS__, [](::tensorflow::OpKernelConstruction* context) \ + -> ::tensorflow::OpKernel* { \ + return new __VA_ARGS__(context); \ + }); void* GlobalKernelRegistry(); diff --git a/tensorflow/core/framework/op_kernel_test.cc b/tensorflow/core/framework/op_kernel_test.cc index 6d3cfb0c920..06de5d53040 100644 --- a/tensorflow/core/framework/op_kernel_test.cc +++ b/tensorflow/core/framework/op_kernel_test.cc @@ -120,8 +120,8 @@ class OpKernelTest : public ::testing::Test { void ExpectEqual(const string& what, const DataTypeVector& expected, const DataTypeVector& observed) { EXPECT_EQ(expected.size(), observed.size()) << what; - const int size = std::min(expected.size(), observed.size()); - for (int i = 0; i < size; ++i) { + const size_t size = std::min(expected.size(), observed.size()); + for (size_t i = 0; i < size; ++i) { bool match = TypesCompatible(expected[i], observed[i]); EXPECT_TRUE(match) << what << " i:" << i << ", expected: " << expected[i] << ", observed: " << observed[i]; diff --git a/tensorflow/core/framework/register_types.h b/tensorflow/core/framework/register_types.h index 634852b2ba9..99e84eda31c 100644 --- a/tensorflow/core/framework/register_types.h +++ b/tensorflow/core/framework/register_types.h @@ -47,65 +47,42 @@ limitations under the License. // Call "m" for all number types that support the comparison operations "<" and // ">". #define TF_CALL_REAL_NUMBER_TYPES(m) \ - m(float); \ - m(double); \ - m(int64); \ - m(int32); \ - m(uint8); \ - m(int16); \ - m(int8) + m(float) m(double) m(int64) m(int32) m(uint8) m(int16) m(int8) #define TF_CALL_REAL_NUMBER_TYPES_NO_INT32(m) \ - m(float); \ - m(double); \ - m(int64); \ - m(uint8); \ - m(int16); \ - m(int8) + m(float) m(double) m(int64) m(uint8) m(int16) m(int8) // Call "m" for all number types, including complex64 and complex128. #define TF_CALL_NUMBER_TYPES(m) \ - TF_CALL_REAL_NUMBER_TYPES(m); \ - m(complex64); \ - m(complex128) + TF_CALL_REAL_NUMBER_TYPES(m) \ + m(complex64) m(complex128) #define TF_CALL_NUMBER_TYPES_NO_INT32(m) \ - TF_CALL_REAL_NUMBER_TYPES_NO_INT32(m); \ - m(complex64); \ - m(complex128) + TF_CALL_REAL_NUMBER_TYPES_NO_INT32(m) \ + m(complex64) m(complex128) #define TF_CALL_POD_TYPES(m) \ - TF_CALL_NUMBER_TYPES(m); \ + TF_CALL_NUMBER_TYPES(m) \ m(bool) // Call "m" on all types. #define TF_CALL_ALL_TYPES(m) \ - TF_CALL_POD_TYPES(m); \ + TF_CALL_POD_TYPES(m) \ m(string) // Call "m" on all types supported on GPU. -#define TF_CALL_GPU_NUMBER_TYPES(m) \ - m(float); \ - m(double) +#define TF_CALL_GPU_NUMBER_TYPES(m) m(float) m(double) // Call "m" on all quantized types. -#define TF_CALL_QUANTIZED_TYPES(m) \ - m(qint8); \ - m(quint8); \ - m(qint32) +#define TF_CALL_QUANTIZED_TYPES(m) m(qint8) m(quint8) m(qint32) #elif defined(__ANDROID_TYPES_FULL__) -#define TF_CALL_REAL_NUMBER_TYPES(m) \ - m(float); \ - m(int32); \ - m(int64) +#define TF_CALL_REAL_NUMBER_TYPES(m) m(float) m(int32) m(int64) #define TF_CALL_NUMBER_TYPES(m) TF_CALL_REAL_NUMBER_TYPES(m) -#define TF_CALL_REAL_NUMBER_TYPES_NO_INT32(m) \ - m(float); \ - m(int64) +#define TF_CALL_REAL_NUMBER_TYPES_NO_INT32(m) m(float) m(int64) #define TF_CALL_NUMBER_TYPES_NO_INT32(m) TF_CALL_REAL_NUMBER_TYPES_NO_INT32(m) @@ -117,16 +94,11 @@ limitations under the License. #define TF_CALL_GPU_NUMBER_TYPES(m) m(float) // Call "m" on all quantized types. 
-#define TF_CALL_QUANTIZED_TYPES(m) \
-  m(qint8);                        \
-  m(quint8);                       \
-  m(qint32)
+#define TF_CALL_QUANTIZED_TYPES(m) m(qint8) m(quint8) m(qint32)
 
 #else  // defined(__ANDROID__) && !defined(__ANDROID_TYPES_FULL__)
 
-#define TF_CALL_REAL_NUMBER_TYPES(m) \
-  m(float);                          \
-  m(int32)
+#define TF_CALL_REAL_NUMBER_TYPES(m) m(float) m(int32)
 
 #define TF_CALL_NUMBER_TYPES(m) TF_CALL_REAL_NUMBER_TYPES(m)
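One consequence of dropping the per-entry semicolons, worth noting: each m(T) entry now expands with no trailing ';', so the macro argument must supply its own. That is what lets these lists drive constructs such as explicit template instantiations, as constant_op_gpu.cu.cc does later in this diff. A minimal illustration (Foo is a stand-in, not from this patch):

    template <typename T> struct Foo {};

    // The ';' now lives inside the callback macro...
    #define INSTANTIATE_FOO(T) template struct Foo<T>;

    // ...so this expands to two well-formed explicit instantiations:
    //   template struct Foo<float>; template struct Foo<double>;
    TF_CALL_GPU_NUMBER_TYPES(INSTANTIATE_FOO)

diff --git a/tensorflow/core/framework/session_state.h b/tensorflow/core/framework/session_state.h
new file mode 100644
index 00000000000..0093e91f9b8
--- /dev/null
+++ b/tensorflow/core/framework/session_state.h
@@ -0,0 +1,85 @@
+/* Copyright 2015 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_FRAMEWORK_SESSION_STATE_H_
+#define TENSORFLOW_FRAMEWORK_SESSION_STATE_H_
+
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+
+// The session state remembers the tensors we choose to keep across
+// multiple run calls.
+class SessionState {
+ public:
+  // Get a tensor from the session state.
+  Status GetTensor(const string& handle, Tensor* tensor);
+
+  // Store a tensor in the session state.
+  Status AddTensor(const string& handle, const Tensor& tensor);
+
+  // Delete a tensor from the session state.
+  Status DeleteTensor(const string& handle);
+
+  int64 GetNewId();
+
+ private:
+  mutex state_lock_;
+
+  // For generating unique ids for tensors stored in the session.
+  int64 tensor_id_ = 0;
+
+  // The live tensors in the session. A map from tensor handle to tensor.
+  std::unordered_map<string, Tensor> tensors_;
+};
+
+// The tensor store remembers the tensors we choose to keep for the
+// current run call. It is available to every op kernel.
+class TensorStore {
+ public:
+  struct TensorAndKey {
+    Tensor tensor;
+    int64 id;
+    string device_name;
+
+    string GetHandle(const string& tensor_name) {
+      return strings::StrCat(tensor_name, ";", id, ";", device_name);
+    }
+  };
+
+  // Add the named tensor to the tensor store for this run.
+  Status AddTensor(const string& name, const TensorAndKey& tk);
+
+  // Save the tensors in the tensor store of this run to the session.
+  Status SaveTensors(const std::vector<string>& output_names,
+                     SessionState* session_state);
+
+ private:
+  mutex lock_;
+
+  // The tensors that will be saved to session state when this run completes.
+  // A map from tensor string name to tensor.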
+  std::unordered_map<string, TensorAndKey> tensors_ GUARDED_BY(lock_);
+};
+
+} // namespace tensorflow
+
+#endif  // TENSORFLOW_FRAMEWORK_SESSION_STATE_H_
diff --git a/tensorflow/core/framework/tensor_shape.cc b/tensorflow/core/framework/tensor_shape.cc
index ee59a79d38b..ae7c34bd93d 100644
--- a/tensorflow/core/framework/tensor_shape.cc
+++ b/tensorflow/core/framework/tensor_shape.cc
@@ -44,6 +44,7 @@ void TensorShape::CheckDimsAtLeast(int NDIMS) const {
 bool TensorShape::IsValid(const TensorShapeProto& proto) {
   int64 num_elements = 1;
+  if (proto.dim().size() > MaxDimensions()) return false;
   for (const auto& d : proto.dim()) {
     if (d.size() < 0) return false;
     num_elements *= d.size();
@@ -54,6 +55,10 @@ Status TensorShape::IsValidShape(const TensorShapeProto& proto) {
   int64 num_elements = 1;
+  if (proto.dim().size() > MaxDimensions()) {
+    return errors::InvalidArgument("Shape ", DebugString(proto),
+                                   " has too many dimensions");
+  }
   for (const auto& d : proto.dim()) {
     if (d.size() < 0) {
       return errors::InvalidArgument("Shape ", DebugString(proto),
@@ -165,7 +170,7 @@ void TensorShape::RecomputeNumElements() {
 void TensorShape::AddDim(int64 size) {
   CHECK_GE(size, 0);
   const int nd = ndims_byte();
-  CHECK_LT(nd, 255) << "Too many dimensions in tensor";
+  CHECK_LT(nd, MaxDimensions()) << "Too many dimensions in tensor";
   if (tag() == REP16 && nd < 6 && size < kMaxRep16) {
     as16()->dims_[nd] = static_cast<uint16>(size);
   } else if (tag() == REP32 && nd < 3 && size < kMaxRep32) {
@@ -214,6 +219,7 @@ void TensorShape::InsertDim(int d, int64 size) {
   CHECK_GE(d, 0);
   CHECK_LE(d, dims());
   CHECK_GE(size, 0);
+  CHECK_LT(dims(), MaxDimensions());
   gtl::InlinedVector<int64, 8> vals;
   AppendTo(*this, &vals);
   vals.insert(vals.begin() + d, size);
@@ -341,6 +347,9 @@ bool TensorShapeUtils::StartsWith(const TensorShape& shape,
 template <class T>
 static inline Status MakeShapeHelper(const T* dims, int n, TensorShape* out) {
   *out = TensorShape();
+  if (n > TensorShape::MaxDimensions()) {
+    return errors::InvalidArgument("Too many dimensions");
+  }
   for (int i = 0; i < n; ++i) {
     const T dim = internal::SubtleMustCopy(dims[i]);
     if (dim >= 0) {
diff --git a/tensorflow/core/framework/tensor_shape.h b/tensorflow/core/framework/tensor_shape.h
index e341ceddfbe..84947e308a7 100644
--- a/tensorflow/core/framework/tensor_shape.h
+++ b/tensorflow/core/framework/tensor_shape.h
@@ -71,6 +71,9 @@ class TensorShape {
   /// Appends all the dimensions from `shape`.
   void AppendShape(const TensorShape& shape);
 
+  // Maximum number of dimensions in a tensor.
+  static constexpr int MaxDimensions() { return 255; }
+
   /// \brief Insert a dimension somewhere in the `TensorShape`.
   /// REQUIRES: `0 <= d <= dims()`
   /// REQUIRES: `size >= 0`
@@ -277,6 +280,7 @@ template <int NDIMS>
 Eigen::DSizes<Eigen::DenseIndex, NDIMS> TensorShape::AsEigenDSizesWithPadding() const {
   CheckDimsAtLeast(NDIMS);
+  static_assert(NDIMS <= TensorShape::MaxDimensions(), "Too many dimensions");
   Eigen::DSizes<Eigen::DenseIndex, NDIMS> dsizes;
   for (int d = 0; d < dims(); d++) {
     dsizes[d] = dim_size(d);
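The new MaxDimensions() cap also surfaces through TensorShapeUtils::MakeShape, which now fails cleanly instead of tripping the CHECK in AddDim later. A small sketch of the resulting behavior (assumes only the APIs shown above):

    // 300 requested dimensions exceed the 255-dim cap, so MakeShape
    // reports InvalidArgument rather than building the shape.
    std::vector<int32> dims(300, 1);
    TensorShape shape;
    Status s = TensorShapeUtils::MakeShape(
        dims.data(), static_cast<int>(dims.size()), &shape);
    CHECK(errors::IsInvalidArgument(s));

diff --git a/tensorflow/core/framework/tensor_shape_test.cc b/tensorflow/core/framework/tensor_shape_test.cc
index f47d2f9ac37..5eeaeb61dae 100644
--- a/tensorflow/core/framework/tensor_shape_test.cc
+++ b/tensorflow/core/framework/tensor_shape_test.cc
@@ -15,6 +15,7 @@ limitations under the License.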
#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/lib/random/simple_philox.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" @@ -87,6 +88,21 @@ TEST(TensorShapeTest, InvalidShapeProto) { EXPECT_FALSE(TensorShape::IsValid(proto)); } +TEST(TensorShapeTest, TooManyDimsProto) { + TensorShapeProto proto; + // Deliberate redundancy to ensure that both paths work. + EXPECT_TRUE(TensorShape::IsValid(proto)); + TF_EXPECT_OK(TensorShape::IsValidShape(proto)); + for (int i = 0; i < TensorShape::MaxDimensions(); i++) { + proto.add_dim()->set_size(1); + } + EXPECT_TRUE(TensorShape::IsValid(proto)); + TF_EXPECT_OK(TensorShape::IsValidShape(proto)); + proto.add_dim()->set_size(1); + EXPECT_FALSE(TensorShape::IsValid(proto)); + EXPECT_FALSE(TensorShape::IsValidShape(proto).ok()); +} + TEST(TensorShapeTest, SetDimForEmptyTensor) { TensorShape s({10, 5, 20}); EXPECT_EQ(1000, s.num_elements()); diff --git a/tensorflow/core/graph/graph.cc b/tensorflow/core/graph/graph.cc index 57c5b2b2009..80eaed56a9d 100644 --- a/tensorflow/core/graph/graph.cc +++ b/tensorflow/core/graph/graph.cc @@ -95,6 +95,9 @@ void Node::Initialize(int id, int cost_id, Properties* props) { SET_CLASS(NC_CONSTANT, ts, "Const", "HostConst"); SET_CLASS(NC_VARIABLE, ts, "Variable", ""); SET_CLASS(NC_IDENTITY, ts, "Identity", "RefIdentity"); + SET_CLASS(NC_GET_SESSION_HANDLE, ts, "GetSessionHandle", ""); + SET_CLASS(NC_GET_SESSION_TENSOR, ts, "GetSessionTensor", ""); + SET_CLASS(NC_DELETE_SESSION_TENSOR, ts, "DeleteSessionTensor", ""); if (class_ == NC_UNINITIALIZED) { class_ = NC_OTHER; // Catch all } diff --git a/tensorflow/core/graph/graph.h b/tensorflow/core/graph/graph.h index 4ad2a306b23..23aa211c846 100644 --- a/tensorflow/core/graph/graph.h +++ b/tensorflow/core/graph/graph.h @@ -118,6 +118,11 @@ class Node { bool IsConstant() const { return (class_ == NC_CONSTANT); } bool IsVariable() const { return (class_ == NC_VARIABLE); } bool IsIdentity() const { return (class_ == NC_IDENTITY); } + bool IsGetSessionHandle() const { return (class_ == NC_GET_SESSION_HANDLE); } + bool IsGetSessionTensor() const { return (class_ == NC_GET_SESSION_TENSOR); } + bool IsDeleteSessionTensor() const { + return (class_ == NC_DELETE_SESSION_TENSOR); + } bool IsControlFlow() const { return (class_ != NC_OTHER) && // Fast path (IsSwitch() || IsMerge() || IsEnter() || IsExit() || @@ -172,6 +177,9 @@ class Node { NC_CONSTANT, NC_VARIABLE, NC_IDENTITY, + NC_GET_SESSION_HANDLE, + NC_GET_SESSION_TENSOR, + NC_DELETE_SESSION_TENSOR, NC_OTHER // Not a special kind of node }; diff --git a/tensorflow/core/graph/testlib.cc b/tensorflow/core/graph/testlib.cc index f3164009fcb..e5267c45755 100644 --- a/tensorflow/core/graph/testlib.cc +++ b/tensorflow/core/graph/testlib.cc @@ -360,6 +360,15 @@ Node* Gather(Graph* g, Node* in0, Node* in1) { return ret; } +Node* GetSessionTensor(Graph* g, Node* in) { + Node* ret; + TF_CHECK_OK(NodeBuilder(g->NewName("n"), "GetSessionTensor") + .Input(in, 0) + .Attr("dtype", DT_FLOAT) + .Finalize(g, &ret)); + return ret; +} + void ToGraphDef(Graph* g, GraphDef* gdef) { g->ToGraphDef(gdef); } } // end namespace graph diff --git a/tensorflow/core/graph/testlib.h b/tensorflow/core/graph/testlib.h index cb6f2468f2a..f61265d6f41 100644 --- a/tensorflow/core/graph/testlib.h +++ b/tensorflow/core/graph/testlib.h @@ -161,6 +161,9 @@ Node* Gather(Graph* g, Node* in0, Node* in1); // Computes the args needed broadcast 
gradient function. Node* BroadcastGradientArgs(Graph* g, Node* s0, Node* s1); +// Gets a tensor stored in the session state. +Node* GetSessionTensor(Graph* g, Node* in); + } // end namespace graph } // end namespace test } // end namespace tensorflow diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 34cd927e22b..d75db3f381d 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -173,6 +173,7 @@ cc_library( srcs = ["save_restore_tensor.cc"], hdrs = ["save_restore_tensor.h"], deps = [ + ":bounds_check", "//tensorflow/core:framework", "//tensorflow/core:lib", ], @@ -261,6 +262,7 @@ tf_kernel_libraries( "concat_op", "constant_op", "diag_op", + "batch_matrix_diag_op", "edit_distance_op", "gather_nd_op", "gather_op", @@ -337,6 +339,23 @@ tf_cc_test( ], ) +tf_cc_test( + name = "example_parsing_ops_test", + size = "small", + deps = [ + ":example_parsing_ops", + ":ops_testutil", + ":ops_util", + "//tensorflow/core:core_cpu", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + ], +) + tf_cuda_cc_test( name = "gather_op_test", size = "small", @@ -523,6 +542,7 @@ tf_kernel_libraries( "padding_fifo_queue_op", "queue_ops", "random_shuffle_queue_op", + "session_ops", "stack_ops", "tensor_array_ops", ], @@ -593,14 +613,16 @@ cc_library( ], ) -cc_library( +tf_kernel_library( name = "tensor_array", srcs = ["tensor_array.cc"], hdrs = ["tensor_array.h"], visibility = ["//visibility:private"], deps = [ + ":aggregate_ops", "//tensorflow/core:framework", "//tensorflow/core:lib", + "//third_party/eigen3", ], ) @@ -1282,6 +1304,7 @@ tf_kernel_libraries( name = "string", prefixes = [ "string_to_hash_bucket_op", + "reduce_join_op", ], deps = [ "//tensorflow/core:framework", @@ -1497,6 +1520,7 @@ filegroup( "restore_op.cc", "save_op.cc", "save_restore_tensor.cc", + "session_ops.cc", "softplus_op.cc", "softsign_op.cc", "sparse_to_dense_op.cc", diff --git a/tensorflow/core/kernels/batch_matrix_diag_op.cc b/tensorflow/core/kernels/batch_matrix_diag_op.cc new file mode 100644 index 00000000000..3c696c173dd --- /dev/null +++ b/tensorflow/core/kernels/batch_matrix_diag_op.cc @@ -0,0 +1,232 @@ +/* Copyright 2015 Google Inc. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// See docs in ../ops/array_ops.cc. 
+
+#define EIGEN_USE_THREADS
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+#endif  // GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/batch_matrix_diag_op.h"
+
+#include
+#include
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/macros.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename T>
+inline typename TTypes<T, 3>::ConstTensor flat_inner_dims_matrix(
+    const Tensor& t) {
+  int64 last_size = t.dims() > 1 ? t.dim_size(t.dims() - 1) : 1;
+  int64 but_last_size = t.dims() > 1 ? t.dim_size(t.dims() - 2) : 1;
+  if (last_size * but_last_size == 0) {
+    DCHECK_EQ(t.NumElements(), 0);
+    // Return something empty, avoiding divide by 0
+    return t.shaped<T, 3>({0, 0, 0});
+  } else {
+    return t.shaped<T, 3>({t.NumElements() / (but_last_size * last_size),
+                           but_last_size, last_size});
+  }
+}
+
+template <typename T>
+inline typename TTypes<T, 3>::Tensor flat_inner_dims_matrix(Tensor* t) {
+  int64 last_size = t->dims() > 1 ? t->dim_size(t->dims() - 1) : 1;
+  int64 but_last_size = t->dims() > 1 ? t->dim_size(t->dims() - 2) : 1;
+  if (last_size * but_last_size == 0) {
+    DCHECK_EQ(t->NumElements(), 0);
+    // Return something empty, avoiding divide by 0
+    return t->shaped<T, 3>({0, 0, 0});
+  } else {
+    return t->shaped<T, 3>({t->NumElements() / (but_last_size * last_size),
+                            but_last_size, last_size});
+  }
+}
+
+template <typename Device, typename T>
+class BatchMatrixDiagPartOp : public OpKernel {
+ public:
+  explicit BatchMatrixDiagPartOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& input = context->input(0);
+
+    const TensorShape& input_shape = input.shape();
+    const int rank = input_shape.dims();
+
+    // Preliminary validation of sizes.
+    OP_REQUIRES(context, TensorShapeUtils::IsMatrixOrHigher(input_shape),
+                errors::InvalidArgument(
+                    "input must be at least 2-dim, received shape: ",
+                    input.shape().DebugString()));
+
+    // Check to make sure the last two dimensions have the same value
+    const int64 k = input_shape.dim_size(rank - 1);
+    OP_REQUIRES(
+        context, k == input_shape.dim_size(rank - 2),
+        errors::InvalidArgument(
+            "input's last two dimensions must be equal, received shape: ",
+            input.shape().DebugString()));
+
+    auto input_reshaped = flat_inner_dims_matrix<T>(input);
+
+    TensorShape output_shape = input_shape;
+    output_shape.RemoveDim(rank - 1);
+
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, output_shape, &output));
+
+    auto output_reshaped = output->flat_inner_dims<T>();
+
+    functor::BatchMatrixDiagPart<Device, T>::Compute(
+        context->eigen_device<Device>(), input_reshaped, output_reshaped);
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(BatchMatrixDiagPartOp);
+};
+
+template <typename Device, typename T>
+class BatchMatrixDiagOp : public OpKernel {
+ public:
+  explicit BatchMatrixDiagOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& input = context->input(0);
+
+    const TensorShape& input_shape = input.shape();
+    const int rank = input_shape.dims();
+
+    // Preliminary validation of sizes.
+    OP_REQUIRES(context, TensorShapeUtils::IsVectorOrHigher(input_shape),
+                errors::InvalidArgument(
+                    "input must be at least 1-dim, received shape: ",
+                    input.shape().DebugString()));
+
+    // The last dimension of the input is the length of the diagonal.
+    const int64 k = input_shape.dim_size(rank - 1);
+    auto input_reshaped = input.flat_inner_dims<T>();
+
+    TensorShape output_shape = input_shape;
+    output_shape.AddDim(k);
+
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, output_shape, &output));
+
+    auto output_reshaped = flat_inner_dims_matrix<T>(output);
+
+    functor::BatchMatrixDiag<Device, T>::Compute(
+        context->eigen_device<Device>(), input_reshaped, output_reshaped);
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(BatchMatrixDiagOp);
+};
+
+#define REGISTER_BATCH_MATRIX_DIAG(type)                                    \
+  REGISTER_KERNEL_BUILDER(                                                  \
+      Name("BatchMatrixDiag").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+      BatchMatrixDiagOp<CPUDevice, type>);                                  \
+  REGISTER_KERNEL_BUILDER(Name("BatchMatrixDiagPart")                       \
+                              .Device(DEVICE_CPU)                           \
+                              .TypeConstraint<type>("T"),                   \
+                          BatchMatrixDiagPartOp<CPUDevice, type>);
+
+TF_CALL_NUMBER_TYPES(REGISTER_BATCH_MATRIX_DIAG);
+
+// Implementation of the functor specialization for CPU.
+namespace functor {
+template <typename T>
+struct BatchMatrixDiag<CPUDevice, T> {
+  static void Compute(const CPUDevice& d,
+                      typename TTypes<T, 2>::ConstTensor input,
+                      typename TTypes<T, 3>::Tensor output) {
+    output.device(d) = output.constant(T());
+    for (int64 r = 0; r < output.dimension(0); ++r) {
+      for (int64 d = 0; d < output.dimension(1); ++d) {
+        output(r, d, d) = input(r, d);
+      }
+    }
+  }
+};
+
+template <typename T>
+struct BatchMatrixDiagPart<CPUDevice, T> {
+  static void Compute(const CPUDevice& d,
+                      typename TTypes<T, 3>::ConstTensor input,
+                      typename TTypes<T, 2>::Tensor output) {
+    for (int64 r = 0; r < output.dimension(0); ++r) {
+      for (int64 d = 0; d < output.dimension(1); ++d) {
+        output(r, d) = input(r, d, d);
+      }
+    }
+  }
+};
+
+}  // namespace functor
+
+#if GOOGLE_CUDA
+
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T)                                          \
+  template <>                                                        \
+  void BatchMatrixDiag<GPUDevice, T>::Compute(                       \
+      const GPUDevice& d, typename TTypes<T, 2>::ConstTensor input,  \
+      typename TTypes<T, 3>::Tensor output);                         \
+  extern template struct BatchMatrixDiag<GPUDevice, T>;              \
+  template <>                                                        \
+  void BatchMatrixDiagPart<GPUDevice, T>::Compute(                   \
+      const GPUDevice& d, typename TTypes<T, 3>::ConstTensor input,  \
+      typename TTypes<T, 2>::Tensor output);                         \
+  extern template struct BatchMatrixDiagPart<GPUDevice, T>;
+
+TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
+
+}  // namespace functor
+
+// Registration of the GPU implementations.
+#define REGISTER_BATCH_MATRIX_DIAG_GPU(type)                                \
+  REGISTER_KERNEL_BUILDER(                                                  \
+      Name("BatchMatrixDiag").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
+      BatchMatrixDiagOp<GPUDevice, type>);                                  \
+  REGISTER_KERNEL_BUILDER(Name("BatchMatrixDiagPart")                       \
+                              .Device(DEVICE_GPU)                           \
+                              .TypeConstraint<type>("T"),                   \
+                          BatchMatrixDiagPartOp<GPUDevice, type>);
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_BATCH_MATRIX_DIAG_GPU);
+
+#undef REGISTER_BATCH_MATRIX_DIAG_GPU
+
+#endif  // GOOGLE_CUDA
+
+}  // namespace tensorflow
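Before the header, it may help to pin down the semantics the functors above implement with a tiny worked example:

    // BatchMatrixDiag maps a batch of vectors to a batch of square matrices
    // with the vector on the diagonal; BatchMatrixDiagPart is its inverse.
    //
    //   input, shape [1, 3]:          [[1, 2, 3]]
    //   BatchMatrixDiag(input),
    //       shape [1, 3, 3]:          [[[1, 0, 0],
    //                                    [0, 2, 0],
    //                                    [0, 0, 3]]]
    //   BatchMatrixDiagPart(BatchMatrixDiag(input)) == input

diff --git a/tensorflow/core/kernels/batch_matrix_diag_op.h b/tensorflow/core/kernels/batch_matrix_diag_op.h
new file mode 100644
index 00000000000..b745b932460
--- /dev/null
+++ b/tensorflow/core/kernels/batch_matrix_diag_op.h
@@ -0,0 +1,94 @@
+/* Copyright 2015 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.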
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_KERNELS_BATCH_MATRIX_DIAG_OP_H_ +#define TENSORFLOW_KERNELS_BATCH_MATRIX_DIAG_OP_H_ + +// Generator definition for BatchMatrixDiagOp, must be compilable by nvcc. + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +namespace generator { + +template +class BatchMatrixDiagPartGenerator { + public: + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + BatchMatrixDiagPartGenerator(typename TTypes::ConstTensor input) + : input_(input) {} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T + operator()(const Eigen::array& coords) const { + Eigen::array diag_from_coords( + {coords[0], coords[1], coords[1]}); + return input_(diag_from_coords); + } + + private: + typename TTypes::ConstTensor input_; +}; + +template +class BatchMatrixDiagGenerator { + public: + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + BatchMatrixDiagGenerator(typename TTypes::ConstTensor input) + : input_(input) {} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T + operator()(const Eigen::array& coords) const { + if (coords[2] != coords[1]) return T(); + + Eigen::array diag_coords({coords[0], coords[1]}); + return input_(diag_coords); + } + + private: + typename TTypes::ConstTensor input_; +}; + +} // namespace generator + +namespace functor { + +template +struct BatchMatrixDiagPart { + EIGEN_ALWAYS_INLINE static void Compute( + const Device& d, typename TTypes::ConstTensor input, + typename TTypes::Tensor output) { + generator::BatchMatrixDiagPartGenerator generator(input); + output.device(d) = output.generate(generator); + } +}; + +template +struct BatchMatrixDiag { + EIGEN_ALWAYS_INLINE static void Compute( + const Device& d, typename TTypes::ConstTensor input, + typename TTypes::Tensor output) { + generator::BatchMatrixDiagGenerator generator(input); + output.device(d) = output.generate(generator); + } +}; + +} // namespace functor + +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_BATCH_MATRIX_DIAG_OP_H_ diff --git a/tensorflow/core/kernels/batch_matrix_diag_op_gpu.cu.cc b/tensorflow/core/kernels/batch_matrix_diag_op_gpu.cu.cc new file mode 100644 index 00000000000..643d6406f3b --- /dev/null +++ b/tensorflow/core/kernels/batch_matrix_diag_op_gpu.cu.cc @@ -0,0 +1,37 @@ +/* Copyright 2015 Google Inc. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/batch_matrix_diag_op.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +#define DEFINE_GPU_SPEC(T) \ + template class generator::BatchMatrixDiagGenerator; \ + template struct functor::BatchMatrixDiag; \ + template class generator::BatchMatrixDiagPartGenerator; \ + template struct functor::BatchMatrixDiagPart; + +TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPEC); + +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/check_numerics_op.cc b/tensorflow/core/kernels/check_numerics_op.cc index 20e3c76b55c..cc93bf36f43 100644 --- a/tensorflow/core/kernels/check_numerics_op.cc +++ b/tensorflow/core/kernels/check_numerics_op.cc @@ -60,7 +60,7 @@ class CheckNumericsOp : public OpKernel { auto in = context->input(0).flat(); const T* data = in.data(); - const int size = in.size(); + const int64 size = in.size(); // Check to see if any element of the tensor is NaN or Inf. int fp_props = std::accumulate(data, data + size, 0, [](const int& x, const T& y) { diff --git a/tensorflow/core/kernels/constant_op.cc b/tensorflow/core/kernels/constant_op.cc index 22e472deae3..e56ddb31d56 100644 --- a/tensorflow/core/kernels/constant_op.cc +++ b/tensorflow/core/kernels/constant_op.cc @@ -23,8 +23,10 @@ limitations under the License. #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor.pb.h" +#include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/tensor_types.h" #include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/bounds_check.h" #include "tensorflow/core/kernels/fill_functor.h" #include "tensorflow/core/platform/macros.h" @@ -145,6 +147,10 @@ class FillOp : public OpKernel { errors::InvalidArgument("value must be a scalar, got shape ", Tvalue.shape().DebugString())); auto dims = Tdims.flat(); + OP_REQUIRES(context, + FastBoundsCheck(dims.size(), TensorShape::MaxDimensions()), + errors::InvalidArgument("dims must have size < ", + TensorShape::MaxDimensions())); for (int i = 0; i < dims.size(); i++) { OP_REQUIRES(context, dims(i) >= 0, errors::InvalidArgument("dims[", i, "] = ", dims(i), @@ -153,7 +159,7 @@ class FillOp : public OpKernel { TensorShape shape; OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape( reinterpret_cast(dims.data()), - dims.size(), &shape)); + static_cast(dims.size()), &shape)); Tensor* out = nullptr; OP_REQUIRES_OK(context, context->allocate_output(0, shape, &out)); functor::FillFunctor functor; diff --git a/tensorflow/core/kernels/constant_op_gpu.cu.cc b/tensorflow/core/kernels/constant_op_gpu.cu.cc index 49159d069a2..93d5792c16d 100644 --- a/tensorflow/core/kernels/constant_op_gpu.cu.cc +++ b/tensorflow/core/kernels/constant_op_gpu.cu.cc @@ -78,7 +78,7 @@ struct FillFunctor { } }; -#define DEFINE_FILL_GPU(T) template struct FillFunctor +#define DEFINE_FILL_GPU(T) template struct FillFunctor; TF_CALL_REAL_NUMBER_TYPES(DEFINE_FILL_GPU); DEFINE_FILL_GPU(bool); DEFINE_FILL_GPU(Eigen::half); diff --git a/tensorflow/core/kernels/cwise_ops.h b/tensorflow/core/kernels/cwise_ops.h index 681774b0a78..6f6887651dc 100644 --- a/tensorflow/core/kernels/cwise_ops.h +++ b/tensorflow/core/kernels/cwise_ops.h @@ -341,62 +341,6 @@ struct cos : base > {}; struct logical_not : base > { }; 
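
The `DEFINE_FILL_GPU` fix above (and the matching `Gather`, `GatherNd`, `DenseUpdate`, and `EluGrad` macro fixes later in this patch) moves the trailing semicolon inside the macro, so each expansion is a complete explicit-instantiation declaration and `TF_CALL_*` invocations stay well-formed. A minimal standalone sketch of the pattern, with hypothetical names rather than the real TensorFlow functors:

```c++
#include <iostream>

template <typename T>
struct FillFunctor {
  void operator()(T* out, int n, T value) const {
    for (int i = 0; i < n; ++i) out[i] = value;  // fill n elements with value
  }
};

// The semicolon lives inside the macro, so each use expands to a complete
// explicit-instantiation statement at namespace scope.
#define DEFINE_FILL(T) template struct FillFunctor<T>;
DEFINE_FILL(float)
DEFINE_FILL(double)
#undef DEFINE_FILL

int main() {
  float buf[4];
  FillFunctor<float>()(buf, 4, 2.5f);
  std::cout << buf[3] << "\n";  // prints 2.5
}
```
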
-namespace impl { - -#ifndef __CUDACC__ -// Uses STL std cmath functions. -template -bool isinf(T v) { - return std::isinf(v); -} - -template -bool isnan(T v) { - return std::isnan(v); -} - -template -bool isfinite(T v) { - return std::isfinite(v); -} - -template -T floor(T v) { - return std::floor(v); -} - -template -T ceil(T v) { - return std::ceil(v); -} -#else -// Uses CUDA's functions for float and double. -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool isinf(T v) { - return ::isinf(v); -} - -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool isnan(T v) { - return ::isnan(v); -} - -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool isfinite(T v) { - return ::isfinite(v); -} - -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T floor(T v) { - return ::floor(v); -} - -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T ceil(T v) { - return ::ceil(v); -} -#endif -} // end namespace impl // NOTE: std::isinf, std::isnan, std::isfinite are plain function. // Therefore we need to wrap them in functors to be used with Eigen's @@ -406,7 +350,7 @@ template struct isinf_func { typedef bool result_type; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(T x) const { - return impl::isinf(x); + return Eigen::numext::isinf(x); } }; @@ -417,7 +361,7 @@ template struct isnan_func { typedef bool result_type; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(T x) const { - return impl::isnan(x); + return Eigen::numext::isnan(x); } }; @@ -428,7 +372,7 @@ template struct isfinite_func { typedef bool result_type; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(T x) const { - return impl::isfinite(x); + return Eigen::numext::isfinite(x); } }; @@ -439,7 +383,7 @@ template struct floor_func { typedef T result_type; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(T x) const { - return impl::floor(x); + return Eigen::numext::floor(x); } }; @@ -450,7 +394,7 @@ template struct ceil_func { typedef T result_type; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(T x) const { - return impl::ceil(x); + return Eigen::numext::ceil(x); } }; diff --git a/tensorflow/core/kernels/dense_update_ops.cc b/tensorflow/core/kernels/dense_update_ops.cc index 12633a37db2..50611dd8c8d 100644 --- a/tensorflow/core/kernels/dense_update_ops.cc +++ b/tensorflow/core/kernels/dense_update_ops.cc @@ -130,7 +130,7 @@ namespace functor { void DenseUpdate::operator()( \ const GPUDevice& d, typename TTypes::Flat params, \ typename TTypes::ConstFlat update); \ - extern template struct DenseUpdate + extern template struct DenseUpdate; #define DECLARE_GPU_SPEC(T) \ DECLARE_GPU_SPEC_FOR_OP(T, DenseUpdateType::ADD); \ DECLARE_GPU_SPEC_FOR_OP(T, DenseUpdateType::SUB) diff --git a/tensorflow/core/kernels/example_parsing_ops_test.cc b/tensorflow/core/kernels/example_parsing_ops_test.cc new file mode 100644 index 00000000000..892ce4d897c --- /dev/null +++ b/tensorflow/core/kernels/example_parsing_ops_test.cc @@ -0,0 +1,121 @@ +/* Copyright 2015 Google Inc. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include + +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" +#include "tensorflow/core/example/example.pb.h" +#include "tensorflow/core/example/feature.pb.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/graph/node_builder.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/lib/strings/stringprintf.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +typedef std::map, Tensor> ExampleTensorMap; + +struct DenseStringExampleStore { + static ExampleTensorMap GetSerializedExamples() { + ExampleTensorMap examples; + int keys[] = {10, 100, 1000, 10000}; + int batch_sizes[] = {128}; + Example example; + for (int num_keys : keys) { + for (int batch_size : batch_sizes) { + Tensor record_string(DT_STRING, TensorShape({batch_size})); + auto string_t = record_string.vec(); + example.Clear(); + for (int b = 0; b < batch_size; ++b) { + for (int k = 0; k < num_keys; ++k) { + string k_str = strings::Printf("%d", k); + Feature f; + f.mutable_bytes_list()->add_value("abc"); + Features* features = example.mutable_features(); + (*features->mutable_feature())[k_str] = f; + } + CHECK(example.SerializeToString(&string_t(b))); + } + examples[std::make_pair(batch_size, num_keys)] = record_string; + } + } + return examples; + } + static ExampleTensorMap serialized_example; +}; + +ExampleTensorMap DenseStringExampleStore::serialized_example = + DenseStringExampleStore::GetSerializedExamples(); + +static Graph* ParseDenseStringExample(int batch_size, int num_keys) { + Graph* g = new Graph(OpRegistry::Global()); + Tensor& serialized = + DenseStringExampleStore::serialized_example[std::make_pair(batch_size, + num_keys)]; + + Tensor names(DT_STRING, TensorShape({batch_size})); + + std::vector sparse_keys; + std::vector dense_keys; + std::vector dense_defaults; + for (int i = 0; i < num_keys; ++i) { + Tensor dense_key(DT_STRING, TensorShape()); + dense_key.scalar()() = strings::Printf("%d", i); + dense_keys.emplace_back(test::graph::Constant(g, dense_key)); + + Tensor dense_default(DT_STRING, TensorShape()); + dense_defaults.emplace_back(test::graph::Constant(g, dense_default)); + } + + std::vector sparse_types; + std::vector dense_shapes(num_keys, TensorShape()); + + Node* ret; + TF_EXPECT_OK(NodeBuilder(g->NewName("n"), "ParseExample") + .Input(test::graph::Constant(g, serialized)) + .Input(test::graph::Constant(g, names)) + .Input(sparse_keys) + .Input(dense_keys) + .Input(dense_defaults) + .Attr("sparse_types", sparse_types) + .Attr("dense_shapes", dense_shapes) + .Finalize(g, &ret)); + + return g; +} + +// B == batch_size, K == num_keys. 
K must be one of 10, 100, 1000, 10000
+#define BM_ParseDenseStringExample(B, K)                                  \
+  static void BM_ParseDenseStringExample##_##B##_##K(int iters) {         \
+    int64 items_per_iter = static_cast<int64>(B) * K;                     \
+    testing::ItemsProcessed(static_cast<int64>(iters) * items_per_iter);  \
+    test::Benchmark("cpu", ParseDenseStringExample(B, K)).Run(iters);     \
+  }                                                                       \
+  BENCHMARK(BM_ParseDenseStringExample##_##B##_##K);
+
+BM_ParseDenseStringExample(128, 10);
+BM_ParseDenseStringExample(128, 100);
+BM_ParseDenseStringExample(128, 1000);
+BM_ParseDenseStringExample(128, 10000);
+
+}  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/gather_nd_op.cc b/tensorflow/core/kernels/gather_nd_op.cc
index 21e67afbfd3..7957998157e 100644
--- a/tensorflow/core/kernels/gather_nd_op.cc
+++ b/tensorflow/core/kernels/gather_nd_op.cc
@@ -200,7 +200,7 @@ namespace functor {
       const GPUDevice& d, typename TTypes<T, NDIM>::ConstTensor Tparams, \
       typename TTypes<Index>::ConstMatrix Tindices,                      \
       typename TTypes<T>::Flat Tout);                                    \
-  extern template struct GatherNd
+  extern template struct GatherNd<GPUDevice, T, Index, NDIM>;
 
 #define DECLARE_GPU_SPECS_INDEX(T, Index)    \
   DECLARE_GPU_SPECS_INDEX_NDIM(T, Index, 1); \
diff --git a/tensorflow/core/kernels/gather_op.cc b/tensorflow/core/kernels/gather_op.cc
index 5bb73cf9719..82dea187090 100644
--- a/tensorflow/core/kernels/gather_op.cc
+++ b/tensorflow/core/kernels/gather_op.cc
@@ -187,7 +187,7 @@ namespace functor {
       const GPUDevice& d, typename TTypes<T>::ConstMatrix Tparams, \
       typename TTypes<Index>::ConstFlat Tindices,                  \
       typename TTypes<T>::Matrix Tout);                            \
-  extern template struct Gather
+  extern template struct Gather<GPUDevice, T, Index>;
 
 #define DECLARE_GPU_SPECS(T)         \
   DECLARE_GPU_SPECS_INDEX(T, int32); \
diff --git a/tensorflow/core/kernels/listdiff_op.cc b/tensorflow/core/kernels/listdiff_op.cc
index 39c87a4a742..891f7888aba 100644
--- a/tensorflow/core/kernels/listdiff_op.cc
+++ b/tensorflow/core/kernels/listdiff_op.cc
@@ -73,7 +73,7 @@ class ListDiffOp : public OpKernel {
     for (int i = 0, p = 0; i < x_size; ++i) {
       if (y_set.count(Tx(i)) == 0) {
         OP_REQUIRES(context, p < out_size,
-                    errors::OutOfRange(
+                    errors::InvalidArgument(
                         "Tried to set output index ", p,
                         " when output Tensor only had ", out_size,
                         " elements. Check that your "
diff --git a/tensorflow/core/kernels/reduce_join_op.cc b/tensorflow/core/kernels/reduce_join_op.cc
new file mode 100644
index 00000000000..ec53a9a614b
--- /dev/null
+++ b/tensorflow/core/kernels/reduce_join_op.cc
@@ -0,0 +1,202 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/string_ops.cc.
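
The `ListDiff` change above, like the `reduction_ops_common.cc` and `summary_op.cc` changes later in this diff, moves user-input validation from `OutOfRange` to `InvalidArgument`; that matches the clarified wording for `OUT_OF_RANGE` in `error_codes.proto` below, which reserves it for iterating past valid input (e.g. reading past end of file). A rough sketch of the convention, using hypothetical stand-ins rather than the real `errors::` helpers:

```c++
#include <cstdio>

// Hypothetical stand-in for tensorflow::error::Code.
enum class Code { OK, INVALID_ARGUMENT, OUT_OF_RANGE };

// A caller-supplied axis is user input: reject bad values as INVALID_ARGUMENT.
Code CheckReductionAxis(int axis, int dims) {
  if (axis < -dims || axis >= dims) return Code::INVALID_ARGUMENT;
  return Code::OK;
}

// Iterating past the end of valid input is OUT_OF_RANGE (e.g. end of file).
Code ReadNext(int pos, int length) {
  if (pos >= length) return Code::OUT_OF_RANGE;
  return Code::OK;
}

int main() {
  std::printf("%d %d\n", static_cast<int>(CheckReductionAxis(5, 3)),
              static_cast<int>(ReadNext(10, 10)));  // prints 1 2
}
```
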
+
+#include
+
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+
+namespace tensorflow {
+
+namespace {
+
+const gtl::InlinedVector<int64, 8> GetStrides(const TensorShape& shape) {
+  gtl::InlinedVector<int64, 8> result(shape.dims());
+  int64 product = 1;
+  for (int32 i = shape.dims() - 1; i >= 0; --i) {
+    result[i] = product;
+    product *= shape.dim_size(i);
+  }
+  return result;
+}
+
+// Given a linear index to a subset of dimensions, full shape,
+// precomputed list of running products of the full shape, and list of
+// dimensions in the subset, outputs the linear index to the full shape with
+// nonspecified dimensions set to 0.  Dimensions must be ordered from
+// outer-most to inner-most with respect to the subset linear index.
+inline int64 LinearSubIndexToFullIndex(
+    int64 output_index, const gtl::InlinedVector<int32, 8>& dim_list,
+    const TensorShape& input_shape,
+    const gtl::InlinedVector<int64, 8>& strides) {
+  int64 result = 0;
+  int64 quotient = output_index;
+  for (int32 i = dim_list.size() - 1; i >= 0; --i) {
+    int32 dim = dim_list[i];
+    int64 dim_value = quotient % input_shape.dim_size(dim);
+    quotient = quotient / input_shape.dim_size(dim);
+    result += strides[dim] * dim_value;
+  }
+  return result;
+}
+
+// Computes the number of input elements reduced per output element.
+int64 GetReductionIterSize(const gtl::InlinedVector<int32, 8>& reduced_indices,
+                           const TensorShape& input_shape) {
+  int64 result = 1;
+  for (int32 reduce_dim : reduced_indices) {
+    result *= input_shape.dim_size(reduce_dim);
+  }
+  return result;
+}
+
+// Computes a list of all true reduced indices, accounting for negative
+// indices and empty inputs.
+gtl::InlinedVector<int32, 8> GetReducedIndices(const Tensor& reduction_indices,
+                                               int32 input_dims) {
+  const auto reduction_indices_flat = reduction_indices.flat<int32>();
+  const int32 reduction_dims = reduction_indices_flat.size();
+
+  gtl::InlinedVector<int32, 8> reduced_indices(reduction_dims);
+  if (reduction_dims > 0) {
+    for (int32 i = 0; i < reduction_dims; ++i) {
+      reduced_indices[i] = reduction_indices_flat(reduction_dims - i - 1);
+      reduced_indices[i] += reduced_indices[i] < 0 ? input_dims : 0;
+    }
+  } else {
+    for (int32 i = 0; i < input_dims; ++i) {
+      reduced_indices.push_back(i);
+    }
+  }
+
+  return reduced_indices;
+}
+
+// Appends all unreduced dimensions to the given vector.
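
To make the indexing helpers above concrete: for a shape `[2, 3, 4]`, `GetStrides` returns `{12, 4, 1}`, and `LinearSubIndexToFullIndex` walks a linear index over a subset of dimensions back to a full-tensor offset with the remaining dimensions pinned to 0. A minimal sketch, using plain `std::vector` instead of `gtl::InlinedVector` and a hypothetical shape:

```c++
#include <cstdint>
#include <iostream>
#include <vector>

// Row-major strides: the innermost dimension has stride 1.
std::vector<int64_t> GetStrides(const std::vector<int64_t>& shape) {
  std::vector<int64_t> result(shape.size());
  int64_t product = 1;
  for (int i = static_cast<int>(shape.size()) - 1; i >= 0; --i) {
    result[i] = product;
    product *= shape[i];
  }
  return result;
}

// Mirrors the kernel helper: maps a linear index over the dimensions in
// dim_list to a full-tensor offset, all other dimensions set to 0.
int64_t LinearSubIndexToFullIndex(int64_t sub_index,
                                  const std::vector<int>& dim_list,
                                  const std::vector<int64_t>& shape,
                                  const std::vector<int64_t>& strides) {
  int64_t result = 0;
  int64_t quotient = sub_index;
  for (int i = static_cast<int>(dim_list.size()) - 1; i >= 0; --i) {
    int dim = dim_list[i];
    result += strides[dim] * (quotient % shape[dim]);
    quotient /= shape[dim];
  }
  return result;
}

int main() {
  std::vector<int64_t> shape = {2, 3, 4};
  auto strides = GetStrides(shape);  // {12, 4, 1}
  // Reduce dim 1: unreduced dims are {0, 2}, so sub-index 5 = coords (1, 1)
  // maps to full coords (1, 0, 1) = 1*12 + 0*4 + 1*1 = 13.
  std::cout << LinearSubIndexToFullIndex(5, {0, 2}, shape, strides) << "\n";
}
```
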
+void MakeUnreducedIndices(gtl::InlinedVector index_is_reduced, + int32 input_dims, + gtl::InlinedVector* unreduced_indices) { + for (int32 index = 0; index < input_dims; ++index) { + if (!index_is_reduced[index]) unreduced_indices->push_back(index); + } +} + +TensorShape GetOutputShape(gtl::InlinedVector index_is_reduced, + const TensorShape& input_shape, bool keep_dims) { + TensorShape output_shape; + for (int32 index = 0; index < index_is_reduced.size(); ++index) { + if (index_is_reduced[index]) { + if (keep_dims) output_shape.AddDim(1); + } else { + output_shape.AddDim(input_shape.dim_size(index)); + } + } + return output_shape; +} + +} // namespace + +class ReduceJoinOp : public OpKernel { + public: + using OpKernel::OpKernel; + + explicit ReduceJoinOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("keep_dims", &keep_dims_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("separator", &separator_)); + } + + void Compute(OpKernelContext* context) override { + const Tensor& input = context->input(0); + const auto input_flat = input.flat(); + const TensorShape& input_shape = input.shape(); + const int32 input_dims = input_shape.dims(); + OP_REQUIRES(context, TensorShapeUtils::IsVectorOrHigher(input_shape), + errors::InvalidArgument("Input cannot be a scalar.")); + + const Tensor& reduction_indices = context->input(1); + const auto reduction_indices_flat = reduction_indices.flat(); + const int32 reduction_dims = reduction_indices_flat.size(); + + // Empty reduction_indices indicates that all indices are reduced. + gtl::InlinedVector index_is_reduced(input_dims, + reduction_dims == 0); + for (int32 i = 0; i < reduction_dims; i++) { + int32 reduce_index = reduction_indices_flat(i); + const int32 true_reduce_index = + reduce_index < 0 ? 
reduce_index + input_dims : reduce_index; + OP_REQUIRES( + context, reduce_index >= -input_dims && reduce_index < input_dims, + errors::OutOfRange("Invalid reduction dimension ", reduce_index, + " for input with ", input_dims, " dimension(s)")); + OP_REQUIRES(context, input_shape.dim_size(true_reduce_index) > 0, + errors::InvalidArgument("Reduction dimension ", reduce_index, + " has size 0")); + OP_REQUIRES(context, !index_is_reduced[true_reduce_index], + errors::InvalidArgument("Duplicate reduction dimension ", + reduce_index)); + index_is_reduced[true_reduce_index] = true; + } + + gtl::InlinedVector reduced_indices = + GetReducedIndices(reduction_indices, input_dims); + gtl::InlinedVector unreduced_indices; + if (reduction_indices.shape().num_elements() > 0) { + MakeUnreducedIndices(index_is_reduced, input_dims, &unreduced_indices); + } + const auto strides = GetStrides(input_shape); + + Tensor* output_tensor = nullptr; + TensorShape output_shape = + GetOutputShape(index_is_reduced, input_shape, keep_dims_); + OP_REQUIRES_OK(context, context->allocate_output("output", output_shape, + &output_tensor)); + auto output_flat = output_tensor->flat(); + + const int64 reduction_iter_size = + GetReductionIterSize(reduced_indices, input_shape); + gtl::InlinedVector curr_strings(reduction_iter_size); + for (int64 output_index = 0; output_index < output_shape.num_elements(); + ++output_index) { + int64 output_full_index = LinearSubIndexToFullIndex( + output_index, unreduced_indices, input_shape, strides); + for (int64 reduction_index = 0; reduction_index < reduction_iter_size; + ++reduction_index) { + int64 reduction_full_index = LinearSubIndexToFullIndex( + reduction_index, reduced_indices, input_shape, strides); + curr_strings[reduction_index] = + input_flat(output_full_index + reduction_full_index); + } + output_flat(output_index) = + str_util::Join(curr_strings, separator_.c_str()); + } + } + + private: + bool keep_dims_; + string separator_; +}; + +REGISTER_KERNEL_BUILDER(Name("ReduceJoin").Device(DEVICE_CPU), ReduceJoinOp); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/reduction_ops_common.cc b/tensorflow/core/kernels/reduction_ops_common.cc index baba8a58d45..8818f7befb6 100644 --- a/tensorflow/core/kernels/reduction_ops_common.cc +++ b/tensorflow/core/kernels/reduction_ops_common.cc @@ -63,9 +63,9 @@ Status ReductionHelper::Simplify(const Tensor& data, const Tensor& axis, for (int64 i = 0; i < axis.NumElements(); ++i) { const int32 index = axis_vec(i); if (index < 0 || index >= data.dims()) { - return errors::OutOfRange("Invalid reduction dimension (", index, - " for input with ", data.dims(), - " dimension(s)"); + return errors::InvalidArgument("Invalid reduction dimension (", index, + " for input with ", data.dims(), + " dimension(s)"); } bitmap[index] = true; } diff --git a/tensorflow/core/kernels/relu_op_gpu.cu.cc b/tensorflow/core/kernels/relu_op_gpu.cu.cc index 0a12c854b88..be98cf00ac8 100644 --- a/tensorflow/core/kernels/relu_op_gpu.cu.cc +++ b/tensorflow/core/kernels/relu_op_gpu.cu.cc @@ -35,7 +35,7 @@ typedef Eigen::GpuDevice GPUDevice; template struct functor::Relu6; \ template struct functor::Relu6Grad; \ template struct functor::Elu; \ - template struct functor::EluGrad + template struct functor::EluGrad; TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS); diff --git a/tensorflow/core/kernels/save_restore_tensor.cc b/tensorflow/core/kernels/save_restore_tensor.cc index ea65ab14cf4..56c499db8cd 100644 --- a/tensorflow/core/kernels/save_restore_tensor.cc +++ 
b/tensorflow/core/kernels/save_restore_tensor.cc @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/bounds_check.h" #include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" @@ -49,17 +50,7 @@ bool ParseShapeAndSlice(const string& shape_and_slice, TensorShape* shape, shape_and_slice); return false; } - int num_dims = splits.size() - 1; - shape->Clear(); - for (int i = 0; i < num_dims; ++i) { - int dim; - if (!strings::safe_strto32(splits[i], &dim)) { - *error = strings::StrCat("Non numerical dimension in shape_and_slice: ", - shape_and_slice); - return false; - } - shape->AddDim(dim); - } + // The last split is the slice specification. slice->Clear(); auto status = slice->Parse(splits.back(), slice); @@ -67,6 +58,20 @@ bool ParseShapeAndSlice(const string& shape_and_slice, TensorShape* shape, *error = status.error_message(); return false; } + + // The first n-1 are the shape specification. + splits.pop_back(); + shape->Clear(); + for (const auto& s : splits) { + int dim; + if (!strings::safe_strto32(s, &dim)) { + *error = strings::StrCat("Non numerical dimension in shape_and_slice: ", + shape_and_slice); + return false; + } + shape->AddDim(dim); + } + // The specified slice must be compatible with the specified shape. status = slice->SliceTensorShape(*shape, shape_slice); if (!status.ok()) { @@ -91,13 +96,20 @@ void SaveTensors( size, "elements")); } + // Path, names, and slices if save_slices is true. + const int kFixedInputs = save_slices ? 3 : 2; const Tensor& tensor_names_t = context->input(1); - const int64 N = tensor_names_t.NumElements(); + OP_REQUIRES(context, + FastBoundsCheck(tensor_names_t.NumElements() + kFixedInputs, + std::numeric_limits::max()), + errors::InvalidArgument("Too many inputs to SaveTensors")); + const int N = static_cast(tensor_names_t.NumElements()); const string* tensor_shapes_and_slices_ptr = nullptr; if (save_slices) { const Tensor& tensor_shapes_and_slices_t = context->input(2); OP_REQUIRES( - context, tensor_shapes_and_slices_t.NumElements() == N, + context, + tensor_shapes_and_slices_t.NumElements() == static_cast(N), errors::InvalidArgument("Expected ", N, " elements for the tensor " "shapes and slices but got ", @@ -105,8 +117,6 @@ void SaveTensors( tensor_shapes_and_slices_ptr = tensor_shapes_and_slices_t.flat().data(); } - // Path, names, and slices if save_slices is true. - const int kFixedInputs = save_slices ? 
3 : 2; OP_REQUIRES(context, context->num_inputs() == N + kFixedInputs, errors::InvalidArgument("Expected totally ", N + kFixedInputs, " inputs as input #1 (which is a string " @@ -123,7 +133,7 @@ void SaveTensors( auto tensor_names_flat = tensor_names_t.flat(); string error; - for (int64 i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) { const string& name = tensor_names_flat(i); const Tensor& input = context->input(i + kFixedInputs); TensorShape shape(input.shape()); diff --git a/tensorflow/core/kernels/scatter_op.cc b/tensorflow/core/kernels/scatter_op.cc index fd62fd25872..6d6454735ff 100644 --- a/tensorflow/core/kernels/scatter_op.cc +++ b/tensorflow/core/kernels/scatter_op.cc @@ -130,7 +130,7 @@ class ScatterUpdateOp : public OpKernel { "indices has too many elements for ", DataTypeString(DataTypeToEnum::v()), " indexing: ", N_big, " > ", std::numeric_limits::max())); - const Index N = indices.NumElements(); + const Index N = static_cast(indices.NumElements()); OP_REQUIRES( c, params.dim_size(0) <= std::numeric_limits::max(), errors::InvalidArgument("params.shape[0] too large for ", @@ -166,8 +166,9 @@ struct ScatterFunctor { typename TTypes::Matrix params, typename TTypes::ConstMatrix updates, typename TTypes::ConstFlat indices) { - const Index N = indices.size(); - const Index limit = params.dimension(0); + // indices and params sizes were validated in DoCompute(). + const Index N = static_cast(indices.size()); + const Index limit = static_cast(params.dimension(0)); for (Index i = 0; i < N; i++) { // Grab the index and check its validity. An earlier version of the // code checked it and then grabbed it from memory a second time, which diff --git a/tensorflow/core/kernels/session_ops.cc b/tensorflow/core/kernels/session_ops.cc new file mode 100644 index 00000000000..6c814e2d40d --- /dev/null +++ b/tensorflow/core/kernels/session_ops.cc @@ -0,0 +1,120 @@ +/* Copyright 2015 Google Inc. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// See docs in ../ops/data_flow_ops.cc. 
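
For reference, the `shape_and_slice` strings consumed by the reworked `ParseShapeAndSlice` in `save_restore_tensor.cc` above are space-separated: the first n-1 tokens are dimension sizes and the last token is the slice spec, which the real code hands to `TensorSlice::Parse`. A toy parser for just the splitting step, assuming a well-formed input and the usual `start,length`/`-` slice notation:

```c++
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Toy version of the split in ParseShapeAndSlice: the final token is the
// slice spec; everything before it is a dimension size.
bool ParseShapeAndSlice(const std::string& spec, std::vector<int>* shape,
                        std::string* slice) {
  std::istringstream in(spec);
  std::vector<std::string> splits;
  for (std::string tok; in >> tok;) splits.push_back(tok);
  if (splits.size() < 2) return false;  // need at least one dim plus a slice
  *slice = splits.back();               // the last split is the slice spec
  splits.pop_back();
  shape->clear();
  for (const auto& s : splits) shape->push_back(std::stoi(s));
  return true;
}

int main() {
  std::vector<int> shape;
  std::string slice;
  // Hypothetical spec: a [4, 10] tensor, rows 0..1 and all columns.
  if (ParseShapeAndSlice("4 10 0,2:-", &shape, &slice)) {
    std::cout << shape[0] << "x" << shape[1] << " slice=" << slice << "\n";
  }
}
```
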
+ +#include +#include + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/gtl/map_util.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +class GetSessionHandleOp : public OpKernel { + public: + explicit GetSessionHandleOp(OpKernelConstruction* context) + : OpKernel(context) {} + + void Compute(OpKernelContext* ctx) override { + Tensor val = ctx->input(0); + int64 id = ctx->session_state()->GetNewId(); + TensorStore::TensorAndKey tk{val, id, def().device()}; + OP_REQUIRES_OK(ctx, ctx->tensor_store()->AddTensor(def().name(), tk)); + Tensor* handle = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &handle)); + handle->flat().setConstant(tk.GetHandle(def().name())); + } + + TF_DISALLOW_COPY_AND_ASSIGN(GetSessionHandleOp); +}; + +REGISTER_KERNEL_BUILDER(Name("GetSessionHandle").Device(DEVICE_CPU), + GetSessionHandleOp); + +#define REGISTER_GPU_KERNEL(type) \ + REGISTER_KERNEL_BUILDER(Name("GetSessionHandle") \ + .Device(DEVICE_GPU) \ + .HostMemory("handle") \ + .TypeConstraint("T"), \ + GetSessionHandleOp) + +TF_CALL_NUMBER_TYPES(REGISTER_GPU_KERNEL); +REGISTER_GPU_KERNEL(bool); +#undef REGISTER_GPU_KERNEL + +class GetSessionTensorOp : public OpKernel { + public: + explicit GetSessionTensorOp(OpKernelConstruction* context) + : OpKernel(context) {} + + void Compute(OpKernelContext* ctx) override { + const Tensor& handle = ctx->input(0); + const string& name = handle.scalar()(); + Tensor val; + OP_REQUIRES_OK(ctx, ctx->session_state()->GetTensor(name, &val)); + ctx->set_output(0, val); + } + + TF_DISALLOW_COPY_AND_ASSIGN(GetSessionTensorOp); +}; + +REGISTER_KERNEL_BUILDER(Name("GetSessionTensor").Device(DEVICE_CPU), + GetSessionTensorOp); + +#define REGISTER_GPU_KERNEL(type) \ + REGISTER_KERNEL_BUILDER(Name("GetSessionTensor") \ + .Device(DEVICE_GPU) \ + .HostMemory("handle") \ + .TypeConstraint("dtype"), \ + GetSessionTensorOp) + +TF_CALL_NUMBER_TYPES(REGISTER_GPU_KERNEL); +REGISTER_GPU_KERNEL(bool); +#undef REGISTER_GPU_KERNEL + +class DeleteSessionTensorOp : public OpKernel { + public: + explicit DeleteSessionTensorOp(OpKernelConstruction* context) + : OpKernel(context) {} + + void Compute(OpKernelContext* ctx) override { + const Tensor& handle = ctx->input(0); + const string& name = handle.scalar()(); + OP_REQUIRES_OK(ctx, ctx->session_state()->DeleteTensor(name)); + } + + TF_DISALLOW_COPY_AND_ASSIGN(DeleteSessionTensorOp); +}; + +REGISTER_KERNEL_BUILDER(Name("DeleteSessionTensor").Device(DEVICE_CPU), + DeleteSessionTensorOp); +REGISTER_KERNEL_BUILDER( + Name("DeleteSessionTensor").Device(DEVICE_GPU).HostMemory("handle"), + DeleteSessionTensorOp); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/slice_op.cc b/tensorflow/core/kernels/slice_op.cc index bce7879a544..5d61cd55456 100644 --- a/tensorflow/core/kernels/slice_op.cc +++ b/tensorflow/core/kernels/slice_op.cc @@ -155,7 +155,7 @@ class SliceOp : public OpKernel { // TODO(agarwal): Consider multi-threading 
this loop for cases where // size[0] is very large. for (int i = 0; i < size[0]; ++i) { - const int row = begin[0] + i; + const int64 row = begin[0] + i; if (i + 1 < size[0]) { port::prefetch(&output(i + 1, 0)); port::prefetch(&input(row + 1, begin[1])); diff --git a/tensorflow/core/kernels/summary_op.cc b/tensorflow/core/kernels/summary_op.cc index 16e5b0a0fff..500199bc30f 100644 --- a/tensorflow/core/kernels/summary_op.cc +++ b/tensorflow/core/kernels/summary_op.cc @@ -89,7 +89,7 @@ class SummaryHistoOp : public OpKernel { T v = flat(i); if (!std::isfinite(v)) { c->SetStatus( - errors::OutOfRange("Nan in summary histogram for: ", name())); + errors::InvalidArgument("Nan in summary histogram for: ", name())); break; } histo.Add(static_cast(v)); diff --git a/tensorflow/core/kernels/tensor_array.cc b/tensorflow/core/kernels/tensor_array.cc index f9267a902a0..3dab15f780b 100644 --- a/tensorflow/core/kernels/tensor_array.cc +++ b/tensorflow/core/kernels/tensor_array.cc @@ -13,49 +13,45 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#define EIGEN_USE_THREADS #include "tensorflow/core/kernels/tensor_array.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/aggregate_ops_cpu.h" + namespace tensorflow { -Status TensorArray::LockedWrite(OpKernelContext* ctx, const int32 index, - PersistentTensor* value) { - TF_RETURN_IF_ERROR(LockedReturnIfClosed()); - size_t index_size = static_cast(index); - if (index < 0 || - (!dynamic_size_ && index_size >= tensors_.size())) { - return errors::InvalidArgument( - "TensorArray ", handle_.vec()(1), ": Tried to write to index ", - index, " but array is not resizeable and size is: ", tensors_.size()); +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +namespace tensor_array { + +#define TENSOR_ARRAY_WRITE_OR_ADD(Device, T) \ + template <> \ + Status AddToTensor(OpKernelContext * ctx, Tensor * sum, \ + const Tensor* current, const Tensor* add) { \ + functor::Add2Functor add_functor; \ + add_functor(ctx->template eigen_device(), sum->flat(), \ + current->flat(), add->flat()); \ + return Status::OK(); \ } - if (dynamic_size_) { - // We must grow the internal TensorArray - if (index_size >= tensors_.capacity()) { - tensors_.reserve(2 * (index_size + 1)); - } - if (index_size >= tensors_.size()) { - tensors_.resize(index_size + 1); - } - } - TensorAndState& t = tensors_[index]; - if (t.written) { - return errors::InvalidArgument("TensorArray ", handle_.vec()(1), - ": Could not write to TensorArray index ", - index, - " because it has already been written to."); - } - Tensor* value_t = value->AccessTensor(ctx); - if (value_t->dtype() != dtype_) { - return errors::InvalidArgument( - "TensorArray ", handle_.vec()(1), - ": Could not write to TensorArray index ", index, - " because the value dtype is ", DataTypeString(value_t->dtype()), - " but TensorArray dtype is ", DataTypeString(dtype_), "."); - } - t.tensor = *value; - t.shape = value_t->shape(); - t.written = true; - return Status::OK(); -} + +#define TENSOR_ARRAY_WRITE_OR_ADD_CPU(T) TENSOR_ARRAY_WRITE_OR_ADD(CPUDevice, T) +TF_CALL_NUMBER_TYPES(TENSOR_ARRAY_WRITE_OR_ADD_CPU) +#undef TENSOR_ARRAY_WRITE_OR_ADD_CPU + +#if GOOGLE_CUDA + +#define TENSOR_ARRAY_WRITE_OR_ADD_GPU(T) TENSOR_ARRAY_WRITE_OR_ADD(GPUDevice, T) 
+TF_CALL_GPU_NUMBER_TYPES(TENSOR_ARRAY_WRITE_OR_ADD_GPU); +#undef TENSOR_ARRAY_WRITE_OR_ADD_GPU + +#endif // GOOGLE_CUDA + +#undef TENSOR_ARRAY_WRITE_OR_ADD + +} // namespace tensor_array Status TensorArray::LockedRead(const int32 index, PersistentTensor* value) { TF_RETURN_IF_ERROR(LockedReturnIfClosed()); @@ -64,20 +60,25 @@ Status TensorArray::LockedRead(const int32 index, PersistentTensor* value) { " but array size is: ", tensors_.size()); } TensorAndState& t = tensors_[index]; - if (t.read) { - return errors::InvalidArgument( - "TensorArray ", handle_.vec()(1), ": Could not read index ", - index, " twice because TensorArray a read-once object."); - } if (!t.written) { return errors::InvalidArgument("TensorArray ", handle_.vec()(1), ": Could not read from TensorArray index ", index, " because it has not yet been written to."); } + if (t.cleared) { + return errors::InvalidArgument("TensorArray ", handle_.vec()(1), + ": Could not read index ", index, + " twice because it was cleared after a " + "previous read (perhaps try setting " + "clear_after_read = false?)."); + } *value = t.tensor; + if (clear_after_read_) { + t.tensor = PersistentTensor(); + t.cleared = true; + } t.read = true; - t.tensor = PersistentTensor(); return Status::OK(); } diff --git a/tensorflow/core/kernels/tensor_array.h b/tensorflow/core/kernels/tensor_array.h index 7ef04bee947..6a206dd904e 100644 --- a/tensorflow/core/kernels/tensor_array.h +++ b/tensorflow/core/kernels/tensor_array.h @@ -24,22 +24,60 @@ limitations under the License. #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/aggregate_ops.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" namespace tensorflow { +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +namespace tensor_array { + +// Full implementations are in tensor_array.cc +template +Status AddToTensor(OpKernelContext* ctx, Tensor* sum, const Tensor* current, + const Tensor* add) { + return errors::InvalidArgument( + "tensor_array::AddToTensor type not supported: ", + DataTypeString(DataTypeToEnum::value)); +}; + +#define TENSOR_ARRAY_WRITE_OR_ADD(Device, T) \ + template <> \ + Status AddToTensor(OpKernelContext * ctx, Tensor * sum, \ + const Tensor* current, const Tensor* add); + +#define TENSOR_ARRAY_WRITE_OR_ADD_CPU(T) TENSOR_ARRAY_WRITE_OR_ADD(CPUDevice, T) +TF_CALL_NUMBER_TYPES(TENSOR_ARRAY_WRITE_OR_ADD_CPU) +#undef TENSOR_ARRAY_WRITE_OR_ADD_CPU + +#if GOOGLE_CUDA + +#define TENSOR_ARRAY_WRITE_OR_ADD_GPU(T) TENSOR_ARRAY_WRITE_OR_ADD(GPUDevice, T) +TF_CALL_GPU_NUMBER_TYPES(TENSOR_ARRAY_WRITE_OR_ADD_GPU); +#undef TENSOR_ARRAY_WRITE_OR_ADD_GPU + +#endif // GOOGLE_CUDA + +#undef TENSOR_ARRAY_WRITE_OR_ADD + +} // namespace tensor_array + // The TensorArray object keeps an array of PersistentTensors. It // allows reading from the array and writing to the array. // // Important properties: -// * Reading and writing to a particular index in the TensorArray -// is allowed at most once per index. -// * Upon reading an entry, that entry is cleared from the array and -// marked as read. This allows removal of Tensor from memory -// as soon as it is not needed. Its shape is saved. -// * No deep copies of any PersistentTensor are ever made. +// * Usually, writing to a particular index in the TensorArray is allowed at +// most once per index. 
In a special case, writes with the flag
+//   multiple_writes_aggregate allow multiple writes to the same
+//   index.  In this case, the writes are summed.
+// * Multiple reads are supported.
+// * Deep copies of PersistentTensors are rarely made.  The only
+//   time they are made is when WriteOrAggregate is called at least twice
+//   on the same index with the flag multiple_writes_aggregate = True.
 // * Reading and Writing to the array is protected by a mutex.
 //   All operations on a TensorArray are thread-safe.
 // * A TensorArray may be preemptively closed, which releases all
@@ -51,8 +89,12 @@ namespace tensorflow {
 // * Write-Once semantics mean the gradient of a TensorArray Read never has to
 //   worry which of multiple writes to that index the gradient value
 //   is meant for.
-// * Read-Once semantics mean the TensorArray never sees
-//   multiple writes to the same index as part of gradient aggregation.
+// * Read-Many semantics (when using clear_after_read=false) allow the
+//   TensorArray to be read, packed, or concatenated multiple times;
+//   and the gradient operations use the multiple_writes_aggregate
+//   flag to aggregate the backprop writes.  Multiple backprop writes to
+//   the same index are partial gradients corresponding to the
+//   multiple reads of that index in the forward phase.
 //
 class TensorArray : public ResourceBase {
  public:
@@ -61,11 +103,15 @@ class TensorArray : public ResourceBase {
   // can hold more than MAX_INT entries, in practice we do not expect
   // users to construct this many Tensors for storage in a TensorArray.
   TensorArray(const DataType& dtype, const Tensor& handle, int32 N,
-              bool dynamic_size)
+              bool dynamic_size, bool multiple_writes_aggregate,
+              bool clear_after_read)
       : dtype_(dtype),
         handle_(handle),
         closed_(false),
         dynamic_size_(dynamic_size),
+        multiple_writes_aggregate_(multiple_writes_aggregate),
+        gradients_disallowed_(false),
+        clear_after_read_(clear_after_read),
         tensors_(N) {}
 
   // Write PersistentTensor 'value' to index 'index'.
@@ -77,25 +123,40 @@ class TensorArray : public ResourceBase {
   // Otherwise:
   //  The index is in [0, N) where N == Size()
   // * The dtype of the Tensor in 'value' matches the TensorArray's dtype.
-  // * The Tensor at 'index' has not yet been written to.
+  // * If multiple_writes_aggregate is false:
+  //   The Tensor at 'index' has not yet been written to.
+  // * If multiple_writes_aggregate is true:
+  //   The Tensor at 'index' has the same shape as value.
   //
   // Side effects:
-  // * The underlying Tensor in 'value' has a new reference to it.
-  // * Index 'index' is marked as written.
+  // * On the first write to 'index':
+  //   - The underlying Tensor in 'value' has a new reference to it.
+  //   - The index 'index' is marked as written.
+  // * If multiple_writes_aggregate is false, subsequent writes to 'index'
+  //   raise an InvalidArgument error.
+  // * If multiple_writes_aggregate is true, subsequent writes to 'index':
+  //   - The underlying Tensors in 'value' and from the first write
+  //     are released and a local PersistentTensor is created.
+  //   - Index 'index' is also marked as local_copy.
+  //   - The gradients_disallowed flag is set to true (GradientsAllowed()
+  //     will now return false).
+  //
   // Note, value is passed as a pointer because its underlying
   // Tensor's shape is accessed.  Otherwise it is not modified.
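
The write semantics described above reduce to a small per-index state machine: the first write stores a shallow reference, and a second write either fails or, with `multiple_writes_aggregate`, sums into a locally owned copy and disables gradients. A simplified sketch, with scalar doubles standing in for tensors rather than the real `PersistentTensor` plumbing:

```c++
#include <iostream>
#include <stdexcept>

// Per-index state, loosely mirroring TensorAndState.
struct Slot {
  double value = 0.0;
  bool written = false;
  bool local_copy = false;
};

struct MiniTensorArray {
  bool multiple_writes_aggregate;
  bool gradients_disallowed = false;
  Slot slot;  // a single index, for brevity

  void WriteOrAggregate(double v) {
    if (!slot.written) {
      slot.value = v;  // first write: shallow "reference"
      slot.written = true;
    } else if (!multiple_writes_aggregate) {
      throw std::invalid_argument("index already written");
    } else {
      slot.value += v;              // aggregate into a local copy
      slot.local_copy = true;
      gradients_disallowed = true;  // GradientsAllowed() now returns false
    }
  }
};

int main() {
  MiniTensorArray ta{/*multiple_writes_aggregate=*/true};
  ta.WriteOrAggregate(1.5);
  ta.WriteOrAggregate(2.0);
  std::cout << ta.slot.value << "\n";  // prints 3.5
}
```
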
- Status Write(OpKernelContext* ctx, const int32 index, - PersistentTensor* value) { + template + Status WriteOrAggregate(OpKernelContext* ctx, const int32 index, + PersistentTensor* value) { mutex_lock l(mu_); - return LockedWrite(ctx, index, value); + return LockedWriteOrAggregate(ctx, index, value); } - Status WriteMany(OpKernelContext* ctx, - std::vector* values) { + template + Status WriteOrAggregateMany(OpKernelContext* ctx, + std::vector* values) { mutex_lock l(mu_); for (int32 i = values->size() - 1; i >= 0; --i) { - TF_RETURN_IF_ERROR(LockedWrite(ctx, i, &(*values)[i])); + Status s = LockedWriteOrAggregate(ctx, i, &(*values)[i]); + TF_RETURN_IF_ERROR(s); } return Status::OK(); } @@ -106,13 +167,15 @@ class TensorArray : public ResourceBase { // * The TensorArray is not closed // * The index is in [0, N) // * The Tensor at 'index' has been written to. - // * The Tensor at 'index' has not already been read. + // * The Tensor at 'index' has not been read from with flag + // clear_after_read = true. // // Side effects: - // * The PersistentTensor at 'index' is cleared from the given index. - // * The reference to the underlying Tensor at 'index' is shifted to + // * If clear_after_read is true, the reference to the underlying + // Tensor is deleted. + // * The reference to the underlying Tensor at 'index' is copied to // the returned '*value'. - // * Index 'index' is marked as read. + // * The index is marked as read (it cannot be rewritten to). Status Read(const int32 index, PersistentTensor* value) { mutex_lock l(mu_); return LockedRead(index, value); @@ -161,6 +224,11 @@ class TensorArray : public ResourceBase { return dynamic_size_; } + bool GradientsAllowed() { + mutex_lock l(mu_); + return !gradients_disallowed_; + } + // Clear the TensorArray, including any Tensor references, and mark as closed. void ClearAndMarkClosed() { mutex_lock l(mu_); @@ -175,6 +243,11 @@ class TensorArray : public ResourceBase { Status LockedWrite(OpKernelContext* ctx, const int32 index, PersistentTensor* value) EXCLUSIVE_LOCKS_REQUIRED(mu_); + template + Status LockedWriteOrAggregate(OpKernelContext* ctx, const int32 index, + PersistentTensor* value) + EXCLUSIVE_LOCKS_REQUIRED(mu_); + Status LockedRead(const int32 index, PersistentTensor* value) EXCLUSIVE_LOCKS_REQUIRED(mu_); @@ -191,25 +264,134 @@ class TensorArray : public ResourceBase { mutex mu_; - bool closed_ - GUARDED_BY(mu_); // Marks that the tensor_array_ has been cleared. + // Marks that the tensor_array_ has been cleared. + bool closed_ GUARDED_BY(mu_); - bool dynamic_size_; // Determines if Writes are allowed to grow the array. + // Writes are allowed to grow the array. + bool dynamic_size_; + + // Multiple writes to the same index will result in summation of the + // values (used by backprop) + bool multiple_writes_aggregate_; + + // If multiple Writes were attempted (e.g. via attribute + // multiple_writes_aggregate), then gradients are disallowed. + bool gradients_disallowed_ GUARDED_BY(mu_); + + // After a read at an index, clear away its PersistentTensor to + // release memory. + bool clear_after_read_; // TensorAndState is used to keep track of the PersistentTensors // stored in the TensorArray, along with their shapes, and a boolean // that determines whether they have already been read or not. 
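
Reads follow the same pattern: with `clear_after_read = true` the tensor reference is dropped after the first read and a second read of the same index fails, while `clear_after_read = false` makes reads repeatable (which is what lets pack and concat revisit indices). A minimal sketch under the same simplifications as above:

```c++
#include <iostream>
#include <stdexcept>

struct ReadSlot {
  double value = 0.0;
  bool written = true;  // assume the index was already written
  bool cleared = false;
};

// Loosely mirrors LockedRead: clear_after_read releases the value after
// one read and marks the slot cleared.
double Read(ReadSlot* s, bool clear_after_read) {
  if (!s->written) throw std::invalid_argument("not yet written");
  if (s->cleared)
    throw std::invalid_argument("read twice; try clear_after_read = false");
  double v = s->value;
  if (clear_after_read) {
    s->value = 0.0;  // drop the "reference"
    s->cleared = true;
  }
  return v;
}

int main() {
  ReadSlot s{4.0};
  std::cout << Read(&s, /*clear_after_read=*/false) << " "
            << Read(&s, /*clear_after_read=*/false) << "\n";  // prints 4 4
}
```
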
struct TensorAndState { - TensorAndState() : written(false), read(false) {} + TensorAndState() + : written(false), read(false), cleared(false), local_copy(false) {} PersistentTensor tensor; TensorShape shape; bool written; // True if a Tensor has been written to the index. bool read; // True if a Tensor has been written to and read from the index. + bool cleared; // True if a tensor has been read with + // clear_after_read = true; + + // Used by writes when multiple_writes_aggregate is true. In this + // case, the first time a value is written, it is a shallow copy. + // The second time a value is written, it is aggregated. However, + // in this case a new Tensor must be constructed to hold the + // aggregated value. This flag marks that such a Tensor is being + // used. All future writes will aggregate to the existing local Tensor. + bool local_copy; }; // The list of underlying PersistentTensors and states. std::vector tensors_ GUARDED_BY(mu_); }; +template +Status TensorArray::LockedWriteOrAggregate(OpKernelContext* ctx, + const int32 index, + PersistentTensor* value) { + TF_RETURN_IF_ERROR(LockedReturnIfClosed()); + size_t index_size = static_cast(index); + if (index < 0 || (!dynamic_size_ && index_size >= tensors_.size())) { + return errors::InvalidArgument( + "TensorArray ", handle_.vec()(1), ": Tried to write to index ", + index, " but array is not resizeable and size is: ", tensors_.size()); + } + if (dynamic_size_) { + // We must grow the internal TensorArray + if (index_size >= tensors_.capacity()) { + tensors_.reserve(2 * (index_size + 1)); + } + if (index_size >= tensors_.size()) { + tensors_.resize(index_size + 1); + } + } + TensorAndState& t = tensors_[index]; + + Tensor* value_t = value->AccessTensor(ctx); + if (value_t->dtype() != dtype_) { + return errors::InvalidArgument( + "TensorArray ", handle_.vec()(1), + ": Could not write to TensorArray index ", index, + " because the value dtype is ", DataTypeString(value_t->dtype()), + " but TensorArray dtype is ", DataTypeString(dtype_), "."); + } + + if (t.read) { + return errors::InvalidArgument("TensorArray ", handle_.vec()(1), + ": Could not write to TensorArray index ", + index, " because it has already been read."); + } + + if (!multiple_writes_aggregate_ && t.written) { + return errors::InvalidArgument("TensorArray ", handle_.vec()(1), + ": Could not write to TensorArray index ", + index, + " because it has already been written to."); + } + + if (t.written) { + DCHECK(multiple_writes_aggregate_); + + // Check that value_t shape matches t.shape + if (value_t->shape() != t.shape) { + return errors::InvalidArgument( + "TensorArray ", handle_.vec()(1), + ": Could not aggregate to TensorArray index ", index, + " because the existing shape is ", t.shape.DebugString(), + " but the new input shape is ", value_t->shape().DebugString(), "."); + } + + Tensor* existing_t = t.tensor.AccessTensor(ctx); + + if (t.local_copy) { + Status s = tensor_array::AddToTensor(ctx, existing_t, + existing_t, value_t); + TF_RETURN_IF_ERROR(s); + } else { + PersistentTensor local_tensor; + Tensor* local_tensor_t; + TF_RETURN_IF_ERROR(ctx->allocate_persistent( + dtype_, existing_t->shape(), &local_tensor, &local_tensor_t)); + Status s = tensor_array::AddToTensor(ctx, local_tensor_t, + existing_t, value_t); + TF_RETURN_IF_ERROR(s); + t.tensor = local_tensor; + t.local_copy = true; + } + + // We've aggregated the values, so disallow backprop on this + // TensorArray. 
+ gradients_disallowed_ = true; + } else { + t.tensor = *value; + t.shape = value_t->shape(); + t.written = true; + } + return Status::OK(); +} + } // namespace tensorflow #endif // TENSORFLOW_KERNELS_TENSOR_ARRAY_H_ diff --git a/tensorflow/core/kernels/tensor_array_ops.cc b/tensorflow/core/kernels/tensor_array_ops.cc index c6d07344f32..098a68ac491 100644 --- a/tensorflow/core/kernels/tensor_array_ops.cc +++ b/tensorflow/core/kernels/tensor_array_ops.cc @@ -125,6 +125,8 @@ class TensorArrayOp : public TensorArrayCreationOp { : TensorArrayCreationOp(context) { OP_REQUIRES_OK(context, context->GetAttr("dtype", &dtype_)); OP_REQUIRES_OK(context, context->GetAttr("dynamic_size", &dynamic_size_)); + OP_REQUIRES_OK(context, + context->GetAttr("clear_after_read", &clear_after_read_)); OP_REQUIRES_OK(context, context->GetAttr("tensor_array_name", &tensor_array_name_)); if (tensor_array_name_ == "") tensor_array_name_ = name(); @@ -148,7 +150,8 @@ class TensorArrayOp : public TensorArrayCreationOp { handle(1) = tensor_array_name_; TensorArray* tensor_array = new TensorArray( - dtype_, *tensor_array_output_handle, size, dynamic_size_); + dtype_, *tensor_array_output_handle, size, dynamic_size_, + false /* multiple_writes_aggregate */, clear_after_read_); TF_RETURN_IF_ERROR(rm->Create(handle(0), tensor_array_name_, tensor_array)); @@ -160,6 +163,7 @@ class TensorArrayOp : public TensorArrayCreationOp { private: DataType dtype_; bool dynamic_size_; + bool clear_after_read_; string tensor_array_name_; // The name used to create the TensorArray. TF_DISALLOW_COPY_AND_ASSIGN(TensorArrayOp); @@ -220,11 +224,20 @@ class TensorArrayGradOp : public TensorArrayCreationOp { tensor_array->DisableDynamicSize(); TF_RETURN_IF_ERROR(tensor_array->Size(&array_size)); + if (!tensor_array->GradientsAllowed()) { + return errors::InvalidArgument( + "Unable to create a gradients TensorArray for ", tensor_array_name, + ". Perhaps you used the multiple_writes_aggregate flag on a " + "previous write? 
Gradient calculation is impossible when multiple " + "writes are performed to the same index."); + } + auto creator = [this, tensor_array, array_size, tensor_array_output_handle](TensorArray** ret) { - *ret = - new TensorArray(tensor_array->ElemType(), *tensor_array_output_handle, - array_size, false /* dynamic_size */); + *ret = new TensorArray( + tensor_array->ElemType(), *tensor_array_output_handle, array_size, + false /* dynamic_size */, true /* multiple_writes_aggregate */, + true /* close_after_read */); return Status::OK(); }; @@ -285,10 +298,10 @@ class TensorArrayWriteOp : public OpKernel { " but Op is trying to write dtype ", DataTypeString(tensor_value->dtype()), ".")); PersistentTensor persistent_tensor(*tensor_value); - OP_REQUIRES_OK(ctx, tensor_array->Write(ctx, index, &persistent_tensor)); + Status s = tensor_array->WriteOrAggregate(ctx, index, + &persistent_tensor); + OP_REQUIRES_OK(ctx, s); } - - bool IsExpensive() override { return false; } }; #define REGISTER_WRITE(type) \ @@ -737,7 +750,9 @@ class TensorArrayUnpackOp : public OpKernel { write_values.push_back(persistent_tensor); } - OP_REQUIRES_OK(ctx, tensor_array->WriteMany(ctx, &write_values)); + Status s = + tensor_array->WriteOrAggregateMany(ctx, &write_values); + OP_REQUIRES_OK(ctx, s); } }; @@ -871,7 +886,9 @@ class TensorArraySplitOp : public OpKernel { write_values.push_back(persistent_tensor); } - OP_REQUIRES_OK(ctx, tensor_array->WriteMany(ctx, &write_values)); + Status s = + tensor_array->WriteOrAggregateMany(ctx, &write_values); + OP_REQUIRES_OK(ctx, s); } }; diff --git a/tensorflow/core/kernels/transpose_op.cc b/tensorflow/core/kernels/transpose_op.cc index 52e792a399a..fb35d407345 100644 --- a/tensorflow/core/kernels/transpose_op.cc +++ b/tensorflow/core/kernels/transpose_op.cc @@ -49,7 +49,12 @@ class InvertPermutationOp : public OpKernel { context, TensorShapeUtils::IsVector(input.shape()), errors::InvalidArgument("invert_permutation expects a 1D vector.")); auto Tin = input.vec(); - const int N = Tin.size(); + OP_REQUIRES(context, + FastBoundsCheck(Tin.size(), std::numeric_limits::max()), + errors::InvalidArgument("permutation of nonnegative int32s " + "must have <= int32 max elements")); + const int32 N = + static_cast(Tin.size()); // Safe: bounds-checked above. Tensor* output = nullptr; OP_REQUIRES_OK(context, context->allocate_output(0, input.shape(), &output)); diff --git a/tensorflow/core/lib/core/error_codes.proto b/tensorflow/core/lib/core/error_codes.proto index 4c083a7c7a5..1b334677f1e 100644 --- a/tensorflow/core/lib/core/error_codes.proto +++ b/tensorflow/core/lib/core/error_codes.proto @@ -99,7 +99,7 @@ enum Code { // ABORTED, and UNAVAILABLE. ABORTED = 10; - // Operation was attempted past the valid range. E.g., seeking or + // Operation tried to iterate past the valid input range. E.g., seeking or // reading past end of file. // // Unlike INVALID_ARGUMENT, this error indicates a problem that may diff --git a/tensorflow/core/lib/core/threadpool.cc b/tensorflow/core/lib/core/threadpool.cc index 07ace0560ab..f4e952826a6 100644 --- a/tensorflow/core/lib/core/threadpool.cc +++ b/tensorflow/core/lib/core/threadpool.cc @@ -15,6 +15,16 @@ limitations under the License. 
#include "tensorflow/core/lib/core/threadpool.h" +#ifdef TENSORFLOW_USE_EIGEN_THREADPOOL +#define EIGEN_USE_THREADS +#define EIGEN_USE_CUSTOM_THREAD_POOL +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#else +#include +#include +#include +#endif + #include "tensorflow/core/platform/denormal.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/mutex.h" @@ -24,26 +34,97 @@ limitations under the License. namespace tensorflow { namespace thread { -struct ThreadPool::Waiter { - condition_variable cv; - bool ready; +#ifdef TENSORFLOW_USE_EIGEN_THREADPOOL + +struct EigenEnvironment { + typedef Thread EnvThread; + struct Task { + std::function f; + uint64 trace_id; + }; + + Env* const env_; + const ThreadOptions thread_options_; + const string name_; + + EigenEnvironment(Env* env, const ThreadOptions& thread_options, + const string& name) + : env_(env), thread_options_(thread_options), name_(name) {} + + EnvThread* CreateThread(std::function f) { + return env_->StartThread(thread_options_, name_, [=]() { + // Set the processor flag to flush denormals to zero + port::ScopedFlushDenormal flush; + f(); + }); + } + + Task CreateTask(std::function f) { + uint64 id = 0; + if (port::Tracing::IsActive()) { + id = port::Tracing::UniqueId(); + port::Tracing::RecordEvent(port::Tracing::EventCategory::kScheduleClosure, + id); + } + return Task{std::move(f), id}; + } + + void ExecuteTask(const Task& t) { + if (t.trace_id != 0) { + port::Tracing::ScopedActivity region( + port::Tracing::EventCategory::kRunClosure, t.trace_id); + t.f(); + } else { + t.f(); + } + } }; -ThreadPool::ThreadPool(Env* env, const string& name, int num_threads) - : ThreadPool(env, ThreadOptions(), name, num_threads) {} +struct ThreadPool::Impl : Eigen::ThreadPoolTempl { + Impl(Env* env, const ThreadOptions& thread_options, const string& name, + int num_threads) + : Eigen::ThreadPoolTempl( + num_threads, EigenEnvironment(env, thread_options, name)) {} +}; -ThreadPool::ThreadPool(Env* env, const ThreadOptions& thread_options, +#else + +struct ThreadPool::Impl { + Impl(Env* env, const ThreadOptions& thread_options, const string& name, + int num_threads); + ~Impl(); + void Schedule(std::function fn); + + private: + struct Waiter { + condition_variable cv; + bool ready; + }; + + struct Task { + std::function fn; + uint64 id; + }; + + void WorkerLoop(); + + const string name_; + mutex mu_; + std::vector threads_; // All threads + std::vector waiters_; // Stack of waiting threads. + std::deque pending_; // Queue of pending work +}; + +ThreadPool::Impl::Impl(Env* env, const ThreadOptions& thread_options, const string& name, int num_threads) : name_(name) { - CHECK_GE(num_threads, 1); - string name_prefix = "tf_" + name_; for (int i = 0; i < num_threads; i++) { - threads_.push_back(env->StartThread(thread_options, name_prefix, - [this]() { WorkerLoop(); })); + threads_.push_back( + env->StartThread(thread_options, name, [this]() { WorkerLoop(); })); } } -ThreadPool::~ThreadPool() { +ThreadPool::Impl::~Impl() { { // Wait for all work to get done. 
mutex_lock l(mu_); @@ -66,13 +147,7 @@ ThreadPool::~ThreadPool() { } } -bool ThreadPool::HasPendingClosures() const { - mutex_lock l(mu_); - return pending_.size() != 0; -} - -void ThreadPool::Schedule(std::function<void()> fn) { - CHECK(fn != nullptr); +void ThreadPool::Impl::Schedule(std::function<void()> fn) { uint64 id = 0; if (port::Tracing::IsActive()) { id = port::Tracing::UniqueId(); @@ -90,7 +165,7 @@ void ThreadPool::Schedule(std::function<void()> fn) { } } -void ThreadPool::WorkerLoop() { +void ThreadPool::Impl::WorkerLoop() { // Set the processor flag to flush denormals to zero port::ScopedFlushDenormal flush; @@ -107,22 +182,40 @@ void ThreadPool::WorkerLoop() { } } // Pick up pending work - Item item = pending_.front(); + Task t = pending_.front(); pending_.pop_front(); - if (item.fn == nullptr) { + if (t.fn == nullptr) { break; } mu_.unlock(); - if (item.id != 0) { + if (t.id != 0) { port::Tracing::ScopedActivity region( - port::Tracing::EventCategory::kRunClosure, item.id); - item.fn(); + port::Tracing::EventCategory::kRunClosure, t.id); + t.fn(); } else { - item.fn(); + t.fn(); } mu_.lock(); } } +#endif + +ThreadPool::ThreadPool(Env* env, const string& name, int num_threads) + : ThreadPool(env, ThreadOptions(), name, num_threads) {} + +ThreadPool::ThreadPool(Env* env, const ThreadOptions& thread_options, + const string& name, int num_threads) { + CHECK_GE(num_threads, 1); + impl_.reset( + new ThreadPool::Impl(env, thread_options, "tf_" + name, num_threads)); +} + +ThreadPool::~ThreadPool() {} + +void ThreadPool::Schedule(std::function<void()> fn) { + CHECK(fn != nullptr); + impl_->Schedule(std::move(fn)); } } // namespace thread } // namespace tensorflow diff --git a/tensorflow/core/lib/core/threadpool.h b/tensorflow/core/lib/core/threadpool.h index ef37dcf2d93..ae709e08249 100644 --- a/tensorflow/core/lib/core/threadpool.h +++ b/tensorflow/core/lib/core/threadpool.h @@ -16,13 +16,10 @@ limitations under the License. #ifndef TENSORFLOW_LIB_CORE_THREADPOOL_H_ #define TENSORFLOW_LIB_CORE_THREADPOOL_H_ -#include <deque> #include <functional> -#include <thread> -#include <vector> +#include <memory> #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/macros.h" -#include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/types.h" namespace tensorflow { @@ -45,28 +42,15 @@ class ThreadPool { // Wait until all scheduled work has finished and then destroy the // set of threads. - virtual ~ThreadPool(); + ~ThreadPool(); // Schedule fn() for execution in the pool of threads. - virtual void Schedule(std::function<void()> fn); + void Schedule(std::function<void()> fn); - virtual bool HasPendingClosures() const; + struct Impl; private: - struct Waiter; - struct Item { - std::function<void()> fn; - uint64 id; - }; - - void WorkerLoop(); - - const string name_; - mutable mutex mu_; - std::vector<Thread*> threads_; // All threads - std::vector<Waiter*> waiters_; // Stack of waiting threads. - std::deque<Item> pending_; // Queue of pending work - + std::unique_ptr<Impl> impl_; TF_DISALLOW_COPY_AND_ASSIGN(ThreadPool); }; diff --git a/tensorflow/core/lib/core/threadpool_test.cc b/tensorflow/core/lib/core/threadpool_test.cc index 59ca99c299c..f0edebdd62b 100644 --- a/tensorflow/core/lib/core/threadpool_test.cc +++ b/tensorflow/core/lib/core/threadpool_test.cc @@ -18,6 +18,7 @@ limitations under the License.
#include <atomic> #include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/test_benchmark.h" diff --git a/tensorflow/core/lib/io/table_test.cc b/tensorflow/core/lib/io/table_test.cc index 7b362935653..08f92571045 100644 --- a/tensorflow/core/lib/io/table_test.cc +++ b/tensorflow/core/lib/io/table_test.cc @@ -581,9 +581,9 @@ TEST(TableTest, ApproximateOffsetOfCompressed) { ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 10, 100)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 3000)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 3000)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 6000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 4000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 4000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 7000)); } TEST(TableTest, SeekToFirstKeyDoesNotReadTooMuch) { diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc index 36e6e29afb6..2b5fcd02416 100644 --- a/tensorflow/core/ops/array_ops.cc +++ b/tensorflow/core/ops/array_ops.cc @@ -224,6 +224,87 @@ diagonal: The extracted diagonal. )doc"); +// -------------------------------------------------------------------------- +REGISTER_OP("BatchMatrixDiag") + .Input("diagonal: T") + .Output("output: T") + .Attr("T: type") + .Doc(R"doc( +Returns a batched diagonal tensor with given batched diagonal values. + +Given a `diagonal`, this operation returns a tensor with the `diagonal` and +everything else padded with zeros. The diagonal is computed as follows: + +Assume `diagonal` has `k` dimensions `[I, J, K, ..., N]`, then the output is a +tensor of rank `k+1` with dimensions `[I, J, K, ..., N, N]` where: + +`output[i, j, k, ..., m, n] = 1{m=n} * diagonal[i, j, k, ..., n]`. + +For example: + +```prettyprint +# 'diagonal' is [[1, 2, 3, 4], [5, 6, 7, 8]] + +and diagonal.shape = (2, 4) + +tf.batch_matrix_diag(diagonal) ==> [[[1, 0, 0, 0] + [0, 2, 0, 0] + [0, 0, 3, 0] + [0, 0, 0, 4]], + [[5, 0, 0, 0] + [0, 6, 0, 0] + [0, 0, 7, 0] + [0, 0, 0, 8]]] + +which has shape (2, 4, 4) +``` + +diagonal: Rank `k`, where `k >= 1`. +output: Rank `k+1`, with `output.shape = diagonal.shape + [diagonal.shape[-1]]`. +)doc"); + +// -------------------------------------------------------------------------- +REGISTER_OP("BatchMatrixDiagPart") + .Input("input: T") + .Output("diagonal: T") + .Attr("T: type") + .Doc(R"doc( +Returns the batched diagonal part of a batched tensor. + +This operation returns a tensor with the `diagonal` part +of the batched `input`. The `diagonal` part is computed as follows: + +Assume `input` has `k` dimensions `[I, J, K, ..., N, N]`, then the output is a +tensor of rank `k - 1` with dimensions `[I, J, K, ..., N]` where: + +`diagonal[i, j, k, ..., n] = input[i, j, k, ..., n, n]`. + +The input must be at least a matrix. + +For example: + +```prettyprint +# 'input' is [[[1, 0, 0, 0] + [0, 2, 0, 0] + [0, 0, 3, 0] + [0, 0, 0, 4]], + [[5, 0, 0, 0] + [0, 6, 0, 0] + [0, 0, 7, 0] + [0, 0, 0, 8]]] + +and input.shape = (2, 4, 4) + +tf.batch_matrix_diag_part(input) ==> [[1, 2, 3, 4], [5, 6, 7, 8]] + +which has shape (2, 4) +``` + +input: Rank `k` tensor where `k >= 2` and the last two dimensions are equal. +diagonal: The extracted diagonal(s) having shape + `diagonal.shape = input.shape[:-1]`.
+)doc"); + // -------------------------------------------------------------------------- REGISTER_OP("Reverse") .Input("tensor: T") diff --git a/tensorflow/core/ops/compat/ops_history.v0.pbtxt b/tensorflow/core/ops/compat/ops_history.v0.pbtxt index 78074d09167..8fbbff37dfb 100644 --- a/tensorflow/core/ops/compat/ops_history.v0.pbtxt +++ b/tensorflow/core/ops/compat/ops_history.v0.pbtxt @@ -3004,6 +3004,36 @@ op { } } } +op { + name: "BatchMatrixDiag" + input_arg { + name: "diagonal" + type_attr: "T" + } + output_arg { + name: "output" + type_attr: "T" + } + attr { + name: "T" + type: "type" + } +} +op { + name: "BatchMatrixDiagPart" + input_arg { + name: "input" + type_attr: "T" + } + output_arg { + name: "diagonal" + type_attr: "T" + } + attr { + name: "T" + type: "type" + } +} op { name: "BatchMatrixInverse" input_arg { @@ -3050,6 +3080,38 @@ op { } } } +op { + name: "BatchMatrixSolve" + input_arg { + name: "matrix" + type_attr: "T" + } + input_arg { + name: "rhs" + type_attr: "T" + } + output_arg { + name: "output" + type_attr: "T" + } + attr { + name: "adjoint" + type: "bool" + default_value { + b: false + } + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + } + } + } +} op { name: "BatchMatrixSolveLs" input_arg { @@ -3118,6 +3180,45 @@ op { } } } +op { + name: "BatchMatrixTriangularSolve" + input_arg { + name: "matrix" + type_attr: "T" + } + input_arg { + name: "rhs" + type_attr: "T" + } + output_arg { + name: "output" + type_attr: "T" + } + attr { + name: "lower" + type: "bool" + default_value { + b: true + } + } + attr { + name: "adjoint" + type: "bool" + default_value { + b: false + } + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + } + } + } +} op { name: "BatchNormWithGlobalNormalization" input_arg { @@ -5258,6 +5359,13 @@ op { } } } +op { + name: "DeleteSessionTensor" + input_arg { + name: "handle" + type: DT_STRING + } +} op { name: "DepthToSpace" input_arg { @@ -6509,6 +6617,36 @@ op { } } } +op { + name: "GetSessionHandle" + input_arg { + name: "value" + type_attr: "T" + } + output_arg { + name: "handle" + type: DT_STRING + } + attr { + name: "T" + type: "type" + } +} +op { + name: "GetSessionTensor" + input_arg { + name: "handle" + type: DT_STRING + } + output_arg { + name: "value" + type_attr: "dtype" + } + attr { + name: "dtype" + type: "type" + } +} op { name: "Greater" input_arg { @@ -8323,6 +8461,38 @@ op { } } } +op { + name: "MatrixSolve" + input_arg { + name: "matrix" + type_attr: "T" + } + input_arg { + name: "rhs" + type_attr: "T" + } + output_arg { + name: "output" + type_attr: "T" + } + attr { + name: "adjoint" + type: "bool" + default_value { + b: false + } + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + } + } + } +} op { name: "MatrixSolveLs" input_arg { @@ -8391,6 +8561,45 @@ op { } } } +op { + name: "MatrixTriangularSolve" + input_arg { + name: "matrix" + type_attr: "T" + } + input_arg { + name: "rhs" + type_attr: "T" + } + output_arg { + name: "output" + type_attr: "T" + } + attr { + name: "lower" + type: "bool" + default_value { + b: true + } + } + attr { + name: "adjoint" + type: "bool" + default_value { + b: false + } + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + } + } + } +} op { name: "Max" input_arg { @@ -11056,6 +11265,35 @@ op { type: DT_FLOAT } } +op { + name: "ReduceJoin" + input_arg { + name: "inputs" + type: DT_STRING + } + 
input_arg { + name: "reduction_indices" + type: DT_INT32 + } + output_arg { + name: "output" + type: DT_STRING + } + attr { + name: "keep_dims" + type: "bool" + default_value { + b: false + } + } + attr { + name: "separator" + type: "string" + default_value { + s: "" + } + } +} op { name: "RefEnter" input_arg { @@ -16924,6 +17162,44 @@ op { } is_stateful: true } +op { + name: "TensorArray" + input_arg { + name: "size" + type: DT_INT32 + } + output_arg { + name: "handle" + type: DT_STRING + is_ref: true + } + attr { + name: "dtype" + type: "type" + } + attr { + name: "dynamic_size" + type: "bool" + default_value { + b: false + } + } + attr { + name: "clear_after_read" + type: "bool" + default_value { + b: true + } + } + attr { + name: "tensor_array_name" + type: "string" + default_value { + s: "" + } + } + is_stateful: true +} op { name: "TensorArrayClose" input_arg { diff --git a/tensorflow/core/ops/data_flow_ops.cc b/tensorflow/core/ops/data_flow_ops.cc index 078753f053a..cef74ca8ac7 100644 --- a/tensorflow/core/ops/data_flow_ops.cc +++ b/tensorflow/core/ops/data_flow_ops.cc @@ -389,6 +389,7 @@ REGISTER_OP("TensorArray") .Input("size: int32") .Attr("dtype: type") .Attr("dynamic_size: bool = false") + .Attr("clear_after_read: bool = true") .Attr("tensor_array_name: string = ''") .Output("handle: Ref(string)") .SetIsStateful() @@ -401,6 +402,9 @@ size: The size of the array. dtype: The type of the elements on the tensor_array. dynamic_size: A boolean that determines whether writes to the TensorArray are allowed to grow the size. By default, this is not allowed. +clear_after_read: If true (default), Tensors in the TensorArray are cleared + after being read. This disables multiple read semantics but allows early + release of memory. tensor_array_name: Overrides the name used for the temporary tensor_array resource. Default value is the name of the 'TensorArray' op (which is guaranteed unique). @@ -483,7 +487,7 @@ REGISTER_OP("TensorArrayRead") .Output("value: dtype") .Attr("dtype: type") .Doc(R"doc( -Read an element from the TensorArray. +Read an element from the TensorArray into output `value`. handle: The handle to a TensorArray. dtype: The type of the elem that is returned. @@ -497,7 +501,7 @@ REGISTER_OP("TensorArrayPack") .Output("value: dtype") .Attr("dtype: type") .Doc(R"doc( -Pack the elements from the TensorArray. +Pack the elements from the TensorArray into output `value`. All elements must have the same shape. @@ -530,12 +534,17 @@ REGISTER_OP("TensorArrayConcat") .Output("lengths: int64") .Attr("dtype: type") .Doc(R"doc( -Concat the elements from the TensorArray. +Concat the elements from the TensorArray into value `value`. + +Takes `T` elements of shapes + + ``` + (n0 x d0 x d1 x ...), (n1 x d0 x d1 x ...), ..., (n(T-1) x d0 x d1 x ...) + ``` -Takes T elements of shapes (n0 x d0 x d1 x ...), (n1 x d0 x d1 x ...), - ..., (n(T-1) x d0 x d1 x ...) and concatenates them into a Tensor of shape: - (n0 + n1 + ... + n(T-1) x d0 x d1 x ...). + + ```(n0 + n1 + ... + n(T-1) x d0 x d1 x ...)``` All elements must have the same shape (excepting the first dimension). @@ -546,7 +555,7 @@ value: All of the elements in the TensorArray, concatenated along the first axis. lengths: A vector of the row sizes of the original T elements in the value output. In the example above, this would be the values: - (n1, n2, ..., n(T-1)) + `(n1, n2, ..., n(T-1))`. 
)doc"); REGISTER_OP("TensorArraySplit") @@ -560,15 +569,22 @@ REGISTER_OP("TensorArraySplit") Split the data from the input value into TensorArray elements. Assuming that `lengths` takes on values - (n0, n1, ..., n(T-1)) + + ```(n0, n1, ..., n(T-1))``` + and that `value` has shape - (n0 + n1 + ... + n(T-1) x d0 x d1 x ...), + + ```(n0 + n1 + ... + n(T-1) x d0 x d1 x ...)```, + this splits values into a TensorArray with T tensors. TensorArray index t will be the subtensor of values with starting position - (n0 + n1 + ... + n(t-1), 0, 0, ...) + + ```(n0 + n1 + ... + n(t-1), 0, 0, ...)``` + and having size - nt x d0 x d1 x ... + + ```nt x d0 x d1 x ...``` handle: The handle to a TensorArray. value: The concatenated tensor to write to the TensorArray. @@ -670,4 +686,35 @@ keys: Keys of type Tkey. values: Values of type Tval. Same shape as `keys`. )doc"); +REGISTER_OP("GetSessionHandle") + .Input("value: T") + .Output("handle: string") + .Attr("T: type") + .Doc(R"doc( +Store the input tensor in the state of the current session. + +value: The tensor to be stored. +handle: The handle for the tensor stored in the session state. +)doc"); + +REGISTER_OP("GetSessionTensor") + .Input("handle: string") + .Output("value: dtype") + .Attr("dtype: type") + .Doc(R"doc( +Get the value of the tensor specified by its handle. + +handle: The handle for a tensor stored in the session state. +value: The tensor for the given handle. +dtype: The type of the output value. +)doc"); + +REGISTER_OP("DeleteSessionTensor") + .Input("handle: string") + .Doc(R"doc( +Delete the tensor specified by its handle in the session. + +handle: The handle for a tensor stored in the session state. +)doc"); + } // namespace tensorflow diff --git a/tensorflow/core/ops/logging_ops.cc b/tensorflow/core/ops/logging_ops.cc index cc744e24783..9f7751583a9 100644 --- a/tensorflow/core/ops/logging_ops.cc +++ b/tensorflow/core/ops/logging_ops.cc @@ -89,7 +89,7 @@ The generated [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto) has one summary value containing a histogram for `values`. -This op reports an `OutOfRange` error if any value is not finite. +This op reports an `InvalidArgument` error if any value is not finite. tag: Scalar. Tag to use for the `Summary.Value`. values: Any shape. Values to use to build the histogram. diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt index 962ceac9509..e71db5d8929 100644 --- a/tensorflow/core/ops/ops.pbtxt +++ b/tensorflow/core/ops/ops.pbtxt @@ -1390,6 +1390,44 @@ op { summary: "Calculates the determinants for a batch of square matrices." description: "The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions\nform square matrices. The output is a 1-D tensor containing the determinants\nfor all input submatrices `[..., :, :]`." } +op { + name: "BatchMatrixDiag" + input_arg { + name: "diagonal" + description: "Rank `k`, where `k >= 1`." + type_attr: "T" + } + output_arg { + name: "output" + description: "Rank `k+1`, with `output.shape = diagonal.shape + [diagonal.shape[-1]]`." + type_attr: "T" + } + attr { + name: "T" + type: "type" + } + summary: "Returns a batched diagonal tensor with a given batched diagonal values." + description: "Given a `diagonal`, this operation returns a tensor with the `diagonal` and\neverything else padded with zeros. 
The diagonal is computed as follows:\n\nAssume `diagonal` has `k` dimensions `[I, J, K, ..., N]`, then the output is a\ntensor of rank `k+1` with dimensions [I, J, K, ..., N, N]` where:\n\n`output[i, j, k, ..., m, n] = 1{m=n} * diagonal[i, j, k, ..., n]`.\n\nFor example:\n\n```prettyprint\n# \'diagonal\' is [[1, 2, 3, 4], [5, 6, 7, 8]]\n\nand diagonal.shape = (2, 4)\n\ntf.batch_matrix_diag(diagonal) ==> [[[1, 0, 0, 0]\n [0, 2, 0, 0]\n [0, 0, 3, 0]\n [0, 0, 0, 4]],\n [[5, 0, 0, 0]\n [0, 6, 0, 0]\n [0, 0, 7, 0]\n [0, 0, 0, 8]]]\n\nwhich has shape (2, 4, 4)\n```" +} +op { + name: "BatchMatrixDiagPart" + input_arg { + name: "input" + description: "Rank `k` tensor where `k >= 2` and the last two dimensions are equal." + type_attr: "T" + } + output_arg { + name: "diagonal" + description: "The extracted diagonal(s) having shape\n`diagonal.shape = input.shape[:-1]`." + type_attr: "T" + } + attr { + name: "T" + type: "type" + } + summary: "Returns the batched diagonal part of a batched tensor." + description: "This operation returns a tensor with the `diagonal` part\nof the batched `input`. The `diagonal` part is computed as follows:\n\nAssume `input` has `k` dimensions `[I, J, K, ..., N, N]`, then the output is a\ntensor of rank `k - 1` with dimensions `[I, J, K, ..., N]` where:\n\n`diagonal[i, j, k, ..., n] = input[i, j, k, ..., n, n]`.\n\nThe input must be at least a matrix.\n\nFor example:\n\n```prettyprint\n# \'input\' is [[[1, 0, 0, 0]\n [0, 2, 0, 0]\n [0, 0, 3, 0]\n [0, 0, 0, 4]],\n [[5, 0, 0, 0]\n [0, 6, 0, 0]\n [0, 0, 7, 0]\n [0, 0, 0, 8]]]\n\nand input.shape = (2, 4, 4)\n\ntf.batch_matrix_diag_part(input) ==> [[1, 2, 3, 4], [5, 6, 7, 8]]\n\nwhich has shape (2, 4)\n```" +} op { name: "BatchMatrixInverse" input_arg { @@ -1432,6 +1470,14 @@ op { description: "Shape is `[..., M, K]`." type_attr: "T" } + attr { + name: "adjoint" + type: "bool" + default_value { + b: false + } + description: "Boolean indicating whether to solve with `matrix` or its (block-wise)\nadjoint." + } attr { name: "T" type: "type" @@ -1443,7 +1489,7 @@ op { } } summary: "Solves systems of linear equations. Checks for invertibility." - description: "Matrix is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions\nform square matrices. Rhs is a tensor of shape\n`[..., M, K]`. The output is a tensor shape `[..., M, K]` where each output\nmatrix satisfies matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]." + description: "Matrix is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions\nform square matrices. Rhs is a tensor of shape\n`[..., M, K]`. The output is a tensor shape `[..., M, K]`. If `adjoint` is `False` then each output\nmatrix satisfies `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.\nIf `adjoint` is `True` then each output\nmatrix satisfies `adjoint(matrix[..., :, :]) * output[..., :, :] = rhs[..., :, :]`." } op { name: "BatchMatrixSolveLs" @@ -1509,7 +1555,15 @@ op { default_value { b: true } - description: "Boolean indicating whether matrix is lower or upper triangular." + description: "Boolean indicating whether the innermost matrices in `matrix` are\nlower or upper triangular." + } + attr { + name: "adjoint" + type: "bool" + default_value { + b: false + } + description: "Boolean indicating whether to solve with `matrix` or its (block-wise)\nadjoint." 
} attr { name: "T" @@ -1522,7 +1576,7 @@ op { } } summary: "Solves systems of linear equations with upper or lower triangular matrices by" - description: "backsubstitution.\n\n`matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form\nsquare matrices. If `lower` is `True` then the strictly upper triangular part\nof each inner-most matrix is ignored. If `lower` is False then the strictly\nlower triangular part of each inner-most matrix is ignored. `rhs` is a tensor\nof shape [..., M, K]`.\n\nThe output is a tensor of shape `[..., M, K]`. If `lower` is `True` then the\noutput satisfies\n\\\\(\\sum_{k=0}^{i}\\\\) matrix[..., i, k] * output[..., k, j] = rhs[..., i, j].\nIf `lower` is false then the strictly then the output satisfies\n\\\\(sum_{k=i}^{K-1}\\\\) matrix[..., i, k] * output[..., k, j] = rhs[..., i, j]." + description: "backsubstitution.\n\n`matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form\nsquare matrices. If `lower` is `True` then the strictly upper triangular part\nof each inner-most matrix is assumed to be zero and not accessed.\nIf `lower` is False then the strictly lower triangular part of each inner-most\nmatrix is assumed to be zero and not accessed.\n`rhs` is a tensor of shape `[..., M, K]`.\n\nThe output is a tensor of shape `[..., M, K]`. If `adjoint` is `False` then the\ninnermost matrices in `output` satisfy matrix equations\n`matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.\nIf `adjoint` is `True` then the innermost matrices in\n`output` satisfy matrix equations\n`adjoint(matrix[..., :, :]) * output[..., :, :] = rhs[..., :, :]`." } op { name: "BatchNormWithGlobalNormalization" input_arg { @@ -2835,6 +2889,15 @@ op { } summary: "Reinterpret the bytes of a string as a vector of numbers." } +op { + name: "DeleteSessionTensor" + input_arg { + name: "handle" + description: "The handle for a tensor stored in the session state." + type: DT_STRING + } + summary: "Delete the tensor specified by its handle in the session." +} op { name: "DepthToSpace" input_arg { @@ -4100,6 +4163,43 @@ op { summary: "Gather values from `params` according to `indices`." description: "`indices` must be integer tensor, containing indices into `params`.\nIt must be shape `[d_0, ..., d_N, R]` where `R` is the rank of `params`.\nThe innermost dimension of `indices` (with length `R`) corresponds to the\nindices of `params`.\n\nProduces an output tensor with shape `[d_0, ..., d_{n-1}]` where:\n\n    output[i, j, k, ...] = params[indices[i, j, k, ..., :]]\n\ne.g. for `indices` a matrix:\n\n    output[i] = params[indices[i, :]]" } +op { + name: "GetSessionHandle" + input_arg { + name: "value" + description: "The tensor to be stored." + type_attr: "T" + } + output_arg { + name: "handle" + description: "The handle for the tensor stored in the session state." + type: DT_STRING + } + attr { + name: "T" + type: "type" + } + summary: "Store the input tensor in the state of the current session." +} +op { + name: "GetSessionTensor" + input_arg { + name: "handle" + description: "The handle for a tensor stored in the session state." + type: DT_STRING + } + output_arg { + name: "value" + description: "The tensor for the given handle." + type_attr: "dtype" + } + attr { + name: "dtype" + type: "type" + description: "The type of the output value." + } + summary: "Get the value of the tensor specified by its handle." +} op { name: "Greater" input_arg { @@ -4257,7 +4357,7 @@ op { } } summary: "Outputs a `Summary` protocol buffer with a histogram."
- description: "The generated\n[`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)\nhas one summary value containing a histogram for `values`.\n\nThis op reports an `OutOfRange` error if any value is not finite." + description: "The generated\n[`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)\nhas one summary value containing a histogram for `values`.\n\nThis op reports an `InvalidArgument` error if any value is not finite." } op { name: "IFFT" @@ -5403,9 +5503,17 @@ op { } output_arg { name: "output" - description: "Shape is `[M, K]` containing the tensor that solves\nmatrix * output = rhs." + description: "Shape is `[M, K]`. If `adjoint` is `False` then `output` solves\n`matrix` * `output` = `rhs`. If `adjoint` is `True` then `output` solves\n`adjoint(matrix)` * `output` = `rhs`." type_attr: "T" } + attr { + name: "adjoint" + type: "bool" + default_value { + b: false + } + description: "Boolean indicating whether to solve with `matrix` or its adjoint." + } attr { name: "T" type: "type" @@ -5482,7 +5590,15 @@ op { default_value { b: true } - description: "Boolean indicating whether matrix is lower or upper triangular." + description: "Boolean indicating whether `matrix` is lower or upper triangular." + } + attr { + name: "adjoint" + type: "bool" + default_value { + b: false + } + description: "Boolean indicating whether to solve with `matrix` or its adjoint." } attr { name: "T" type: "type" @@ -5495,7 +5611,7 @@ op { } } summary: "Solves a system of linear equations with an upper or lower triangular matrix by" - description: "backsubstitution.\n\n`matrix` is a matrix of shape `[M, M]`. If `lower` is `True` then the strictly\nupper triangular part of `matrix` is ignored. If `lower` is False then the\nstrictly lower triangular part of `matrix` is ignored. `rhs` is a matrix of\nshape [M, K]`.\n\nThe output is a matrix of shape `[M, K]`. If `lower` is `True` then the output\nsatisfies \\\\(\\sum_{k=0}^{i}\\\\) matrix[i, k] * output[k, j] = rhs[i, j].\nIf `lower` is false then output satisfies\n\\\\(\\sum_{k=i}^{K-1}\\\\) matrix[i, k] * output[k, j] = rhs[i, j]." + description: "backsubstitution.\n\n`matrix` is a matrix of shape `[M, M]`. If `lower` is `True` then the strictly\nupper triangular part of `matrix` is assumed to be zero and not accessed.\nIf `lower` is False then the strictly lower triangular part of `matrix` is\nassumed to be zero and not accessed.\n`rhs` is a matrix of shape `[M, K]`.\n\nThe output is a matrix of shape `[M, K]`. If `adjoint` is `False` then `output`\nsatisfies the matrix equation `matrix` * `output` = `rhs`.\nIf `adjoint` is `True` then `output` satisfies the matrix equation\n`adjoint(matrix)` * `output` = `rhs`." } op { name: "Max" @@ -7568,6 +7684,42 @@ op { summary: "Returns the real part of a complex number." description: "Given a tensor `in` of complex numbers, this operation returns a tensor of type\n`float` that is the real part of each element in `in`. All elements in `in`\nmust be complex numbers of the form \\\\(a + bj\\\\), where *a* is the real part\nreturned by this operation and *b* is the imaginary part.\n\nFor example:\n\n```\n# tensor \'in\' is [-2.25 + 4.75j, 3.25 + 5.75j]\ntf.real(in) ==> [-2.25, 3.25]\n```" } +op { + name: "ReduceJoin" + input_arg { + name: "inputs" + description: "The input to be joined. All reduced indices must have non-zero size."
+ type: DT_STRING + } + input_arg { + name: "reduction_indices" + description: "The dimensions to reduce over. Dimensions are reduced in the\norder specified. If `reduction_indices` has higher rank than `1`, it is\nflattened. Omitting `reduction_indices` is equivalent to passing\n`[n-1, n-2, ..., 0]`. Negative indices from `-n` to `-1` are supported." + type: DT_INT32 + } + output_arg { + name: "output" + description: "Has shape equal to that of the input with reduced dimensions removed or\nset to `1` depending on `keep_dims`." + type: DT_STRING + } + attr { + name: "keep_dims" + type: "bool" + default_value { + b: false + } + description: "If `True`, retain reduced dimensions with length `1`." + } + attr { + name: "separator" + type: "string" + default_value { + s: "" + } + description: "The separator to use when joining." + } + summary: "Joins a string Tensor across the given dimensions." + description: "Computes the string join across dimensions in the given string Tensor of shape\n`[d_0, d_1, ..., d_n-1]`. Returns a new Tensor created by joining the input\nstrings with the given separator (default: empty string). Negative indices are\ncounted backwards from the end, with `-1` being equivalent to `n - 1`. Passing\nan empty `reduction_indices` joins all strings in linear index order and outputs\na scalar string.\n\n\nFor example:\n```\n# tensor `a` is [[\"a\", \"b\"], [\"c\", \"d\"]]\ntf.reduce_join(a, 0) ==> [\"ac\", \"bd\"]\ntf.reduce_join(a, 1) ==> [\"ab\", \"cd\"]\ntf.reduce_join(a, -2) = tf.reduce_join(a, 0) ==> [\"ac\", \"bd\"]\ntf.reduce_join(a, -1) = tf.reduce_join(a, 1) ==> [\"ab\", \"cd\"]\ntf.reduce_join(a, 0, keep_dims=True) ==> [[\"ac\", \"bd\"]]\ntf.reduce_join(a, 1, keep_dims=True) ==> [[\"ab\"], [\"cd\"]]\ntf.reduce_join(a, 0, separator=\".\") ==> [\"a.c\", \"b.d\"]\ntf.reduce_join(a, [0, 1]) ==> [\"acbd\"]\ntf.reduce_join(a, [1, 0]) ==> [\"abcd\"]\ntf.reduce_join(a, []) ==> [\"abcd\"]\n```" +} op { name: "RefEnter" input_arg { @@ -11107,6 +11259,14 @@ op { } description: "A boolean that determines whether writes to the TensorArray\nare allowed to grow the size. By default, this is not allowed." } + attr { + name: "clear_after_read" + type: "bool" + default_value { + b: true + } + description: "If true (default), Tensors in the TensorArray are cleared\nafter being read. This disables multiple read semantics but allows early\nrelease of memory." + } attr { name: "tensor_array_name" type: "string" @@ -11150,7 +11310,7 @@ op { } output_arg { name: "lengths" - description: "A vector of the row sizes of the original T elements in the\nvalue output. In the example above, this would be the values:\n(n1, n2, ..., n(T-1))" + description: "A vector of the row sizes of the original T elements in the\nvalue output. In the example above, this would be the values:\n`(n1, n2, ..., n(T-1))`." type: DT_INT64 } attr { @@ -11158,8 +11318,8 @@ op { type: "type" description: "The type of the elem that is returned." } - summary: "Concat the elements from the TensorArray." - description: "Takes T elements of shapes (n0 x d0 x d1 x ...), (n1 x d0 x d1 x ...),\n ..., (n(T-1) x d0 x d1 x ...)\nand concatenates them into a Tensor of shape:\n (n0 + n1 + ... + n(T-1) x d0 x d1 x ...).\n\nAll elements must have the same shape (excepting the first dimension)." + summary: "Concat the elements from the TensorArray into value `value`." 
+ description: "Takes `T` elements of shapes\n\n ```\n (n0 x d0 x d1 x ...), (n1 x d0 x d1 x ...), ..., (n(T-1) x d0 x d1 x ...)\n ```\n\nand concatenates them into a Tensor of shape:\n\n ```(n0 + n1 + ... + n(T-1) x d0 x d1 x ...)```\n\nAll elements must have the same shape (excepting the first dimension)." } op { name: "TensorArrayGrad" @@ -11208,7 +11368,7 @@ op { type: "type" description: "The type of the elem that is returned." } - summary: "Pack the elements from the TensorArray." + summary: "Pack the elements from the TensorArray into output `value`." description: "All elements must have the same shape." } op { @@ -11238,7 +11398,7 @@ op { type: "type" description: "The type of the elem that is returned." } - summary: "Read an element from the TensorArray." + summary: "Read an element from the TensorArray into output `value`." } op { name: "TensorArraySize" @@ -11293,7 +11453,7 @@ op { type: "type" } summary: "Split the data from the input value into TensorArray elements." - description: "Assuming that `lengths` takes on values\n (n0, n1, ..., n(T-1))\nand that `value` has shape\n (n0 + n1 + ... + n(T-1) x d0 x d1 x ...),\nthis splits values into a TensorArray with T tensors.\n\nTensorArray index t will be the subtensor of values with starting position\n (n0 + n1 + ... + n(t-1), 0, 0, ...)\nand having size\n nt x d0 x d1 x ..." + description: "Assuming that `lengths` takes on values\n\n ```(n0, n1, ..., n(T-1))```\n\nand that `value` has shape\n\n ```(n0 + n1 + ... + n(T-1) x d0 x d1 x ...)```,\n\nthis splits values into a TensorArray with T tensors.\n\nTensorArray index t will be the subtensor of values with starting position\n\n ```(n0 + n1 + ... + n(t-1), 0, 0, ...)```\n\nand having size\n\n ```nt x d0 x d1 x ...```" } op { name: "TensorArrayUnpack" diff --git a/tensorflow/core/ops/string_ops.cc b/tensorflow/core/ops/string_ops.cc index d621c5368f5..93a239c5c47 100644 --- a/tensorflow/core/ops/string_ops.cc +++ b/tensorflow/core/ops/string_ops.cc @@ -33,4 +33,48 @@ num_buckets: The number of buckets. output: A Tensor of the same shape as the input `string_tensor`. )doc"); +REGISTER_OP("ReduceJoin") + .Input("inputs: string") + .Input("reduction_indices: int32") + .Attr("keep_dims: bool = false") + .Attr("separator: string = ''") + .Output("output: string") + .Doc(R"doc( +Joins a string Tensor across the given dimensions. + +Computes the string join across dimensions in the given string Tensor of shape +`[d_0, d_1, ..., d_n-1]`. Returns a new Tensor created by joining the input +strings with the given separator (default: empty string). Negative indices are +counted backwards from the end, with `-1` being equivalent to `n - 1`. Passing +an empty `reduction_indices` joins all strings in linear index order and outputs +a scalar string. + + +For example: +``` +# tensor `a` is [["a", "b"], ["c", "d"]] +tf.reduce_join(a, 0) ==> ["ac", "bd"] +tf.reduce_join(a, 1) ==> ["ab", "cd"] +tf.reduce_join(a, -2) = tf.reduce_join(a, 0) ==> ["ac", "bd"] +tf.reduce_join(a, -1) = tf.reduce_join(a, 1) ==> ["ab", "cd"] +tf.reduce_join(a, 0, keep_dims=True) ==> [["ac", "bd"]] +tf.reduce_join(a, 1, keep_dims=True) ==> [["ab"], ["cd"]] +tf.reduce_join(a, 0, separator=".") ==> ["a.c", "b.d"] +tf.reduce_join(a, [0, 1]) ==> ["acbd"] +tf.reduce_join(a, [1, 0]) ==> ["abcd"] +tf.reduce_join(a, []) ==> ["abcd"] +``` + +inputs: The input to be joined. All reduced indices must have non-zero size. +reduction_indices: The dimensions to reduce over. Dimensions are reduced in the + order specified. 
 If `reduction_indices` has higher rank than `1`, it is + flattened. Omitting `reduction_indices` is equivalent to passing + `[n-1, n-2, ..., 0]`. Negative indices from `-n` to `-1` are supported. +keep_dims: If `True`, retain reduced dimensions with length `1`. +separator: The separator to use when joining. + +output: Has shape equal to that of the input with reduced dimensions removed or + set to `1` depending on `keep_dims`. +)doc"); + } // namespace tensorflow diff --git a/tensorflow/core/platform/default/thread_annotations.h b/tensorflow/core/platform/default/thread_annotations.h index 46143b2ea30..f3936d366de 100644 --- a/tensorflow/core/platform/default/thread_annotations.h +++ b/tensorflow/core/platform/default/thread_annotations.h @@ -157,11 +157,6 @@ limitations under the License. // annotations will be ignored by the analysis. #define TS_UNCHECKED(x) "" -// Disables warnings for a single read operation. This can be used to do racy -// reads of guarded data members, in cases where the race is benign. -#define TS_UNCHECKED_READ(x) \ - ::tensorflow::thread_safety_analysis::ts_unchecked_read(x) - namespace tensorflow { namespace thread_safety_analysis { diff --git a/tensorflow/core/platform/env.cc b/tensorflow/core/platform/env.cc index 3e0c51d599d..714b4511f89 100644 --- a/tensorflow/core/platform/env.cc +++ b/tensorflow/core/platform/env.cc @@ -29,30 +29,32 @@ class FileSystemRegistryImpl : public FileSystemRegistry { private: mutable mutex mu_; - mutable std::unordered_map<string, FileSystem*> registry_ GUARDED_BY(mu_); + mutable std::unordered_map<string, std::unique_ptr<FileSystem>> registry_ + GUARDED_BY(mu_); }; void FileSystemRegistryImpl::Register(const string& scheme, FileSystemRegistry::Factory factory) { mutex_lock lock(mu_); - QCHECK(!gtl::FindOrNull(registry_, scheme)) << "File factory for " << scheme - << " already registered"; - registry_[scheme] = factory(); + QCHECK( + registry_.emplace(string(scheme), std::unique_ptr<FileSystem>(factory())) + .second) + << "File factory for " << scheme << " already registered"; } FileSystem* FileSystemRegistryImpl::Lookup(const string& scheme) { mutex_lock lock(mu_); - auto fs_ptr = gtl::FindOrNull(registry_, scheme); - if (!fs_ptr) { + const auto found = registry_.find(scheme); + if (found == registry_.end()) { return nullptr; } - return *fs_ptr; + return found->second.get(); } Status FileSystemRegistryImpl::GetRegisteredFileSystemSchemes( std::vector<string>* schemes) { mutex_lock lock(mu_); - for (auto const e : registry_) { + for (const auto& e : registry_) { schemes->push_back(e.first); } return Status::OK(); @@ -60,8 +62,6 @@ Status FileSystemRegistryImpl::GetRegisteredFileSystemSchemes( Env::Env() : file_system_registry_(new FileSystemRegistryImpl) {} -Env::~Env() { delete file_system_registry_; } - Status Env::GetFileSystemForFile(const string& fname, FileSystem** result) { string scheme = GetSchemeFromURI(fname); FileSystem* file_system = file_system_registry_->Lookup(scheme); diff --git a/tensorflow/core/platform/env.h b/tensorflow/core/platform/env.h index 6527da97a95..1abe5cd2c0b 100644 --- a/tensorflow/core/platform/env.h +++ b/tensorflow/core/platform/env.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_CORE_PLATFORM_ENV_H_ #include <stdint.h> +#include <memory> #include <string> #include <unordered_map> #include <vector> @@ -45,7 +46,7 @@ struct ThreadOptions; class Env { public: Env(); - virtual ~Env(); + virtual ~Env() = default; /// \brief Returns a default environment suitable for the current operating /// system.
@@ -59,6 +60,8 @@ class Env { /// \brief Returns the FileSystem object to handle operations on the file /// specified by 'fname'. The FileSystem object is used as the implementation /// for the file system related (non-virtual) functions that follow. + /// The returned FileSystem object is still owned by the Env object and may + /// be destroyed when the environment is destroyed. virtual Status GetFileSystemForFile(const string& fname, FileSystem** result); /// \brief Returns the file system schemes registered for this Env. @@ -77,6 +80,10 @@ class Env { /// status. /// /// The returned file may be concurrently accessed by multiple threads. + /// + /// The ownership of the returned RandomAccessFile is passed to the caller + /// and the object should be deleted when it is no longer used. The file + /// object shouldn't live longer than the Env object. Status NewRandomAccessFile(const string& fname, RandomAccessFile** result); /// \brief Creates an object that writes to a new file with the specified @@ -88,6 +95,10 @@ /// returns non-OK. /// /// The returned file will only be accessed by one thread at a time. + /// + /// The ownership of the returned WritableFile is passed to the caller + /// and the object should be deleted when it is no longer used. The file + /// object shouldn't live longer than the Env object. Status NewWritableFile(const string& fname, WritableFile** result); /// \brief Creates an object that either appends to an existing file, or @@ -98,6 +109,10 @@ /// non-OK. /// /// The returned file will only be accessed by one thread at a time. + /// + /// The ownership of the returned WritableFile is passed to the caller + /// and the object should be deleted when it is no longer used. The file + /// object shouldn't live longer than the Env object. Status NewAppendableFile(const string& fname, WritableFile** result); /// \brief Creates a readonly region of memory with the file context. @@ -107,6 +122,10 @@ /// the caller. On failure stores nullptr in *result and returns non-OK. /// /// The returned memory region can be accessed from many threads in parallel. + /// + /// The ownership of the returned ReadOnlyMemoryRegion is passed to the caller + /// and the object should be deleted when it is no longer used. The memory + /// region object shouldn't live longer than the Env object. Status NewReadOnlyMemoryRegionFromFile(const string& fname, ReadOnlyMemoryRegion** result); @@ -192,7 +211,7 @@ Env(const Env&); void operator=(const Env&); - FileSystemRegistry* file_system_registry_; + std::unique_ptr<FileSystemRegistry> file_system_registry_; }; /// \brief An implementation of Env that forwards all calls to another Env. diff --git a/tensorflow/core/platform/file_system.h b/tensorflow/core/platform/file_system.h index ab26a226dcf..fc64ae17b82 100644 --- a/tensorflow/core/platform/file_system.h +++ b/tensorflow/core/platform/file_system.h @@ -25,7 +25,6 @@ limitations under the License.
#include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/platform/macros.h" -#include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/types.h" diff --git a/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py b/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py index 2ea188da878..31817ed4649 100644 --- a/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py +++ b/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py @@ -1,29 +1,25 @@ # Copyright 2015 Google Inc. All Rights Reserved. # -# Licensed under the Apache License, Version 2.0 (the "License"); +# Licensed under the Apache License, Version 2.0 (the 'License'); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, +# distributed under the License is distributed on an 'AS IS' BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""A very simple MNIST classifier, modified to display data in TensorBoard. +"""A simple MNIST classifier which displays summaries in TensorBoard. -See extensive documentation for the original model at -http://tensorflow.org/tutorials/mnist/beginners/index.md - -See documentation on the TensorBoard specific pieces at -http://tensorflow.org/how_tos/summaries_and_tensorboard/index.md - -If you modify this file, please update the excerpt in -how_tos/summaries_and_tensorboard/index.md. + This is an unimpressive MNIST model, but it is a good example of using +tf.name_scope to make a graph legible in the TensorBoard graph explorer, and of +naming summary tags so that they are grouped meaningfully in TensorBoard. +It demonstrates the functionality of every TensorBoard dashboard. """ from __future__ import absolute_import from __future__ import division @@ -39,72 +35,132 @@ FLAGS = flags.FLAGS flags.DEFINE_boolean('fake_data', False, 'If true, uses fake data ' 'for unit testing.') flags.DEFINE_integer('max_steps', 1000, 'Number of steps to run trainer.') -flags.DEFINE_float('learning_rate', 0.5, 'Initial learning rate.') +flags.DEFINE_float('learning_rate', 0.001, 'Initial learning rate.') +flags.DEFINE_float('dropout', 0.9, 'Keep probability for training dropout.') flags.DEFINE_string('data_dir', '/tmp/data', 'Directory for storing data') flags.DEFINE_string('summaries_dir', '/tmp/mnist_logs', 'Summaries directory') -def main(_): +def train(): # Import data mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True, fake_data=FLAGS.fake_data) sess = tf.InteractiveSession() - # Create the model - x = tf.placeholder(tf.float32, [None, 784], name='x-input') - W = tf.Variable(tf.zeros([784, 10]), name='weights') - b = tf.Variable(tf.zeros([10]), name='bias') + # Create a multilayer model. 
- # Use a name scope to organize nodes in the graph visualizer - with tf.name_scope('Wx_b'): - y = tf.nn.softmax(tf.matmul(x, W) + b) + # Input placeholders + with tf.name_scope('input'): + x = tf.placeholder(tf.float32, [None, 784], name='x-input') + image_shaped_input = tf.reshape(x, [-1, 28, 28, 1]) + tf.image_summary('input', image_shaped_input, 10) + y_ = tf.placeholder(tf.float32, [None, 10], name='y-input') + keep_prob = tf.placeholder(tf.float32) + tf.scalar_summary('dropout_keep_probability', keep_prob) - # Add summary ops to collect data - tf.histogram_summary('weights', W) - tf.histogram_summary('biases', b) - tf.histogram_summary('y', y) + # We can't initialize these variables to 0 - the network will get stuck. + def weight_variable(shape): + """Create a weight variable with appropriate initialization.""" + initial = tf.truncated_normal(shape, stddev=0.1) + return tf.Variable(initial) - # Define loss and optimizer - y_ = tf.placeholder(tf.float32, [None, 10], name='y-input') - # More name scopes will clean up the graph representation - with tf.name_scope('xent'): - cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1])) + def bias_variable(shape): + """Create a bias variable with appropriate initialization.""" + initial = tf.constant(0.1, shape=shape) + return tf.Variable(initial) + + def variable_summaries(var, name): + """Attach a lot of summaries to a Tensor.""" + with tf.name_scope('summaries'): + mean = tf.reduce_mean(var) + tf.scalar_summary('mean/' + name, mean) + with tf.name_scope('stddev'): + stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean))) + tf.scalar_summary('stddev/' + name, stddev) + tf.scalar_summary('max/' + name, tf.reduce_max(var)) + tf.scalar_summary('min/' + name, tf.reduce_min(var)) + tf.histogram_summary(name, var) + + def nn_layer(input_tensor, input_dim, output_dim, layer_name, act=tf.nn.relu): + """Reusable code for making a simple neural net layer. + + It does a matrix multiply, bias add, and then uses relu to nonlinearize. + It also sets up name scoping so that the resultant graph is easy to read, and + adds a number of summary ops. + """ + # Adding a name scope ensures logical grouping of the layers in the graph.
+ with tf.name_scope(layer_name): + # This Variable will hold the state of the weights for the layer + with tf.name_scope('weights'): + weights = weight_variable([input_dim, output_dim]) + variable_summaries(weights, layer_name + '/weights') + with tf.name_scope('biases'): + biases = bias_variable([output_dim]) + variable_summaries(biases, layer_name + '/biases') + with tf.name_scope('Wx_plus_b'): + preactivate = tf.matmul(input_tensor, weights) + biases + tf.histogram_summary(layer_name + '/pre_activations', preactivate) + activations = act(preactivate, 'activation') + tf.histogram_summary(layer_name + '/activations', activations) + return activations + + hidden1 = nn_layer(x, 784, 500, 'layer1') + dropped = tf.nn.dropout(hidden1, keep_prob) + y = nn_layer(dropped, 500, 10, 'layer2', act=tf.nn.softmax) + + + with tf.name_scope('cross_entropy'): + diff = y_ * tf.log(y) + with tf.name_scope('total'): + cross_entropy = -tf.reduce_mean(diff) tf.scalar_summary('cross entropy', cross_entropy) + with tf.name_scope('train'): - train_step = tf.train.GradientDescentOptimizer( + train_step = tf.train.AdamOptimizer( FLAGS.learning_rate).minimize(cross_entropy) - with tf.name_scope('test'): - correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1)) - accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) + with tf.name_scope('accuracy'): + with tf.name_scope('correct_prediction'): + correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1)) + with tf.name_scope('accuracy'): + accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) tf.scalar_summary('accuracy', accuracy) # Merge all the summaries and write them out to /tmp/mnist_logs (by default) merged = tf.merge_all_summaries() - writer = tf.train.SummaryWriter(FLAGS.summaries_dir, sess.graph) + train_writer = tf.train.SummaryWriter(FLAGS.summaries_dir + '/train', sess.graph) + test_writer = tf.train.SummaryWriter(FLAGS.summaries_dir + '/test') tf.initialize_all_variables().run() - # Train the model, and feed in test data and record summaries every 10 steps + # Train the model, and also write summaries. 
+ # Every 10th step, measure test-set accuracy, and write test summaries + # All other steps, run train_step on training data, and add training summaries + + def feed_dict(train): + """Make a TensorFlow feed_dict: maps data onto Tensor placeholders.""" + if train or FLAGS.fake_data: + xs, ys = mnist.train.next_batch(100, fake_data=FLAGS.fake_data) + k = FLAGS.dropout + else: + xs, ys = mnist.test.images, mnist.test.labels + k = 1.0 + return {x: xs, y_: ys, keep_prob: k} for i in range(FLAGS.max_steps): - if i % 10 == 0: # Record summary data and the accuracy - if FLAGS.fake_data: - batch_xs, batch_ys = mnist.train.next_batch( - 100, fake_data=FLAGS.fake_data) - feed = {x: batch_xs, y_: batch_ys} - else: - feed = {x: mnist.test.images, y_: mnist.test.labels} - result = sess.run([merged, accuracy], feed_dict=feed) - summary_str = result[0] - acc = result[1] - writer.add_summary(summary_str, i) + if i % 10 == 0: # Record summaries and test-set accuracy + summary, acc = sess.run([merged, accuracy], feed_dict=feed_dict(False)) + test_writer.add_summary(summary, i) print('Accuracy at step %s: %s' % (i, acc)) - else: - batch_xs, batch_ys = mnist.train.next_batch( - 100, fake_data=FLAGS.fake_data) - feed = {x: batch_xs, y_: batch_ys} - sess.run(train_step, feed_dict=feed) + else: # Record train-set summaries, and train + summary, _ = sess.run([merged, train_step], feed_dict=feed_dict(True)) + train_writer.add_summary(summary, i) + +def main(_): + if tf.gfile.Exists(FLAGS.summaries_dir): + tf.gfile.DeleteRecursively(FLAGS.summaries_dir) + tf.gfile.MakeDirs(FLAGS.summaries_dir) + train() if __name__ == '__main__': tf.app.run() diff --git a/tensorflow/g3doc/api_docs/python/client.md b/tensorflow/g3doc/api_docs/python/client.md index cdb9df53a55..4e0908ed5fb 100644 --- a/tensorflow/g3doc/api_docs/python/client.md +++ b/tensorflow/g3doc/api_docs/python/client.md @@ -117,6 +117,9 @@ method. A graph element can be one of the following types: the *i*th return value will be a [`SparseTensorValue`](../../api_docs/python/sparse_ops.md#SparseTensorValue) containing the value of that sparse tensor. +* If the *i*th element of `fetches` is produced by a `get_tensor_handle` op, + the *i*th return value will be a numpy ndarray containing the handle of + that tensor. The optional `feed_dict` argument allows the caller to override the value of tensors in the graph. Each key in `feed_dict` can be @@ -620,7 +623,7 @@ Creates an `AbortedError`. ### `class tf.errors.OutOfRangeError` {#OutOfRangeError} -Raised when an operation executed past the valid range. +Raised when an operation iterates past the valid input range. This exception is raised in "end-of-file" conditions, such as when a [`queue.dequeue()`](../../api_docs/python/io_ops.md#QueueBase.dequeue) diff --git a/tensorflow/g3doc/api_docs/python/control_flow_ops.md b/tensorflow/g3doc/api_docs/python/control_flow_ops.md index 9b51acf56eb..376d498adcc 100644 --- a/tensorflow/g3doc/api_docs/python/control_flow_ops.md +++ b/tensorflow/g3doc/api_docs/python/control_flow_ops.md @@ -175,7 +175,7 @@ the same non-zero number and type of outputs. y = tf.constant(5) def f1(): return tf.mul(x, 17) def f2(): return tf.add(y, 23) - r = cond(math_ops.less(x, y), f1, f2) + r = cond(tf.less(x, y), f1, f2) # r is set to f1(). # Operations in f2 (e.g., tf.add) are not executed. ``` @@ -259,6 +259,55 @@ Example 2: callable.
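The `cond` example just corrected above is worth spelling out in full: both branch callables must return the same number and types of outputs, and only the taken branch's ops execute. A runnable sketch (era API `tf.mul`, later renamed `tf.multiply`):

```python
import tensorflow as tf

x = tf.constant(2)
y = tf.constant(5)

# Both branches must return the same number and types of outputs.
def f1(): return tf.mul(x, 17)
def f2(): return tf.add(y, 23)

r = tf.cond(tf.less(x, y), f1, f2)

with tf.Session() as sess:
    print(sess.run(r))  # x < y, so f1 runs: 34
```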
+- - - + +### `tf.while_loop(cond, body, loop_vars, parallel_iterations=10, back_prop=True, swap_memory=False, name=None)` {#while_loop} + +Repeat `body` while the condition `cond` is true. + +`cond` is a callable taking a list of tensors and returning a boolean scalar +tensor. `body` is a callable taking a list of tensors and returning a list of +tensors of the same length and with the same types as the input. `loop_vars` +is a list of tensors that is passed to both `cond` and `body`. + +In addition to regular Tensors or IndexedSlices, the body may accept and +return TensorArray objects. The flows of the TensorArray objects will +be appropriately forwarded between loops and during gradient calculations. + +While `cond` evaluates to true, `body` is executed. + +##### Args: + + +* `cond`: The termination condition of the loop. +* `body`: A callable that represents the loop body. +* `loop_vars`: The list of variable input tensors. +* `parallel_iterations`: The number of iterations allowed to run in parallel. +* `back_prop`: Whether backprop is enabled for this while loop. +* `swap_memory`: Whether GPU-CPU memory swap is enabled for this loop. +* `name`: Optional name prefix for the returned tensors. + +##### Returns: + + The output tensors for the loop variables after the loop. + +##### Raises: + + +* `TypeError`: if `cond` or `body` is not callable. +* `ValueError`: if `loop_vars` is empty. + + +* `Example`: + + ```python + i = tf.constant(0) + c = lambda i: tf.less(i, 10) + b = lambda i: tf.add(i, 1) + r = tf.while_loop(c, b, [i]) + ``` + + ## Logical Operators diff --git a/tensorflow/g3doc/api_docs/python/histogram_ops.md b/tensorflow/g3doc/api_docs/python/histogram_ops.md index a307e5bfcf7..bf373a40a6d 100644 --- a/tensorflow/g3doc/api_docs/python/histogram_ops.md +++ b/tensorflow/g3doc/api_docs/python/histogram_ops.md @@ -32,6 +32,7 @@ equal width and determined by the arguments `value_range` and `nbins`.
* `Examples`: + +```python # Bins will be: (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf) nbins = 5 diff --git a/tensorflow/g3doc/api_docs/python/index.md b/tensorflow/g3doc/api_docs/python/index.md index 295f956da9f..bf7d9e38d8f 100644 --- a/tensorflow/g3doc/api_docs/python/index.md +++ b/tensorflow/g3doc/api_docs/python/index.md @@ -141,6 +141,8 @@ * [`batch_ifft3d`](../../api_docs/python/math_ops.md#batch_ifft3d) * [`batch_matmul`](../../api_docs/python/math_ops.md#batch_matmul) * [`batch_matrix_determinant`](../../api_docs/python/math_ops.md#batch_matrix_determinant) + * [`batch_matrix_diag`](../../api_docs/python/math_ops.md#batch_matrix_diag) + * [`batch_matrix_diag_part`](../../api_docs/python/math_ops.md#batch_matrix_diag_part) * [`batch_matrix_inverse`](../../api_docs/python/math_ops.md#batch_matrix_inverse) * [`batch_matrix_solve`](../../api_docs/python/math_ops.md#batch_matrix_solve) * [`batch_matrix_solve_ls`](../../api_docs/python/math_ops.md#batch_matrix_solve_ls) @@ -224,6 +226,10 @@ * [`unsorted_segment_sum`](../../api_docs/python/math_ops.md#unsorted_segment_sum) * [`where`](../../api_docs/python/math_ops.md#where) +* **[Strings](../../api_docs/python/string_ops.md)**: + * [`reduce_join`](../../api_docs/python/string_ops.md#reduce_join) + * [`string_to_hash_bucket`](../../api_docs/python/string_ops.md#string_to_hash_bucket) + * **[Histograms](../../api_docs/python/histogram_ops.md)**: * [`histogram_fixed_width`](../../api_docs/python/histogram_ops.md#histogram_fixed_width) @@ -255,6 +261,7 @@ * [`tuple`](../../api_docs/python/control_flow_ops.md#tuple) * [`verify_tensor_all_finite`](../../api_docs/python/control_flow_ops.md#verify_tensor_all_finite) * [`where`](../../api_docs/python/control_flow_ops.md#where) + * [`while_loop`](../../api_docs/python/control_flow_ops.md#while_loop) * **[Higher Order Functions](../../api_docs/python/functional_ops.md)**: * [`foldl`](../../api_docs/python/functional_ops.md#foldl) @@ -262,6 +269,11 @@ * [`map_fn`](../../api_docs/python/functional_ops.md#map_fn) * [`scan`](../../api_docs/python/functional_ops.md#scan) +* **[Tensor Handle Operations](../../api_docs/python/session_ops.md)**: + * [`delete_session_tensor`](../../api_docs/python/session_ops.md#delete_session_tensor) + * [`get_session_handle`](../../api_docs/python/session_ops.md#get_session_handle) + * [`get_session_tensor`](../../api_docs/python/session_ops.md#get_session_tensor) + * **[Images](../../api_docs/python/image.md)**: * [`adjust_brightness`](../../api_docs/python/image.md#adjust_brightness) * [`adjust_contrast`](../../api_docs/python/image.md#adjust_contrast) diff --git a/tensorflow/g3doc/api_docs/python/math_ops.md b/tensorflow/g3doc/api_docs/python/math_ops.md index 403621e3105..627b6fa5d03 100644 --- a/tensorflow/g3doc/api_docs/python/math_ops.md +++ b/tensorflow/g3doc/api_docs/python/math_ops.md @@ -741,6 +741,101 @@ Gamma function. TensorFlow provides several operations that you can use to add basic mathematical functions for matrices to your graph. +- - - + +### `tf.batch_matrix_diag(diagonal, name=None)` {#batch_matrix_diag} + +Returns a batched diagonal tensor with given batched diagonal values. + +Given a `diagonal`, this operation returns a tensor with the `diagonal` and +everything else padded with zeros. The diagonal is computed as follows: + +Assume `diagonal` has `k` dimensions `[I, J, K, ..., N]`, then the output is a +tensor of rank `k+1` with dimensions `[I, J, K, ..., N, N]` where: + +`output[i, j, k, ..., m, n] = 1{m=n} * diagonal[i, j, k, ..., n]`.
+ +For example: + +```prettyprint +# 'diagonal' is [[1, 2, 3, 4], [5, 6, 7, 8]] + +and diagonal.shape = (2, 4) + +tf.batch_matrix_diag(diagonal) ==> [[[1, 0, 0, 0] + [0, 2, 0, 0] + [0, 0, 3, 0] + [0, 0, 0, 4]], + [[5, 0, 0, 0] + [0, 6, 0, 0] + [0, 0, 7, 0] + [0, 0, 0, 8]]] + +which has shape (2, 4, 4) +``` + +##### Args: + + +* `diagonal`: A `Tensor`. Rank `k`, where `k >= 1`. +* `name`: A name for the operation (optional). + +##### Returns: + + A `Tensor`. Has the same type as `diagonal`. + Rank `k+1`, with `output.shape = diagonal.shape + [diagonal.shape[-1]]`. + + +- - - + +### `tf.batch_matrix_diag_part(input, name=None)` {#batch_matrix_diag_part} + +Returns the batched diagonal part of a batched tensor. + +This operation returns a tensor with the `diagonal` part +of the batched `input`. The `diagonal` part is computed as follows: + +Assume `input` has `k` dimensions `[I, J, K, ..., N, N]`, then the output is a +tensor of rank `k - 1` with dimensions `[I, J, K, ..., N]` where: + +`diagonal[i, j, k, ..., n] = input[i, j, k, ..., n, n]`. + +The input must be at least a matrix. + +For example: + +```prettyprint +# 'input' is [[[1, 0, 0, 0] + [0, 2, 0, 0] + [0, 0, 3, 0] + [0, 0, 0, 4]], + [[5, 0, 0, 0] + [0, 6, 0, 0] + [0, 0, 7, 0] + [0, 0, 0, 8]]] + +and input.shape = (2, 4, 4) + +tf.batch_matrix_diag_part(input) ==> [[1, 2, 3, 4], [5, 6, 7, 8]] + +which has shape (2, 4) +``` + +##### Args: + + +* `input`: A `Tensor`. + Rank `k` tensor where `k >= 2` and the last two dimensions are equal. +* `name`: A name for the operation (optional). + +##### Returns: + + A `Tensor`. Has the same type as `input`. + The extracted diagonal(s) having shape + `diagonal.shape = input.shape[:-1]`. + + + - - - ### `tf.diag(diagonal, name=None)` {#diag} @@ -1192,7 +1287,7 @@ eigenvalues, and subsequent [...,1:, :] containing the eigenvectors. - - - -### `tf.matrix_solve(matrix, rhs, name=None)` {#matrix_solve} +### `tf.matrix_solve(matrix, rhs, adjoint=None, name=None)` {#matrix_solve} Solves a system of linear equations. Checks for invertibility. @@ -1202,25 +1297,30 @@ Solves a system of linear equations. Checks for invertibility. * `matrix`: A `Tensor`. Must be one of the following types: `float32`, `float64`. Shape is `[M, M]`. * `rhs`: A `Tensor`. Must have the same type as `matrix`. Shape is `[M, K]`. +* `adjoint`: An optional `bool`. Defaults to `False`. + Boolean indicating whether to solve with `matrix` or its adjoint. * `name`: A name for the operation (optional). ##### Returns: A `Tensor`. Has the same type as `matrix`. - Shape is `[M, K]` containing the tensor that solves - matrix * output = rhs. + Shape is `[M, K]`. If `adjoint` is `False` then `output` solves +`matrix` * `output` = `rhs`. If `adjoint` is `True` then `output` solves +`adjoint(matrix)` * `output` = `rhs`. - - - -### `tf.batch_matrix_solve(matrix, rhs, name=None)` {#batch_matrix_solve} +### `tf.batch_matrix_solve(matrix, rhs, adjoint=None, name=None)` {#batch_matrix_solve} Solves systems of linear equations. Checks for invertibility. Matrix is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form square matrices. Rhs is a tensor of shape -`[..., M, K]`. The output is a tensor shape `[..., M, K]` where each output -matrix satisfies matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]. +`[..., M, K]`. The output is a tensor of shape `[..., M, K]`. If `adjoint` is `False` then each output +matrix satisfies `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
+If `adjoint` is `True` then each output +matrix satisfies `adjoint(matrix[..., :, :]) * output[..., :, :] = rhs[..., :, :]`. ##### Args: @@ -1229,6 +1329,9 @@ matrix satisfies matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]. Shape is `[..., M, M]`. * `rhs`: A `Tensor`. Must have the same type as `matrix`. Shape is `[..., M, K]`. +* `adjoint`: An optional `bool`. Defaults to `False`. + Boolean indicating whether to solve with `matrix` or its (block-wise) + adjoint. * `name`: A name for the operation (optional). ##### Returns: @@ -1239,21 +1342,24 @@ matrix satisfies matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]. - - - -### `tf.matrix_triangular_solve(matrix, rhs, lower=None, name=None)` {#matrix_triangular_solve} +### `tf.matrix_triangular_solve(matrix, rhs, lower=None, adjoint=None, name=None)` {#matrix_triangular_solve} Solves a system of linear equations with an upper or lower triangular matrix by backsubstitution. `matrix` is a matrix of shape `[M, M]`. If `lower` is `True` then the strictly -upper triangular part of `matrix` is ignored. If `lower` is False then the -strictly lower triangular part of `matrix` is ignored. `rhs` is a matrix of -shape [M, K]`. +upper triangular part of `matrix` is assumed to be zero and not accessed. +If `lower` is False then the strictly lower triangular part of `matrix` is +assumed to be zero and not accessed. +`rhs` is a matrix of shape `[M, K]`. -The output is a matrix of shape `[M, K]`. If `lower` is `True` then the output -satisfies \\(\sum_{k=0}^{i}\\) matrix[i, k] * output[k, j] = rhs[i, j]. -If `lower` is false then output satisfies -\\(\sum_{k=i}^{K-1}\\) matrix[i, k] * output[k, j] = rhs[i, j]. +The output is a matrix of shape `[M, K]`. +If `adjoint` is `False` then `output` satisfies the matrix equation +`matrix` * `output` = `rhs`. +If `adjoint` is `True` then `output` satisfies the matrix equation +`adjoint(matrix)` * `output` = `rhs`. ##### Args: @@ -1262,7 +1368,9 @@ If `lower` is false then output satisfies Shape is `[M, M]`. * `rhs`: A `Tensor`. Must have the same type as `matrix`. Shape is `[M, K]`. * `lower`: An optional `bool`. Defaults to `True`. - Boolean indicating whether matrix is lower or upper triangular. + Boolean indicating whether `matrix` is lower or upper triangular. +* `adjoint`: An optional `bool`. Defaults to `False`. + Boolean indicating whether to solve with `matrix` or its adjoint. * `name`: A name for the operation (optional). ##### Returns: @@ -1272,7 +1380,7 @@ If `lower` is false then output satisfies - - - -### `tf.batch_matrix_triangular_solve(matrix, rhs, lower=None, name=None)` {#batch_matrix_triangular_solve} +### `tf.batch_matrix_triangular_solve(matrix, rhs, lower=None, adjoint=None, name=None)` {#batch_matrix_triangular_solve} Solves systems of linear equations with upper or lower triangular matrices by @@ -1280,15 +1388,17 @@ backsubstitution. `matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form square matrices. If `lower` is `True` then the strictly upper triangular part -of each inner-most matrix is ignored. If `lower` is False then the strictly -lower triangular part of each inner-most matrix is ignored. `rhs` is a tensor -of shape [..., M, K]`. +of each inner-most matrix is assumed to be zero and not accessed. +If `lower` is False then the strictly lower triangular part of each inner-most +matrix is assumed to be zero and not accessed.
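+
+(A minimal runnable sketch of the new `adjoint` flag, shown with
+`tf.matrix_solve`; the batch and triangular variants take the same flag. The
+NumPy cross-check is an illustration, not part of the op:)
+
+```python
+import numpy as np
+import tensorflow as tf
+
+a = np.array([[2.0, 0.0],
+              [1.0, 3.0]])
+rhs = np.array([[1.0],
+                [2.0]])
+with tf.Session():
+  # Solve adjoint(a) * x = rhs without transposing `a` by hand.
+  x = tf.matrix_solve(a, rhs, adjoint=True).eval()
+# For real matrices the adjoint is just the transpose.
+assert np.allclose(a.T.dot(x), rhs)
+```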
+`rhs` is a tensor of shape `[..., M, K]`. -The output is a tensor of shape `[..., M, K]`. If `lower` is `True` then the -output satisfies -\\(\sum_{k=0}^{i}\\) matrix[..., i, k] * output[..., k, j] = rhs[..., i, j]. -If `lower` is false then the strictly then the output satisfies -\\(sum_{k=i}^{K-1}\\) matrix[..., i, k] * output[..., k, j] = rhs[..., i, j]. +The output is a tensor of shape `[..., M, K]`. If `adjoint` is `False` then the +innermost matrices in `output` satisfy the matrix equations +`matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`. +If `adjoint` is `True` then the innermost matrices in +`output` satisfy the matrix equations +`adjoint(matrix[..., :, :]) * output[..., :, :] = rhs[..., :, :]`. ##### Args: @@ -1298,7 +1408,11 @@ If `lower` is false then the strictly then the output satisfies * `rhs`: A `Tensor`. Must have the same type as `matrix`. Shape is `[..., M, K]`. * `lower`: An optional `bool`. Defaults to `True`. - Boolean indicating whether matrix is lower or upper triangular. + Boolean indicating whether the innermost matrices in `matrix` are + lower or upper triangular. +* `adjoint`: An optional `bool`. Defaults to `False`. + Boolean indicating whether to solve with `matrix` or its (block-wise) + adjoint. * `name`: A name for the operation (optional). ##### Returns: diff --git a/tensorflow/g3doc/api_docs/python/session_ops.md b/tensorflow/g3doc/api_docs/python/session_ops.md new file mode 100644 index 00000000000..388c2cb81b3 --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/session_ops.md @@ -0,0 +1,102 @@ + + +# Tensor Handle Operations + +Note: Functions taking `Tensor` arguments can also take anything accepted by +[`tf.convert_to_tensor`](framework.md#convert_to_tensor). + +[TOC] + +## Tensor Handle Operations + +TensorFlow provides several operators that allow the user to keep tensors +"in-place" across run calls. + +- - - + +### `tf.get_session_handle(data, name=None)` {#get_session_handle} + +Return the handle of `data`. + +This is EXPERIMENTAL and subject to change. + +Keep `data` "in-place" in the runtime and create a handle that can be +used to retrieve `data` in a subsequent run(). + +Combined with `get_session_tensor`, we can keep a tensor produced in +one run call in place, and use it as the input in a future run call. +Below is a simple example: + +```python +c = tf.mul(a, b) +h = tf.get_session_handle(c) +h = sess.run(h) + +p, a = tf.get_session_tensor(tf.float32) +b = tf.mul(a, 10) +c = sess.run(b, feed_dict={p: h.handle}) +``` + +##### Args: + + +* `data`: A tensor to be stored in the session. +* `name`: Optional name prefix for the return tensor. + +##### Returns: + + A scalar string tensor representing a unique handle for `data`. + +##### Raises: + + +* `TypeError`: if `data` is not a Tensor. + + +- - - + +### `tf.get_session_tensor(dtype, name=None)` {#get_session_tensor} + +Get the tensor of type `dtype` by feeding a tensor handle. + +This is EXPERIMENTAL and subject to change. + +Get the value of the tensor from a tensor handle. The tensor +is produced in a previous run() and stored in the state of the +session. + +##### Args: + + +* `dtype`: The type of the output tensor. +* `name`: Optional name prefix for the return tensor. + +##### Returns: + + A pair of tensors. The first is a placeholder for feeding a + tensor handle and the second is the tensor in the session state + keyed by the tensor handle.
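+
+Putting the handle ops together, here is a minimal end-to-end sketch (the
+constants and `tf.mul` stand in for real work; `delete_session_tensor` is
+documented below):
+
+```python
+import tensorflow as tf
+
+sess = tf.Session()
+c = tf.mul(tf.constant(2.0), tf.constant(3.0))
+
+# First run: keep the result in the session and fetch only its handle.
+h = sess.run(tf.get_session_handle(c))
+
+# Later run: feed the handle back in place of the value.
+p, x = tf.get_session_tensor(tf.float32)
+y = tf.mul(x, 10.0)
+print(sess.run(y, feed_dict={p: h.handle}))  # ==> 60.0
+
+# Release the stored tensor once it is no longer needed.
+holder, deleter = tf.delete_session_tensor()
+sess.run(deleter, feed_dict={holder: h.handle})
+```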
+ + +- - - + +### `tf.delete_session_tensor(name=None)` {#delete_session_tensor} + +Delete the tensor by feeding a tensor handle. + +This is EXPERIMENTAL and subject to change. + +Delete the tensor of a given tensor handle. The tensor is produced +in a previous run() and stored in the state of the session. + +##### Args: + + +* `name`: Optional name prefix for the return tensor. + +##### Returns: + + A pair of graph elements. The first is a placeholder for feeding a + tensor handle and the second is a deletion operation. + + diff --git a/tensorflow/g3doc/api_docs/python/state_ops.md b/tensorflow/g3doc/api_docs/python/state_ops.md index 6e43a50045f..172c4785007 100644 --- a/tensorflow/g3doc/api_docs/python/state_ops.md +++ b/tensorflow/g3doc/api_docs/python/state_ops.md @@ -781,7 +781,7 @@ checkpoints per device. - - - -#### `tf.train.Saver.save(sess, save_path, global_step=None, latest_filename=None, meta_graph_suffix='meta')` {#Saver.save} +#### `tf.train.Saver.save(sess, save_path, global_step=None, latest_filename=None, meta_graph_suffix='meta', write_meta_graph=True)` {#Saver.save} Saves variables. @@ -807,6 +807,8 @@ path can be passed directly to a call to `restore()`. managed by the saver to keep track of recent checkpoints. Defaults to 'checkpoint'. * `meta_graph_suffix`: Suffix for `MetaGraphDef` file. Defaults to 'meta'. +* `write_meta_graph`: `Boolean` indicating whether or not to write the meta + graph file. ##### Returns: diff --git a/tensorflow/g3doc/api_docs/python/string_ops.md b/tensorflow/g3doc/api_docs/python/string_ops.md new file mode 100644 index 00000000000..c3d275ac6d4 --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/string_ops.md @@ -0,0 +1,96 @@ + + +# Strings + +Note: Functions taking `Tensor` arguments can also take anything accepted by +[`tf.convert_to_tensor`](framework.md#convert_to_tensor). + +[TOC] + +## Hashing + +String hashing ops take a string input tensor and map each element to an +integer. + +- - - + +### `tf.string_to_hash_bucket(string_tensor, num_buckets, name=None)` {#string_to_hash_bucket} + +Converts each string in the input Tensor to its hash modulo the number of buckets. + +The hash function is deterministic on the content of the string within the +process. + +Note that the hash function may change from time to time. + +##### Args: + + +* `string_tensor`: A `Tensor` of type `string`. +* `num_buckets`: An `int` that is `>= 1`. The number of buckets. +* `name`: A name for the operation (optional). + +##### Returns: + + A `Tensor` of type `int64`. + A Tensor of the same shape as the input `string_tensor`. + + + +## Joining + +String joining ops concatenate elements of input string tensors to produce a new +string tensor. + +- - - + +### `tf.reduce_join(inputs, reduction_indices, keep_dims=None, separator=None, name=None)` {#reduce_join} + +Joins a string Tensor across the given dimensions. + +Computes the string join across dimensions in the given string Tensor of shape +`[d_0, d_1, ..., d_n-1]`. Returns a new Tensor created by joining the input +strings with the given separator (default: empty string). Negative indices are +counted backwards from the end, with `-1` being equivalent to `n - 1`. Passing +an empty `reduction_indices` joins all strings in linear index order and outputs +a scalar string.
+ + +For example: +``` +# tensor `a` is [["a", "b"], ["c", "d"]] +tf.reduce_join(a, 0) ==> ["ac", "bd"] +tf.reduce_join(a, 1) ==> ["ab", "cd"] +tf.reduce_join(a, -2) = tf.reduce_join(a, 0) ==> ["ac", "bd"] +tf.reduce_join(a, -1) = tf.reduce_join(a, 1) ==> ["ab", "cd"] +tf.reduce_join(a, 0, keep_dims=True) ==> [["ac", "bd"]] +tf.reduce_join(a, 1, keep_dims=True) ==> [["ab"], ["cd"]] +tf.reduce_join(a, 0, separator=".") ==> ["a.c", "b.d"] +tf.reduce_join(a, [0, 1]) ==> ["acbd"] +tf.reduce_join(a, [1, 0]) ==> ["abcd"] +tf.reduce_join(a, []) ==> ["abcd"] +``` + +##### Args: + + +* `inputs`: A `Tensor` of type `string`. + The input to be joined. All reduced indices must have non-zero size. +* `reduction_indices`: A `Tensor` of type `int32`. + The dimensions to reduce over. Dimensions are reduced in the + order specified. If `reduction_indices` has higher rank than `1`, it is + flattened. Omitting `reduction_indices` is equivalent to passing + `[n-1, n-2, ..., 0]`. Negative indices from `-n` to `-1` are supported. +* `keep_dims`: An optional `bool`. Defaults to `False`. + If `True`, retain reduced dimensions with length `1`. +* `separator`: An optional `string`. Defaults to `""`. + The separator to use when joining. +* `name`: A name for the operation (optional). + +##### Returns: + + A `Tensor` of type `string`. + Has shape equal to that of the input with reduced dimensions removed or + set to `1` depending on `keep_dims`. + + diff --git a/tensorflow/g3doc/api_docs/python/train.md b/tensorflow/g3doc/api_docs/python/train.md index 7d01698ba0c..24b9f9f8142 100644 --- a/tensorflow/g3doc/api_docs/python/train.md +++ b/tensorflow/g3doc/api_docs/python/train.md @@ -1558,7 +1558,7 @@ communicate with any other server in the same cluster. Creates a new server with the given definition. The `job_name`, `task_index`, and `protocol` arguments are optional, and -override any information also provided in `server_or_cluster_def`. +override any information provided in `server_or_cluster_def`. ##### Args: @@ -1567,13 +1567,15 @@ override any information also provided in `server_or_cluster_def`. `tf.train.ClusterDef` protocol buffer, or a `tf.train.ClusterSpec` object, describing the server to be created and/or the cluster of which it is a member. -* `job_name`: (Optional.) If not specified in `server_or_cluster_def`, - specifies the name of the job of which this server is a member. -* `task_index`: (Optional.) If not specified in `server_or_cluster_def`, - specifies the task index of this server in its job. -* `protocol`: (Optional.) If not specified in `server_or_cluster_def`, - specifies the protocol to be used by this server. Acceptable - values include `"grpc"`. +* `job_name`: (Optional.) Specifies the name of the job of which the server + is a member. Defaults to the value in `server_or_cluster_def`, if + specified. +* `task_index`: (Optional.) Specifies the task index of the server in its + job. Defaults to the value in `server_or_cluster_def`, if specified. + Otherwise defaults to 0 if the server's job has only one task. +* `protocol`: (Optional.) Specifies the protocol to be used by the server. + Acceptable values include `"grpc"`. Defaults to the value in + `server_or_cluster_def`, if specified. Otherwise defaults to `"grpc"`. * `start`: (Optional.) Boolean, indicating whether to start the server after creating it. Defaults to `True`. @@ -2677,7 +2679,7 @@ Returns a list of tasks in the given job. ##### Returns: A list of strings, corresponding to the network addresses of tasks in - the given job. 
+ the given job, ordered by task index. ##### Raises: @@ -2852,7 +2854,7 @@ The generated [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto) has one summary value containing a histogram for `values`. -This op reports an `OutOfRange` error if any value is not finite. +This op reports an `InvalidArgument` error if any value is not finite. ##### Args: diff --git a/tensorflow/g3doc/how_tos/summaries_and_tensorboard/index.md b/tensorflow/g3doc/how_tos/summaries_and_tensorboard/index.md index 39a5661f24d..45ee082e680 100644 --- a/tensorflow/g3doc/how_tos/summaries_and_tensorboard/index.md +++ b/tensorflow/g3doc/how_tos/summaries_and_tensorboard/index.md @@ -8,7 +8,8 @@ your TensorFlow graph, plot quantitative metrics about the execution of your graph, and show additional data like images that pass through it. When TensorBoard is fully configured, it looks like this: -![MNIST TensorBoard](../../images/mnist_tensorboard.png "MNIST TensorBoard") +[![MNIST TensorBoard](../../images/mnist_tensorboard.png "MNIST TensorBoard")](http://tensorflow.org/tensorboard) +[*Click to try a TensorBoard with data from this tutorial!*](http://tensorflow.org/tensorboard) ## Serializing the data @@ -75,56 +76,70 @@ statistics, such as how the weights or accuracy varied during training. The code below is an excerpt; full source is [here](https://www.tensorflow.org/code/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py). ```python -# Create the model -x = tf.placeholder(tf.float32, [None, 784], name="x-input") -W = tf.Variable(tf.zeros([784,10]), name="weights") -b = tf.Variable(tf.zeros([10], name="bias")) +def variable_summaries(var, name): + with tf.name_scope("summaries"): + mean = tf.reduce_mean(var) + tf.scalar_summary('mean/' + name, mean) + with tf.name_scope('stddev'): + stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean))) + tf.scalar_summary('stddev/' + name, stddev) + tf.scalar_summary('max/' + name, tf.reduce_max(var)) + tf.scalar_summary('min/' + name, tf.reduce_min(var)) + tf.histogram_summary(name, var) -# use a name scope to organize nodes in the graph visualizer -with tf.name_scope("Wx_b") as scope: - y = tf.nn.softmax(tf.matmul(x,W) + b) +def nn_layer(input_tensor, input_dim, output_dim, layer_name): + """Reusable code for making a simple neural net layer. -# Add summary ops to collect data -tf.histogram_summary("weights", W) -tf.histogram_summary("biases", b) -tf.histogram_summary("y", y) + It does a matrix multiply, bias add, and then uses relu to nonlinearize. + It also sets up name scoping so that the resultant graph is easy to read, and + adds a number of summary ops. + """ + # Adding a name scope ensures logical grouping of the layers in the graph.
+ with tf.name_scope(layer_name): + # This Variable will hold the state of the weights for the layer + with tf.name_scope("weights"): + weights = weight_variable([input_dim, output_dim]) + variable_summaries(weights, layer_name + '/weights') + with tf.name_scope("biases"): + biases = bias_variable([output_dim]) + variable_summaries(biases, layer_name + '/biases') + with tf.name_scope('Wx_plus_b'): + activations = tf.matmul(input_tensor, weights) + biases + tf.histogram_summary(layer_name + '/activations', activations) + relu = tf.nn.relu(activations, 'relu') + tf.histogram_summary(layer_name + '/activations_relu', relu) + return tf.nn.dropout(relu, keep_prob) -# Define loss and optimizer -y_ = tf.placeholder(tf.float32, [None,10], name="y-input") -# More name scopes will clean up the graph representation -with tf.name_scope("xent") as scope: - cross_entropy = -tf.reduce_sum(y_*tf.log(y)) - tf.scalar_summary("cross entropy", cross_entropy) -with tf.name_scope("train") as scope: - train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy) +layer1 = nn_layer(x, 784, 50, 'layer1') +layer2 = nn_layer(layer1, 50, 10, 'layer2') +y = tf.nn.softmax(layer2, 'predictions') -with tf.name_scope("test") as scope: - correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1)) - accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) - tf.scalar_summary("accuracy", accuracy) -# Merge all the summaries and write them out to /tmp/mnist_logs +with tf.name_scope('cross_entropy'): + diff = y_ * tf.log(y) + with tf.name_scope('total'): + cross_entropy = -tf.reduce_sum(diff) + with tf.name_scope('normalized'): + normalized_cross_entropy = -tf.reduce_mean(diff) + tf.scalar_summary('cross entropy', normalized_cross_entropy) + +with tf.name_scope('train'): + train_step = tf.train.AdamOptimizer( + FLAGS.learning_rate).minimize(cross_entropy) + +with tf.name_scope('accuracy'): + with tf.name_scope('correct_prediction'): + correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1)) + with tf.name_scope('accuracy'): + accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) + tf.scalar_summary('accuracy', accuracy) + +# Merge all the summaries and write them out to /tmp/mnist_logs (by default) merged = tf.merge_all_summaries() -writer = tf.train.SummaryWriter("/tmp/mnist_logs", sess.graph) +train_writer = tf.train.SummaryWriter(FLAGS.summaries_dir + '/train', sess.graph) +test_writer = tf.train.SummaryWriter(FLAGS.summaries_dir + '/test') tf.initialize_all_variables().run() -# Train the model, and feed in test data and record summaries every 10 steps - -for i in range(1000): - if i % 10 == 0: # Record summary data, and the accuracy - feed = {x: mnist.test.images, y_: mnist.test.labels} - result = sess.run([merged, accuracy], feed_dict=feed) - summary_str = result[0] - acc = result[1] - writer.add_summary(summary_str, i) - print("Accuracy at step %s: %s" % (i, acc)) - else: - batch_xs, batch_ys = mnist.train.next_batch(100) - feed = {x: batch_xs, y_: batch_ys} - sess.run(train_step, feed_dict=feed) - -print(accuracy.eval({x: mnist.test.images, y_: mnist.test.labels})) - ``` You're now all set to visualize this data using TensorBoard. @@ -135,7 +150,7 @@ You're now all set to visualize this data using TensorBoard. 
To run TensorBoard, use the command ```bash -python tensorflow/tensorboard/tensorboard.py --logdir=path/to/log-directory +tensorboard --logdir=path/to/log-directory ``` where `logdir` points to the directory where the `SummaryWriter` serialized its @@ -144,18 +159,8 @@ serialized data from separate runs, then TensorBoard will visualize the data from all of those runs. Once TensorBoard is running, navigate your web browser to `localhost:6006` to view the TensorBoard. -If you have pip installed TensorFlow, `tensorboard` is installed into -the system path, so you can use the simpler command - -```bash -tensorboard --logdir=/path/to/log-directory -``` - When looking at TensorBoard, you will see the navigation tabs in the top right corner. Each tab represents a set of serialized data that can be visualized. -For any tab you are looking at, if the logs being looked at by TensorBoard do -not contain any data relevant to that tab, a message will be displayed -indicating how to serialize data that is applicable to that tab. For in depth information on how to use the *graph* tab to visualize your graph, see [TensorBoard: Graph Visualization](../../how_tos/graph_viz/index.md). diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index 958c499159d..a8ac2c82091 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -575,6 +575,9 @@ tf_gen_op_wrapper_py( "TensorArraySplit", "TensorArrayUnpack", "TensorArrayWrite", + "GetSessionHandle", + "GetSessionTensor", + "DeleteSessionTensor", ], require_shape_functions = True, ) @@ -810,6 +813,7 @@ py_library( "ops/rnn_cell.py", "ops/script_ops.py", "ops/seq2seq.py", + "ops/session_ops.py", "ops/sparse_grad.py", "ops/sparse_ops.py", "ops/standard_ops.py", diff --git a/tensorflow/python/__init__.py b/tensorflow/python/__init__.py index 4318f51bff5..4306d38e69f 100644 --- a/tensorflow/python/__init__.py +++ b/tensorflow/python/__init__.py @@ -106,8 +106,10 @@ from tensorflow.python.ops import histogram_ops from tensorflow.python.ops import io_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import script_ops +from tensorflow.python.ops import session_ops from tensorflow.python.ops import sparse_ops from tensorflow.python.ops import state_ops +from tensorflow.python.ops import string_ops # Don't export modules except for the few we really want @@ -120,7 +122,8 @@ _whitelist = set([app, compat, contrib, errors, flags, gfile, image, __all__ = make_all(__name__, [framework_lib, array_ops, client_lib, constant_op, control_flow_ops, functional_ops, histogram_ops, io_ops, - math_ops, nn, script_ops, sparse_ops, state_ops, train]) + math_ops, nn, script_ops, session_ops, sparse_ops, + state_ops, string_ops, train]) # Symbols whitelisted for export without documentation. 
# TODO(cwhipkey): review these and move to contrib, expose through @@ -167,7 +170,6 @@ __all__.extend([ 'sparse_matmul', 'sparse_segment_mean_grad', 'sparse_segment_sqrt_n_grad', - 'string_to_hash_bucket', 'unique_with_counts', 'user_ops', ]) diff --git a/tensorflow/python/client/session.py b/tensorflow/python/client/session.py index a9bfdb63c0b..817965f9924 100644 --- a/tensorflow/python/client/session.py +++ b/tensorflow/python/client/session.py @@ -27,6 +27,7 @@ import numpy as np from tensorflow.python import pywrap_tensorflow as tf_session from tensorflow.python.framework import errors from tensorflow.python.framework import ops +from tensorflow.python.ops import session_ops from tensorflow.python.platform import logging from tensorflow.python.util import compat @@ -99,6 +100,9 @@ class BaseSession(SessionInterface): self._extend_lock = threading.Lock() self._target = target + self._delete_lock = threading.Lock() + self._dead_handles = [] + self._session = None opts = tf_session.TF_NewSessionOptions(target=target, config=config) @@ -277,6 +281,9 @@ class BaseSession(SessionInterface): the *i*th return value will be a [`SparseTensorValue`](../../api_docs/python/sparse_ops.md#SparseTensorValue) containing the value of that sparse tensor. + * If the *i*th element of `fetches` is produced by a `get_session_handle` op, + the *i*th return value will be a numpy ndarray containing the handle of + that tensor. The optional `feed_dict` argument allows the caller to override the value of tensors in the graph. Each key in `feed_dict` can be @@ -350,17 +357,22 @@ class BaseSession(SessionInterface): list of feeds and fetches that will be used in the subsequent `partial_run` calls. + The optional `feed_dict` argument allows the caller to override + the value of tensors in the graph. See run() for more information. + Below is a simple example: - a = array_ops.placeholder(dtypes.float32, shape=[]) - b = array_ops.placeholder(dtypes.float32, shape=[]) - c = array_ops.placeholder(dtypes.float32, shape=[]) - r1 = math_ops.add(a, b) - r2 = math_ops.mul(r1, c) + ```python + a = array_ops.placeholder(dtypes.float32, shape=[]) + b = array_ops.placeholder(dtypes.float32, shape=[]) + c = array_ops.placeholder(dtypes.float32, shape=[]) + r1 = math_ops.add(a, b) + r2 = math_ops.mul(r1, c) - h = sess.partial_run_setup([r1, r2], [a, b, c]) - res = sess.partial_run(h, r1, feed_dict={a: 1, b: 2}) - res = sess.partial_run(h, r2, feed_dict={c: res}) + h = sess.partial_run_setup([r1, r2], [a, b, c]) + res = sess.partial_run(h, r1, feed_dict={a: 1, b: 2}) + res = sess.partial_run(h, r2, feed_dict={c: res}) + ``` Args: handle: A handle for a sequence of partial runs. @@ -410,7 +422,7 @@ class BaseSession(SessionInterface): 'graph before calling run().') # Validate and process fetches. - unique_fetches, target_list, _ = self._process_fetches(fetches) + unique_fetches, target_list, _, _ = self._process_fetches(fetches) # Create request.
feed_list = [] @@ -455,6 +467,7 @@ class BaseSession(SessionInterface): fetches = [fetches] unique_fetch_targets = set() + unique_fetch_handles = {} target_list = [] fetch_info = [] @@ -465,10 +478,15 @@ try: fetch_t = self.graph.as_graph_element(subfetch, allow_tensor=True, allow_operation=True) + fetch_name = compat.as_bytes(fetch_t.name) if isinstance(fetch_t, ops.Operation): - target_list.append(compat.as_bytes(fetch_t.name)) + target_list.append(fetch_name) else: - subfetch_names.append(compat.as_bytes(fetch_t.name)) + subfetch_names.append(fetch_name) + # Remember the fetch if it is for a tensor handle. + if (isinstance(fetch_t, ops.Tensor) and + fetch_t.op.type == 'GetSessionHandle'): + unique_fetch_handles[fetch_name] = fetch_t.op.inputs[0].dtype except TypeError as e: raise TypeError('Fetch argument %r of %r has invalid type %r, ' 'must be a string or Tensor. (%s)' @@ -483,7 +501,7 @@ fetch_info.append((subfetch_names, fetch_contraction_fn)) unique_fetch_targets = list(unique_fetch_targets) - return unique_fetch_targets, target_list, fetch_info + return unique_fetch_targets, target_list, fetch_info, unique_fetch_handles def _run(self, handle, fetches, feed_dict, options, run_metadata): """Perform either run or partial_run, depending on the existence of `handle`.""" @@ -502,10 +520,15 @@ 'graph before calling run().') # Validate and process fetches. - unique_fetches, target_list, fetch_info = self._process_fetches(fetches) + processed_fetches = self._process_fetches(fetches) + unique_fetches = processed_fetches[0] + target_list = processed_fetches[1] + fetch_info = processed_fetches[2] + unique_handles = processed_fetches[3] # Create request. feed_dict_string = {} + feed_map = {} # Validate and process feed_dict. if feed_dict: @@ -522,7 +545,6 @@ raise TypeError('The value of a feed cannot be a tf.Tensor object. ' 'Acceptable feed values include Python scalars, ' 'strings, lists, or numpy ndarrays.') - np_val = np.array(subfeed_val, dtype=subfeed_t.dtype.as_numpy_dtype) if not subfeed_t.get_shape().is_compatible_with(np_val.shape): raise ValueError( @@ -531,17 +553,31 @@ % (np_val.shape, subfeed_t.name, str(subfeed_t.get_shape()))) if not self.graph.is_feedable(subfeed_t): raise ValueError('Tensor %s may not be fed.' % subfeed_t) - feed_dict_string[compat.as_bytes(subfeed_t.name)] = np_val + subfeed_name = compat.as_bytes(subfeed_t.name) + feed_dict_string[subfeed_name] = np_val + feed_map[subfeed_name] = (subfeed_t, subfeed_val) # Run request and get response. - results = self._do_run(handle, target_list, unique_fetches, - feed_dict_string, options, run_metadata) + movers = self._update_with_movers(feed_dict_string, feed_map) + try: + results = self._do_run(handle, target_list, unique_fetches, + feed_dict_string, options, run_metadata) + finally: + # The movers are no longer used. Delete them. + for handle in movers: + self._register_dead_handle(handle) # User may have fetched the same tensor multiple times, but we # only fetch them from the runtime once. Furthermore, they may # be wrapped as a tuple of tensors. Here we map the results back # to what the client asked for. - fetched_results = dict(zip(unique_fetches, results)) + # TODO(yuanbyu): Use the contraction_fn in _REGISTERED_EXPANSIONS.
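+    # Fetches produced by a GetSessionHandle op come back from the runtime
+    # as raw handles; wrap those in TensorHandle objects (using the dtypes
+    # recorded in unique_handles) so callers can feed them back into later
+    # run() calls.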
+ fetched_results = {} + for fetch, result in zip(unique_fetches, results): + dtype = unique_handles.get(fetch) + if dtype: + result = session_ops.TensorHandle(result, dtype, self) + fetched_results[fetch] = result ret = [] for fetch_names, fetch_contraction_fn in fetch_info: if fetch_names: @@ -642,6 +678,55 @@ self._current_version = self._graph.version + # The threshold to run garbage collection to delete dead tensors. + _DEAD_HANDLES_THRESHOLD = 10 + + def _register_dead_handle(self, handle): + # Register a dead handle in the session. Delete the dead tensors when + # the number of dead tensors exceeds a certain threshold. + tensors_to_delete = None + with self._delete_lock: + self._dead_handles.append(handle) + if len(self._dead_handles) == BaseSession._DEAD_HANDLES_THRESHOLD: + tensors_to_delete = self._dead_handles + self._dead_handles = [] + # Delete the dead tensors. + # TODO(yuanbyu): For now we use a sequence of runs to minimize the graph + # size and the overhead of graph construction/partitioning. + if tensors_to_delete: + for tensor_handle in tensors_to_delete: + feeds = {} + fetches = [] + holder, deleter = session_ops._get_handle_deleter(self.graph, + tensor_handle) + feeds[holder] = tensor_handle + fetches.append(deleter) + self.run(fetches, feed_dict=feeds) + + def _update_with_movers(self, feed_dict, feed_map): + # If a tensor handle is fed to a device-incompatible placeholder, + # we move the tensor to the right device, generate a new tensor handle, + # and update `feed_dict` to use the new handle. + handle_movers = [] + for feed_name, val in feed_map.items(): + mover = session_ops._get_handle_mover(self.graph, *val) + if mover: + handle_movers.append((feed_name, val[1], mover)) + # Transfer a tensor to the right device if needed. + if not handle_movers: + return [] + else: + feeds = {} + fetches = [] + for _, handle, mover in handle_movers: + feeds[mover[0]] = handle + fetches.append(mover[1]) + handles = self.run(fetches, feed_dict=feeds) + for handle_mover, handle in zip(handle_movers, handles): + np_val = np.array(handle.handle, dtype=np.object) + feed_dict[handle_mover[0]] = np_val + return handles + class Session(BaseSession): """A class for running TensorFlow operations. diff --git a/tensorflow/python/framework/docs.py b/tensorflow/python/framework/docs.py index 2fc9893c2ae..b2a4fc62be0 100644 --- a/tensorflow/python/framework/docs.py +++ b/tensorflow/python/framework/docs.py @@ -99,11 +99,12 @@ class Index(Document): print("", file=f) -def collect_members(module_to_name): +def collect_members(module_to_name, exclude=()): """Collect all symbols from a list of modules. Args: module_to_name: Dictionary mapping modules to short names. + exclude: Set of fully qualified names to exclude. Returns: Dictionary mapping name to (fullname, member) pairs. @@ -116,6 +117,8 @@ def collect_members(module_to_name): not _always_drop_symbol_re.match(name) and (all_names is None or name in all_names)): fullname = '%s.%s' % (module_name, name) + if fullname in exclude: + continue if name in members: other_fullname, other_member = members[name] if member is not other_member: diff --git a/tensorflow/python/framework/errors.py b/tensorflow/python/framework/errors.py index f7aaa63792e..0429c7817f6 100644 --- a/tensorflow/python/framework/errors.py +++ b/tensorflow/python/framework/errors.py @@ -328,7 +328,7 @@ class AbortedError(OpError): class OutOfRangeError(OpError): - """Raised when an operation executed past the valid range.
+ """Raised when an operation iterates past the valid input range. This exception is raised in "end-of-file" conditions, such as when a [`queue.dequeue()`](../../api_docs/python/io_ops.md#QueueBase.dequeue) diff --git a/tensorflow/python/framework/gen_docs_combined.py b/tensorflow/python/framework/gen_docs_combined.py index e4bbeea58d1..8bcde1f6b4c 100644 --- a/tensorflow/python/framework/gen_docs_combined.py +++ b/tensorflow/python/framework/gen_docs_combined.py @@ -81,9 +81,11 @@ def all_libraries(module_to_name, members, documented): exclude_symbols=["sparse_matmul", "arg_min", "arg_max", "lin_space", "sparse_segment_mean_grad"], prefix=PREFIX_TEXT), + library("string_ops", "Strings", prefix=PREFIX_TEXT), library("histogram_ops", "Histograms"), library("control_flow_ops", "Control Flow", prefix=PREFIX_TEXT), library("functional_ops", "Higher Order Functions", prefix=PREFIX_TEXT), + library("session_ops", "Tensor Handle Operations", prefix=PREFIX_TEXT), library("image", "Images", tf.image, exclude_symbols=["ResizeMethod"], prefix=PREFIX_TEXT), library("sparse_ops", "Sparse Tensors", diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py index aee976c71e9..b7bc6690b2f 100644 --- a/tensorflow/python/framework/ops.py +++ b/tensorflow/python/framework/ops.py @@ -1871,6 +1871,14 @@ class Graph(object): self._colocation_stack = [] # Set of tensors that are dangerous to feed! self._unfeedable_tensors = set() + # A map of tensor handle placeholder to tensor dtype. + self._handle_feeders = {} + # A map from tensor handle to its read op. + self._handle_readers = {} + # A map from tensor handle to its move op. + self._handle_movers = {} + # A map from tensor handle to its delete op. + self._handle_deleters = {} def _check_not_finalized(self): """Check if the graph is finalized. diff --git a/tensorflow/python/framework/tensor_shape.py b/tensorflow/python/framework/tensor_shape.py index e8643109bc9..67fe3361bb7 100644 --- a/tensorflow/python/framework/tensor_shape.py +++ b/tensorflow/python/framework/tensor_shape.py @@ -36,6 +36,10 @@ class Dimension(object): def __repr__(self): return "Dimension(%s)" % repr(self._value) + def __str__(self): + value = self._value + return "?" if value is None else str(value) + def __eq__(self, other): """Returns true if `other` has the same known value as this Dimension.""" other = as_dimension(other) @@ -429,17 +433,15 @@ class TensorShape(object): self._dims = [as_dimension(d) for d in dims_iter] def __repr__(self): - return "TensorShape(%s)" % self._dims + return "TensorShape(%r)" % self._dims def __str__(self): if self.ndims is None: return "" elif self.ndims == 1: - length = self._dims[0].value - return "(%s,)" % (str(length) if length is not None else "?") + return "(%s,)" % self._dims[0] else: - return "(%s)" % ", ".join(str(d.value) if d.value is not None else "?" 
- for d in self._dims) + return "(%s)" % ", ".join(str(d) for d in self._dims) @property def dims(self): @@ -541,11 +543,15 @@ class TensorShape(object): if self._dims is None: return other else: - self.assert_same_rank(other) - new_dims = [] - for i, dim in enumerate(self._dims): - new_dims.append(dim.merge_with(other[i])) - return TensorShape(new_dims) + try: + self.assert_same_rank(other) + new_dims = [] + for i, dim in enumerate(self._dims): + new_dims.append(dim.merge_with(other[i])) + return TensorShape(new_dims) + except ValueError: + raise ValueError("Shapes %s and %s are not compatible" % + (self, other)) def concatenate(self, other): """Returns the concatenation of the dimension in `self` and `other`. diff --git a/tensorflow/python/framework/tensor_shape_test.py b/tensorflow/python/framework/tensor_shape_test.py index e83c48947d8..59cd9144ca7 100644 --- a/tensorflow/python/framework/tensor_shape_test.py +++ b/tensorflow/python/framework/tensor_shape_test.py @@ -143,6 +143,14 @@ class DimensionTest(test_util.TensorFlowTestCase): self.assertIs(None, tensor_shape.Dimension(None) != tensor_shape.Dimension(None)) + def testRepr(self): + self.assertEqual(repr(tensor_shape.Dimension(7)), "Dimension(7)") + self.assertEqual(repr(tensor_shape.Dimension(None)), "Dimension(None)") + + def testStr(self): + self.assertEqual(str(tensor_shape.Dimension(7)), "7") + self.assertEqual(str(tensor_shape.Dimension(None)), "?") + class ShapeTest(test_util.TensorFlowTestCase): diff --git a/tensorflow/python/kernel_tests/benchmark_test.py b/tensorflow/python/kernel_tests/benchmark_test.py index 2cddbe98724..9a96038618e 100644 --- a/tensorflow/python/kernel_tests/benchmark_test.py +++ b/tensorflow/python/kernel_tests/benchmark_test.py @@ -103,6 +103,19 @@ class BenchmarkTest(tf.test.TestCase): self.assertTrue(_ran_somebenchmark_2[0]) self.assertFalse(_ran_somebenchmark_but_shouldnt[0]) + _ran_somebenchmark_1[0] = False + _ran_somebenchmark_2[0] = False + _ran_somebenchmark_but_shouldnt[0] = False + + # Test running a specific method of SomeRandomBenchmark + if benchmark.TEST_REPORTER_TEST_ENV in os.environ: + del os.environ[benchmark.TEST_REPORTER_TEST_ENV] + benchmark._run_benchmarks("SomeRandom.*1$") + + self.assertTrue(_ran_somebenchmark_1[0]) + self.assertFalse(_ran_somebenchmark_2[0]) + self.assertFalse(_ran_somebenchmark_but_shouldnt[0]) + def testReportingBenchmark(self): tempdir = tf.test.get_temp_dir() try: diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py index cf59bde2af4..91b3887376b 100644 --- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py +++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py @@ -59,7 +59,7 @@ def isum(s): i = tf.constant(0, name="i") c = lambda i, s: tf.less(i, 10) b = lambda i, s: [tf.add(i, 1), tf.add(i, s)] - _, r_s = control_flow_ops.While(c, b, [i, s]) + _, r_s = tf.while_loop(c, b, [i, s]) return r_s @@ -467,7 +467,7 @@ class ControlFlowTest(tf.test.TestCase): n = tf.constant(0) c = lambda x: tf.less(x, 10000) b = lambda x: tf.add(x, 1) - r = control_flow_ops.While(c, b, [n], parallel_iterations=20) + r = tf.while_loop(c, b, [n], parallel_iterations=20) self.assertEqual(10000, r.eval()) def testWhileWithRefs_1(self): @@ -482,7 +482,7 @@ class ControlFlowTest(tf.test.TestCase): self.assertEqual(x.dtype, tf.int32_ref) return (i+1, gen_array_ops._ref_identity(x)) - r = control_flow_ops.While(c, b, [i, x], parallel_iterations=5) + r = tf.while_loop(c, b, [i, x], 
parallel_iterations=5) tf.initialize_all_variables().run() @@ -517,7 +517,7 @@ class ControlFlowTest(tf.test.TestCase): c = tf.convert_to_tensor(0) o = tf.convert_to_tensor(0) d = tf.convert_to_tensor(100) - r = control_flow_ops.While( + r = tf.while_loop( lambda i, m, c, o: tf.less(i, d), compute, [i, m, c, o]) result = r[3].eval() self.assertTrue(check_op_order(i.graph)) @@ -539,7 +539,7 @@ class ControlFlowTest(tf.test.TestCase): o = tf.convert_to_tensor(0) x = tf.convert_to_tensor([1, 2, 3, 4, 5, 6]) s = tf.size(x) - r = control_flow_ops.While( + r = tf.while_loop( lambda i, m, c, o: tf.less(i, s), compute, [i, m, c, o]) result = r[3].eval() self.assertTrue(check_op_order(i.graph)) @@ -559,7 +559,7 @@ class ControlFlowTest(tf.test.TestCase): o = tf.convert_to_tensor([0]) x = tf.convert_to_tensor([1, 2, 3, 4, 5, 6]) s = tf.size(x) - r = control_flow_ops.While( + r = tf.while_loop( lambda i, c, o: tf.less(i, s), compute, [i, c, o]) result = r[2].eval() self.assertTrue(check_op_order(i.graph)) @@ -570,7 +570,7 @@ class ControlFlowTest(tf.test.TestCase): n = tf.constant(1.0) c = lambda x: tf.less(x, 10.0) b = lambda x: tf.add(x, 1.0) - r = control_flow_ops.While(c, b, [n]) + r = tf.while_loop(c, b, [n]) self.assertAllClose(10.0, r.eval()) def testWhile_Gpu_1(self): @@ -584,7 +584,7 @@ class ControlFlowTest(tf.test.TestCase): def b(x): with tf.device("/cpu:0"): return tf.add(x, 1.0) - r = control_flow_ops.While(c, b, [n]) + r = tf.while_loop(c, b, [n]) self.assertAllClose(10.0, r.eval()) def testWhile_Gpu_2(self): @@ -601,11 +601,11 @@ class ControlFlowTest(tf.test.TestCase): with tf.device("/cpu:0"): s1 = tf.add(i, s) return i1, s1 - _, r_s = control_flow_ops.While(c, b, [n, s]) + _, r_s = tf.while_loop(c, b, [n, s]) return r_s c = lambda x: tf.less(x, 200) b = lambda x: tf.add(x, cpu_sum(n)) - r = control_flow_ops.While(c, b, [n]) + r = tf.while_loop(c, b, [n]) self.assertEqual(225, r.eval()) def testNestedWhile_1(self): @@ -624,10 +624,8 @@ class ControlFlowTest(tf.test.TestCase): r_ = tf.constant(12) return [n_, r_] - res = control_flow_ops.While(condition, - body, - [n, r], - parallel_iterations=1) + res = tf.while_loop(condition, body, [n, r], + parallel_iterations=1) self.assertAllEqual(12, res[1].eval()) def testWhileWithControl_2(self): @@ -640,7 +638,7 @@ class ControlFlowTest(tf.test.TestCase): r_ = tf.constant(12) return [r_] - res = control_flow_ops.While(condition, body, [r], parallel_iterations=1) + res = tf.while_loop(condition, body, [r], parallel_iterations=1) self.assertAllEqual(12, res.eval()) def testCondWhile_1(self): @@ -649,7 +647,7 @@ class ControlFlowTest(tf.test.TestCase): c = lambda x: tf.less(x, 10) b = lambda x: tf.add(x, 1) r = tf.cond(tf.less(0, 1), - lambda: control_flow_ops.While(c, b, [n]), + lambda: tf.while_loop(c, b, [n]), lambda: n) self.assertAllEqual(10, r.eval()) @@ -659,7 +657,7 @@ class ControlFlowTest(tf.test.TestCase): c = lambda x: tf.less(x, 10) b = lambda x: tf.add(x, 1) r = tf.cond(tf.less(1, 0), lambda: tf.add(n, 1), - lambda: control_flow_ops.While(c, b, [n])) + lambda: tf.while_loop(c, b, [n])) self.assertAllEqual(10, r.eval()) def testWhileCond_1(self): @@ -673,7 +671,7 @@ class ControlFlowTest(tf.test.TestCase): b = lambda x: tf.cond( tf.constant(True), lambda: tf.add(x, one), lambda: tf.sub(x, one)) # pylint: enable=undefined-variable - r = control_flow_ops.While(c, b, [i]) + r = tf.while_loop(c, b, [i]) self.assertAllEqual(10, r.eval()) def testWhileCond_2(self): @@ -681,7 +679,7 @@ class ControlFlowTest(tf.test.TestCase): n = 
tf.convert_to_tensor(0, name="n") c = lambda x: tf.less(x, 10) b = lambda x: tf.cond(tf.constant(True), lambda: tf.add(x, 1), lambda: n) - r = control_flow_ops.While(c, b, [n]) + r = tf.while_loop(c, b, [n]) self.assertAllEqual(10, r.eval()) def testWhileCond_3(self): @@ -693,7 +691,7 @@ class ControlFlowTest(tf.test.TestCase): b = lambda x: tf.cond(tf.less(0, 1), lambda: tf.add(x, 1), lambda: tf.sub(x, 1)) # pylint: enable=undefined-variable - r = control_flow_ops.While(c, b, [n]) + r = tf.while_loop(c, b, [n]) self.assertAllEqual(10, r.eval()) # NOTE: It is ok to have parallel_iterations > 1 @@ -712,10 +710,8 @@ class ControlFlowTest(tf.test.TestCase): nj = control_flow_ops.with_dependencies([op], nj) return [nj] - r = control_flow_ops.While(loop_iterator, - loop_body, - [n], - parallel_iterations=1) + r = tf.while_loop(loop_iterator, loop_body, [n], + parallel_iterations=1) self.assertTrue(check_op_order(n.graph)) tf.initialize_all_variables().run() self.assertEqual(3, r.eval()) @@ -739,10 +735,8 @@ class ControlFlowTest(tf.test.TestCase): nj = control_flow_ops.with_dependencies([op], nj) return [nj] - r = control_flow_ops.While(loop_iterator, - loop_body, - [n], - parallel_iterations=1) + r = tf.while_loop(loop_iterator, loop_body, [n], + parallel_iterations=1) self.assertTrue(check_op_order(n.graph)) tf.initialize_all_variables().run() self.assertEqual(3, r.eval()) @@ -764,10 +758,9 @@ class ControlFlowTest(tf.test.TestCase): nj = tf.add(j, 1) return [nj, ns] - r = control_flow_ops.While(loop_iterator, - loop_body, - [n, tf.identity(select)], - parallel_iterations=1) + r = tf.while_loop(loop_iterator, loop_body, + [n, tf.identity(select)], + parallel_iterations=1) tf.initialize_all_variables().run() result = r[1].eval() self.assertTrue(check_op_order(n.graph)) @@ -792,8 +785,8 @@ class ControlFlowTest(tf.test.TestCase): ni = tf.add(i, 1, name="i_add") return ni - lpa = control_flow_ops.While(pred, loop_body, [c], - parallel_iterations=1) + lpa = tf.while_loop(pred, loop_body, [c], + parallel_iterations=1) self.assertEqual(0, var_b.eval()) lpa.eval() # Run the loop @@ -819,7 +812,7 @@ class ControlFlowTest(tf.test.TestCase): inc_b = tf.identity(var_b) return inc_b - lpa = control_flow_ops.While(pred, loop_body, [var_b], 1, name="loop") + lpa = tf.while_loop(pred, loop_body, [var_b], 1, name="loop") self.assertEqual(0, var_b.eval()) lpa.eval() # Run the loop @@ -848,7 +841,7 @@ class ControlFlowTest(tf.test.TestCase): ni = tf.add(i, 1, name="i_add") return ni - lpa = control_flow_ops.While(pred, loop_body, [c], 1, name="loop") + lpa = tf.while_loop(pred, loop_body, [c], 1, name="loop") self.assertEqual(0, var_b.eval()) lpa.eval() # Run the loop @@ -868,7 +861,7 @@ class ControlFlowTest(tf.test.TestCase): ni = control_flow_ops.with_dependencies([q.enqueue((i,))], ni) return ni - r = control_flow_ops.While(c, b, [i], parallel_iterations=1) + r = tf.while_loop(c, b, [i], parallel_iterations=1) self.assertEqual([10], r.eval()) for i in xrange(10): self.assertEqual([i], q.dequeue().eval()) @@ -885,7 +878,7 @@ class ControlFlowTest(tf.test.TestCase): ni = control_flow_ops.with_dependencies( [gen_data_flow_ops._stack_push(s, i)], ni) return ni - r = control_flow_ops.While(c, b, [i], parallel_iterations=1) + r = tf.while_loop(c, b, [i], parallel_iterations=1) x = tf.constant(0) def c1(i, _): @@ -894,7 +887,7 @@ class ControlFlowTest(tf.test.TestCase): ni = tf.sub(i, 1) nx = x + gen_data_flow_ops._stack_pop(s, tf.int32) return [ni, nx] - _, rx = control_flow_ops.While(c1, b1, [r, x], 
parallel_iterations=1) + _, rx = tf.while_loop(c1, b1, [r, x], parallel_iterations=1) self.assertEqual(45, rx.eval()) def testWhileGrad_Square(self): @@ -902,7 +895,7 @@ class ControlFlowTest(tf.test.TestCase): v = tf.constant(2.0, name="v") c = lambda v: tf.less(v, 100.0) b = tf.square - r = control_flow_ops.While(c, b, [v], parallel_iterations=1) + r = tf.while_loop(c, b, [v], parallel_iterations=1) r = control_flow_ops.cond(tf.less(1, 2), lambda: r, lambda: v) r = tf.gradients(r, v)[0] @@ -915,7 +908,7 @@ class ControlFlowTest(tf.test.TestCase): n = tf.constant(0, name="n") c = lambda i, v: tf.less(i, 5) b = lambda i, v: [i + 1, tf.mul(x, v)] - r = control_flow_ops.While(c, b, [n, v], parallel_iterations=1) + r = tf.while_loop(c, b, [n, v], parallel_iterations=1) r = tf.gradients(r[1], x)[0] self.assertEqual(r.get_shape(), tensor_shape.unknown_shape()) @@ -926,7 +919,7 @@ class ControlFlowTest(tf.test.TestCase): v = tf.constant(2.0, name="v") c = lambda v: tf.less(v, 100.0) b = tf.square - r = control_flow_ops.While(c, b, [v], parallel_iterations=1) + r = tf.while_loop(c, b, [v], parallel_iterations=1) r = tf.mul(r, r) r = tf.gradients(r, v)[0] @@ -937,7 +930,7 @@ class ControlFlowTest(tf.test.TestCase): v = tf.constant(2.0, name="v") c = lambda v: tf.less(v, 100.0) b = tf.square - r = control_flow_ops.While(c, b, [v], parallel_iterations=1) + r = tf.while_loop(c, b, [v], parallel_iterations=1) r = tf.add(r, r) r = tf.gradients(r, v)[0] @@ -949,8 +942,7 @@ class ControlFlowTest(tf.test.TestCase): v = tf.constant(2.0, name="v") c = lambda v: tf.less(v, 100.0) b = lambda v: tf.mul(v, a) - r = control_flow_ops.While(c, b, [v], - parallel_iterations=p_iters) + r = tf.while_loop(c, b, [v], parallel_iterations=p_iters) grad_a, grad_v = tf.gradients(r, [a, v]) grad_a_val, grad_v_val = sess.run([grad_a, grad_v]) @@ -969,7 +961,7 @@ class ControlFlowTest(tf.test.TestCase): v = tf.constant(2.0, name="v") c = lambda v: tf.less(v, 100.0) b = lambda v: tf.mul(v, a) - r = control_flow_ops.While(c, b, [v], parallel_iterations=1) + r = tf.while_loop(c, b, [v], parallel_iterations=1) r = tf.gradients(r, a) tf.initialize_all_variables().run() @@ -985,7 +977,7 @@ class ControlFlowTest(tf.test.TestCase): y1 = tf.add(x, y) x1 = tf.mul(x, y1) return x1, y1 - rx, ry = control_flow_ops.While(c, b, [x, y], parallel_iterations=1) + rx, ry = tf.while_loop(c, b, [x, y], parallel_iterations=1) r = tf.gradients([rx, ry], x) self.assertAllClose(304.0, r[0].eval()) @@ -1006,7 +998,7 @@ class ControlFlowTest(tf.test.TestCase): x = tf.mul(x, 2.0) i = tf.add(i, 1) return i, x - ri, rx = control_flow_ops.While(c, b, [i, x], parallel_iterations=1) + ri, rx = tf.while_loop(c, b, [i, x], parallel_iterations=1) r = tf.gradients([ri, rx], x) self.assertAllClose(1024.0, r[0].eval()) @@ -1018,7 +1010,7 @@ class ControlFlowTest(tf.test.TestCase): v = tf.constant(2.0, name="v") c = lambda v: tf.less(v, 100.0) b = tf.square - r = control_flow_ops.While(c, b, [v], back_prop=False) + r = tf.while_loop(c, b, [v], back_prop=False) r = tf.add(r, v) r = tf.gradients(r, v) self.assertAllClose(1.0, r[0].eval()) @@ -1033,8 +1025,8 @@ class ControlFlowTest(tf.test.TestCase): x = tf.mul(x, 2.0) i = tf.add(i, 1) return i, x - _, rx = control_flow_ops.While(c, b, [i, x], parallel_iterations=1) - _, rx = control_flow_ops.While(c, b, [i, rx], parallel_iterations=1) + _, rx = tf.while_loop(c, b, [i, x], parallel_iterations=1) + _, rx = tf.while_loop(c, b, [i, rx], parallel_iterations=1) r = tf.gradients([rx], x) self.assertAllClose(1024.0, 
r[0].eval()) @@ -1049,8 +1041,8 @@ class ControlFlowTest(tf.test.TestCase): x = tf.mul(x, 2.0) i = tf.add(i, 1) return i, x - _, r1 = control_flow_ops.While(c, b, [i, x], parallel_iterations=1) - _, r2 = control_flow_ops.While(c, b, [i, x], parallel_iterations=1) + _, r1 = tf.while_loop(c, b, [i, x], parallel_iterations=1) + _, r2 = tf.while_loop(c, b, [i, x], parallel_iterations=1) rx = tf.add(r1, r2) r = tf.gradients([rx], x) @@ -1062,10 +1054,10 @@ class ControlFlowTest(tf.test.TestCase): def inner_loop(s): c = lambda x: tf.less(x, 4.0) b = lambda x: tf.mul(x, 2.0) - return control_flow_ops.While(c, b, [s]) + return tf.while_loop(c, b, [s]) c = lambda x: tf.less(x, 2.0) b = lambda x: tf.mul(inner_loop(x), 2.0) - r = control_flow_ops.While(c, b, [v]) + r = tf.while_loop(c, b, [v]) r = tf.gradients(r, v)[0] self.assertAllClose(8.0, r.eval()) @@ -1081,15 +1073,15 @@ class ControlFlowTest(tf.test.TestCase): z = tf.constant(0) c = lambda i, x: tf.less(i, 4) b = lambda i, x: [tf.add(i, 1), tf.mul(x, 2.0)] - return control_flow_ops.While(c, b, [z, s]) + return tf.while_loop(c, b, [z, s]) def inner_loop2(s): z = tf.constant(0) c = lambda i, x: tf.less(i, 4) b = lambda i, x: [tf.add(i, 1), tf.mul(x, 2.0)] - return control_flow_ops.While(c, b, [z, s]) + return tf.while_loop(c, b, [z, s]) c = lambda x: tf.less(x, 128.0) b = lambda x: inner_loop2(inner_loop1(x)[1])[1] - r = control_flow_ops.While(c, b, [v]) + r = tf.while_loop(c, b, [v]) r = tf.gradients(r, v)[0] self.assertAllClose(256.0, r.eval()) @@ -1101,15 +1093,15 @@ class ControlFlowTest(tf.test.TestCase): z = tf.constant(0) c = lambda i, x: tf.less(i, 4) b = lambda i, x: [tf.add(i, 1), tf.mul(x, 2.0)] - return control_flow_ops.While(c, b, [z, s]) + return tf.while_loop(c, b, [z, s]) def inner_loop2(s): z = tf.constant(0) c = lambda i, x: tf.less(i, 4) b = lambda i, x: [tf.add(i, 1), tf.mul(x, 2.0)] - return control_flow_ops.While(c, b, [z, s]) + return tf.while_loop(c, b, [z, s]) c = lambda x: tf.less(x, 128.0) b = lambda x: tf.mul(inner_loop1(x)[1], inner_loop2(x)[1]) - r = control_flow_ops.While(c, b, [v]) + r = tf.while_loop(c, b, [v]) r = tf.gradients(r, v)[0] self.assertAllClose(512.0, r.eval()) @@ -1126,7 +1118,7 @@ class ControlFlowTest(tf.test.TestCase): lambda: tf.square(x), lambda: tf.sub(x, one)) # pylint: enable=undefined-variable - r = control_flow_ops.While(c, b, [v]) + r = tf.while_loop(c, b, [v]) r = tf.gradients(r, v)[0] self.assertAllClose(1024.0, r.eval()) @@ -1146,7 +1138,7 @@ class ControlFlowTest(tf.test.TestCase): lambda: tf.square(x), lambda: tf.sub(x, one)) # pylint: enable=undefined-variable - r = control_flow_ops.While(c, b, [v]) + r = tf.while_loop(c, b, [v]) r = tf.gradients(r, v)[0] r = sess.run(r, feed_dict={v: 2.0}) self.assertAllClose(1024.0, r) @@ -1165,7 +1157,7 @@ class ControlFlowTest(tf.test.TestCase): return (i+1, gen_array_ops._ref_identity(x)) # pylint: enable=protected-access - r = control_flow_ops.While(c, body, [i, x], parallel_iterations=5) + r = tf.while_loop(c, body, [i, x], parallel_iterations=5) grad_ys = [tf.Variable(73).ref()] grad = tf.gradients([r[1]], [x], grad_ys=grad_ys) diff --git a/tensorflow/python/kernel_tests/diag_op_test.py b/tensorflow/python/kernel_tests/diag_op_test.py index ef74f1273c1..e130c9b1b30 100644 --- a/tensorflow/python/kernel_tests/diag_op_test.py +++ b/tensorflow/python/kernel_tests/diag_op_test.py @@ -17,10 +17,118 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import numpy +import numpy as np 
import tensorflow as tf +class BatchMatrixDiagTest(tf.test.TestCase): + _use_gpu = False + + def testVector(self): + with self.test_session(use_gpu=self._use_gpu): + v = np.array([1.0, 2.0, 3.0]) + mat = np.diag(v) + v_diag = tf.batch_matrix_diag(v) + self.assertEqual((3, 3), v_diag.get_shape()) + self.assertAllEqual(v_diag.eval(), mat) + + def testBatchVector(self): + with self.test_session(use_gpu=self._use_gpu): + v_batch = np.array([[1.0, 2.0, 3.0], + [4.0, 5.0, 6.0]]) + mat_batch = np.array( + [[[1.0, 0.0, 0.0], + [0.0, 2.0, 0.0], + [0.0, 0.0, 3.0]], + [[4.0, 0.0, 0.0], + [0.0, 5.0, 0.0], + [0.0, 0.0, 6.0]]]) + v_batch_diag = tf.batch_matrix_diag(v_batch) + self.assertEqual((2, 3, 3), v_batch_diag.get_shape()) + self.assertAllEqual(v_batch_diag.eval(), mat_batch) + + def testInvalidShape(self): + with self.assertRaisesRegexp(ValueError, "must have rank at least 1"): + tf.batch_matrix_diag(0) + + def testInvalidShapeAtEval(self): + with self.test_session(use_gpu=self._use_gpu): + v = tf.placeholder(dtype=tf.float32) + with self.assertRaisesOpError("input must be at least 1-dim"): + tf.batch_matrix_diag(v).eval(feed_dict={v: 0.0}) + + def testGrad(self): + shapes = ((3,), (18, 4), (1, 9, 4, 8,)) + with self.test_session(use_gpu=self._use_gpu): + for shape in shapes: + x = tf.constant(np.random.rand(*shape), np.float32) + y = tf.batch_matrix_diag(x) + error = tf.test.compute_gradient_error(x, x.get_shape().as_list(), + y, y.get_shape().as_list()) + self.assertLess(error, 1e-4) + + +class BatchMatrixDiagGpuTest(BatchMatrixDiagTest): + _use_gpu = True + + +class BatchMatrixDiagPartTest(tf.test.TestCase): + _use_gpu = False + + def testMatrix(self): + with self.test_session(use_gpu=self._use_gpu): + v = np.array([1.0, 2.0, 3.0]) + mat = np.diag(v) + mat_diag = tf.batch_matrix_diag_part(mat) + self.assertEqual((3,), mat_diag.get_shape()) + self.assertAllEqual(mat_diag.eval(), v) + + def testBatchMatrix(self): + with self.test_session(use_gpu=self._use_gpu): + v_batch = np.array([[1.0, 2.0, 3.0], + [4.0, 5.0, 6.0]]) + mat_batch = np.array( + [[[1.0, 0.0, 0.0], + [0.0, 2.0, 0.0], + [0.0, 0.0, 3.0]], + [[4.0, 0.0, 0.0], + [0.0, 5.0, 0.0], + [0.0, 0.0, 6.0]]]) + self.assertEqual(mat_batch.shape, (2, 3, 3)) + mat_batch_diag = tf.batch_matrix_diag_part(mat_batch) + self.assertEqual((2, 3), mat_batch_diag.get_shape()) + self.assertAllEqual(mat_batch_diag.eval(), v_batch) + + def testInvalidShape(self): + with self.assertRaisesRegexp(ValueError, "must have rank at least 2"): + tf.batch_matrix_diag_part(0) + with self.assertRaisesRegexp(ValueError, r"Dimensions .* not compatible"): + tf.batch_matrix_diag_part([[0, 1], [1, 0], [0, 0]]) + + def testInvalidShapeAtEval(self): + with self.test_session(use_gpu=self._use_gpu): + v = tf.placeholder(dtype=tf.float32) + with self.assertRaisesOpError("input must be at least 2-dim"): + tf.batch_matrix_diag_part(v).eval(feed_dict={v: 0.0}) + with self.assertRaisesOpError("last two dimensions must be equal"): + tf.batch_matrix_diag_part(v).eval( + feed_dict={v: [[0, 1], [1, 0], [0, 0]]}) + + def testGrad(self): + shapes = ((3, 3), (18, 3, 3), (1, 9, 4, 3, 5, 5)) + with self.test_session(use_gpu=self._use_gpu): + for shape in shapes: + x = tf.constant(np.random.rand(*shape), dtype=np.float32) + y = tf.batch_matrix_diag_part(x) + error = tf.test.compute_gradient_error(x, x.get_shape().as_list(), + y, y.get_shape().as_list()) + self.assertLess(error, 1e-4) + + +class BatchMatrixDiagPartGpuTest(BatchMatrixDiagPartTest): + _use_gpu = True + + class 
DiagTest(tf.test.TestCase): def diagOp(self, diag, dtype, expected_ans, use_gpu=False): @@ -35,56 +143,56 @@ class DiagTest(tf.test.TestCase): self.assertShapeEqual(diag, tf_ans_inv) def testEmptyTensor(self): - x = numpy.array([]) - expected_ans = numpy.empty([0, 0]) - self.diagOp(x, numpy.int32, expected_ans) + x = np.array([]) + expected_ans = np.empty([0, 0]) + self.diagOp(x, np.int32, expected_ans) def testRankOneIntTensor(self): - x = numpy.array([1, 2, 3]) - expected_ans = numpy.array( + x = np.array([1, 2, 3]) + expected_ans = np.array( [[1, 0, 0], [0, 2, 0], [0, 0, 3]]) - self.diagOp(x, numpy.int32, expected_ans) - self.diagOp(x, numpy.int64, expected_ans) + self.diagOp(x, np.int32, expected_ans) + self.diagOp(x, np.int64, expected_ans) def testRankOneFloatTensor(self): - x = numpy.array([1.1, 2.2, 3.3]) - expected_ans = numpy.array( + x = np.array([1.1, 2.2, 3.3]) + expected_ans = np.array( [[1.1, 0, 0], [0, 2.2, 0], [0, 0, 3.3]]) - self.diagOp(x, numpy.float32, expected_ans) - self.diagOp(x, numpy.float64, expected_ans) + self.diagOp(x, np.float32, expected_ans) + self.diagOp(x, np.float64, expected_ans) def testRankTwoIntTensor(self): - x = numpy.array([[1, 2, 3], [4, 5, 6]]) - expected_ans = numpy.array( + x = np.array([[1, 2, 3], [4, 5, 6]]) + expected_ans = np.array( [[[[1, 0, 0], [0, 0, 0]], [[0, 2, 0], [0, 0, 0]], [[0, 0, 3], [0, 0, 0]]], [[[0, 0, 0], [4, 0, 0]], [[0, 0, 0], [0, 5, 0]], [[0, 0, 0], [0, 0, 6]]]]) - self.diagOp(x, numpy.int32, expected_ans) - self.diagOp(x, numpy.int64, expected_ans) + self.diagOp(x, np.int32, expected_ans) + self.diagOp(x, np.int64, expected_ans) def testRankTwoFloatTensor(self): - x = numpy.array([[1.1, 2.2, 3.3], [4.4, 5.5, 6.6]]) - expected_ans = numpy.array( + x = np.array([[1.1, 2.2, 3.3], [4.4, 5.5, 6.6]]) + expected_ans = np.array( [[[[1.1, 0, 0], [0, 0, 0]], [[0, 2.2, 0], [0, 0, 0]], [[0, 0, 3.3], [0, 0, 0]]], [[[0, 0, 0], [4.4, 0, 0]], [[0, 0, 0], [0, 5.5, 0]], [[0, 0, 0], [0, 0, 6.6]]]]) - self.diagOp(x, numpy.float32, expected_ans) - self.diagOp(x, numpy.float64, expected_ans) + self.diagOp(x, np.float32, expected_ans) + self.diagOp(x, np.float64, expected_ans) def testRankThreeFloatTensor(self): - x = numpy.array([[[1.1, 2.2], [3.3, 4.4]], - [[5.5, 6.6], [7.7, 8.8]]]) - expected_ans = numpy.array( + x = np.array([[[1.1, 2.2], [3.3, 4.4]], + [[5.5, 6.6], [7.7, 8.8]]]) + expected_ans = np.array( [[[[[[1.1, 0], [0, 0]], [[0, 0], [0, 0]]], [[[0, 2.2], [0, 0]], [[0, 0], [0, 0]]]], [[[[0, 0], [3.3, 0]], [[0, 0], [0, 0]]], @@ -93,14 +201,14 @@ class DiagTest(tf.test.TestCase): [[[0, 0], [0, 0]], [[0, 6.6], [0, 0]]]], [[[[0, 0], [0, 0]], [[0, 0], [7.7, 0]]], [[[0, 0], [0, 0]], [[0, 0], [0, 8.8]]]]]]) - self.diagOp(x, numpy.float32, expected_ans) - self.diagOp(x, numpy.float64, expected_ans) + self.diagOp(x, np.float32, expected_ans) + self.diagOp(x, np.float64, expected_ans) class DiagPartOpTest(tf.test.TestCase): def setUp(self): - numpy.random.seed(0) + np.random.seed(0) def diagPartOp(self, tensor, dtpe, expected_ans, use_gpu=False): with self.test_session(use_gpu=use_gpu): @@ -110,64 +218,64 @@ class DiagPartOpTest(tf.test.TestCase): self.assertShapeEqual(expected_ans, tf_ans_inv) def testRankTwoFloatTensor(self): - x = numpy.random.rand(3, 3) - i = numpy.arange(3) + x = np.random.rand(3, 3) + i = np.arange(3) expected_ans = x[i, i] - self.diagPartOp(x, numpy.float32, expected_ans) - self.diagPartOp(x, numpy.float64, expected_ans) + self.diagPartOp(x, np.float32, expected_ans) + self.diagPartOp(x, np.float64, expected_ans) def 
testRankFourFloatTensor(self): - x = numpy.random.rand(2, 3, 2, 3) - i = numpy.arange(2)[:, None] - j = numpy.arange(3) + x = np.random.rand(2, 3, 2, 3) + i = np.arange(2)[:, None] + j = np.arange(3) expected_ans = x[i, j, i, j] - self.diagPartOp(x, numpy.float32, expected_ans) - self.diagPartOp(x, numpy.float64, expected_ans) + self.diagPartOp(x, np.float32, expected_ans) + self.diagPartOp(x, np.float64, expected_ans) def testRankSixFloatTensor(self): - x = numpy.random.rand(2, 2, 2, 2, 2, 2) - i = numpy.arange(2)[:, None, None] - j = numpy.arange(2)[:, None] - k = numpy.arange(2) + x = np.random.rand(2, 2, 2, 2, 2, 2) + i = np.arange(2)[:, None, None] + j = np.arange(2)[:, None] + k = np.arange(2) expected_ans = x[i, j, k, i, j, k] - self.diagPartOp(x, numpy.float32, expected_ans) - self.diagPartOp(x, numpy.float64, expected_ans) + self.diagPartOp(x, np.float32, expected_ans) + self.diagPartOp(x, np.float64, expected_ans) def testOddRank(self): - w = numpy.random.rand(2) - x = numpy.random.rand(2, 2, 2) - y = numpy.random.rand(2, 2, 2, 2, 2) - z = numpy.random.rand(2, 2, 2, 2, 2, 2, 2) - self.assertRaises(ValueError, self.diagPartOp, w, numpy.float32, 0) - self.assertRaises(ValueError, self.diagPartOp, x, numpy.float32, 0) - self.assertRaises(ValueError, self.diagPartOp, y, numpy.float32, 0) - self.assertRaises(ValueError, self.diagPartOp, z, numpy.float32, 0) + w = np.random.rand(2) + x = np.random.rand(2, 2, 2) + y = np.random.rand(2, 2, 2, 2, 2) + z = np.random.rand(2, 2, 2, 2, 2, 2, 2) + self.assertRaises(ValueError, self.diagPartOp, w, np.float32, 0) + self.assertRaises(ValueError, self.diagPartOp, x, np.float32, 0) + self.assertRaises(ValueError, self.diagPartOp, y, np.float32, 0) + self.assertRaises(ValueError, self.diagPartOp, z, np.float32, 0) def testUnevenDimensions(self): - w = numpy.random.rand(2, 5) - x = numpy.random.rand(2, 1, 2, 3) - y = numpy.random.rand(2, 1, 2, 1, 2, 5) - z = numpy.random.rand(2, 2, 2, 2, 2, 2, 2, 2) - self.assertRaises(ValueError, self.diagPartOp, w, numpy.float32, 0) - self.assertRaises(ValueError, self.diagPartOp, x, numpy.float32, 0) - self.assertRaises(ValueError, self.diagPartOp, y, numpy.float32, 0) - self.assertRaises(ValueError, self.diagPartOp, z, numpy.float32, 0) + w = np.random.rand(2, 5) + x = np.random.rand(2, 1, 2, 3) + y = np.random.rand(2, 1, 2, 1, 2, 5) + z = np.random.rand(2, 2, 2, 2, 2, 2, 2, 2) + self.assertRaises(ValueError, self.diagPartOp, w, np.float32, 0) + self.assertRaises(ValueError, self.diagPartOp, x, np.float32, 0) + self.assertRaises(ValueError, self.diagPartOp, y, np.float32, 0) + self.assertRaises(ValueError, self.diagPartOp, z, np.float32, 0) class DiagGradOpTest(tf.test.TestCase): def testDiagGrad(self): - numpy.random.seed(0) + np.random.seed(0) shapes = ((3,), (3,3), (3,3,3)) dtypes = (tf.float32, tf.float64) with self.test_session(use_gpu=False): errors = [] for shape in shapes: for dtype in dtypes: - x1 = tf.constant(numpy.random.rand(*shape), dtype=dtype) + x1 = tf.constant(np.random.rand(*shape), dtype=dtype) y = tf.diag(x1) - error = tf.test.compute_gradient_error(x1, x1._shape_as_list(), - y, y._shape_as_list()) + error = tf.test.compute_gradient_error(x1, x1.get_shape().as_list(), + y, y.get_shape().as_list()) tf.logging.info("error = %f", error) self.assertLess(error, 1e-4) @@ -175,17 +283,17 @@ class DiagGradOpTest(tf.test.TestCase): class DiagGradPartOpTest(tf.test.TestCase): def testDiagPartGrad(self): - numpy.random.seed(0) + np.random.seed(0) shapes = ((3,3), (3,3,3,3), (3,3,3,3,3,3)) dtypes = 
(tf.float32, tf.float64) with self.test_session(use_gpu=False): errors = [] for shape in shapes: for dtype in dtypes: - x1 = tf.constant(numpy.random.rand(*shape), dtype=dtype) + x1 = tf.constant(np.random.rand(*shape), dtype=dtype) y = tf.diag_part(x1) - error = tf.test.compute_gradient_error(x1, x1._shape_as_list(), - y, y._shape_as_list()) + error = tf.test.compute_gradient_error(x1, x1.get_shape().as_list(), + y, y.get_shape().as_list()) tf.logging.info("error = %f", error) self.assertLess(error, 1e-4) diff --git a/tensorflow/python/kernel_tests/matmul_op_test.py b/tensorflow/python/kernel_tests/matmul_op_test.py index 87ccc83d981..aa291cdbdd1 100644 --- a/tensorflow/python/kernel_tests/matmul_op_test.py +++ b/tensorflow/python/kernel_tests/matmul_op_test.py @@ -153,8 +153,7 @@ class MatMulTest(tf.test.TestCase): b = tf.placeholder(tf.float32, [36, 2]) c = tf.placeholder(tf.float32, [37]) with self.assertRaisesRegexp( - ValueError, - r"Dimensions Dimension\(37\) and Dimension\(36\) are not compatible"): + ValueError, "Dimensions 37 and 36 are not compatible"): tf.matmul(a, b) with self.assertRaisesRegexp(ValueError, "must have rank 2"): tf.matmul(a, c) diff --git a/tensorflow/python/kernel_tests/reduce_join_op_test.py b/tensorflow/python/kernel_tests/reduce_join_op_test.py new file mode 100644 index 00000000000..bda4042cba0 --- /dev/null +++ b/tensorflow/python/kernel_tests/reduce_join_op_test.py @@ -0,0 +1,283 @@ +# Copyright 2016 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Tests for ReduceJoin op from string_ops.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import itertools + +import numpy as np +from six.moves import xrange # pylint: disable=redefined-builtin +import tensorflow as tf + + +def _input_array(num_dims): + """Creates an ndarray where each element is the binary of its linear index. + + Args: + num_dims: The number of dimensions to create. + + Returns: + An ndarray of shape [2] * num_dims. + """ + formatter = "{:0%db}" % num_dims + strings = [formatter.format(i) for i in xrange(2 ** num_dims)] + return np.array(strings, dtype="S%d" % num_dims).reshape([2] * num_dims) + + +def _joined_array(num_dims, reduce_dim): + """Creates an ndarray with the result from reduce_join on input_array. + + Args: + num_dims: The number of dimensions of the original input array. + reduce_dim: The dimension to reduce. + + Returns: + An ndarray of shape [2] * (num_dims - 1). 
+ """ + formatter = "{:0%db}" % (num_dims - 1) + result = np.zeros(shape=[2] * (num_dims - 1), dtype="S%d" % (2 * num_dims)) + flat = result.ravel() + for i in xrange(2 ** (num_dims - 1)): + dims = formatter.format(i) + flat[i] = "".join([(dims[:reduce_dim] + "%d" + dims[reduce_dim:]) % j + for j in xrange(2)]) + return result + + +class UnicodeTestCase(tf.test.TestCase): + """Test case with Python3-compatible string comparator.""" + + def assertAllEqualUnicode(self, truth, actual): + self.assertAllEqual(np.array(truth).astype("U"), + np.array(actual).astype("U")) + + +class ReduceJoinTestHelperTest(UnicodeTestCase): + """Tests for helper functions.""" + + def testInputArray(self): + num_dims = 3 + truth = ["{:03b}".format(i) for i in xrange(2 ** num_dims)] + output_array = _input_array(num_dims).reshape([-1]) + self.assertAllEqualUnicode(truth, output_array) + + def testJoinedArray(self): + num_dims = 3 + truth_dim_zero = [["000100", "001101"], ["010110", "011111"]] + truth_dim_one = [["000010", "001011"], ["100110", "101111"]] + truth_dim_two = [["000001", "010011"], ["100101", "110111"]] + output_array_dim_zero = _joined_array(num_dims, reduce_dim=0) + output_array_dim_one = _joined_array(num_dims, reduce_dim=1) + output_array_dim_two = _joined_array(num_dims, reduce_dim=2) + self.assertAllEqualUnicode(truth_dim_zero, output_array_dim_zero) + self.assertAllEqualUnicode(truth_dim_one, output_array_dim_one) + self.assertAllEqualUnicode(truth_dim_two, output_array_dim_two) + + +class ReduceJoinTest(UnicodeTestCase): + + def _testReduceJoin(self, input_array, truth, reduction_indices, + keep_dims=False, separator=""): + """Compares the output of reduce_join to an expected result. + + Args: + input_array: The string input to be joined. + truth: An array or np.array of the expected result. + reduction_indices: The indices to reduce over. + keep_dims: Whether or not to retain reduced dimensions. + separator: The separator to use for joining. + """ + with self.test_session(): + output = tf.reduce_join(inputs=input_array, + reduction_indices=reduction_indices, + keep_dims=keep_dims, + separator=separator) + output_array = output.eval() + + self.assertAllEqualUnicode(truth, output_array) + + def _testMultipleReduceJoin(self, input_array, reduction_indices, + separator=" "): + """Tests reduce_join for one input and multiple reduction_indices. + + Does so by comparing the output to that from nested reduce_string_joins. + The correctness of single-dimension reduce_join is verified by other + tests below using _testReduceJoin. + + Args: + input_array: The input to test. + reduction_indices: The indices to reduce. + separator: The separator to use when joining. 
+ """ + num_dims = len(input_array.shape) + truth_red_indices = reduction_indices or list(reversed(xrange(num_dims))) + with self.test_session(): + output = tf.reduce_join( + inputs=input_array, reduction_indices=reduction_indices, + keep_dims=False, separator=separator) + output_keep_dims = tf.reduce_join( + inputs=input_array, reduction_indices=reduction_indices, + keep_dims=True, separator=separator) + + truth = input_array + for index in truth_red_indices: + truth = tf.reduce_join( + inputs=truth, reduction_indices=index, keep_dims=True, + separator=separator) + truth_squeezed = tf.squeeze(truth, squeeze_dims=truth_red_indices) + output_array = output.eval() + output_keep_dims_array = output_keep_dims.eval() + truth_array = truth.eval() + truth_squeezed_array = truth_squeezed.eval() + self.assertAllEqualUnicode(truth_array, output_keep_dims_array) + self.assertAllEqualUnicode(truth_squeezed_array, output_array) + + def testRankOne(self): + input_array = ["this", "is", "a", "test"] + truth = "thisisatest" + self._testReduceJoin(input_array, truth, reduction_indices=0) + + def testRankTwo(self): + input_array = [["this", "is", "a", "test"], + ["please", "do", "not", "panic"]] + truth_dim_zero = ["thisplease", "isdo", "anot", "testpanic"] + truth_dim_one = ["thisisatest", "pleasedonotpanic"] + self._testReduceJoin(input_array, truth_dim_zero, reduction_indices=0) + self._testReduceJoin(input_array, truth_dim_one, reduction_indices=1) + + def testRankFive(self): + input_array = _input_array(num_dims=5) + truths = [_joined_array(num_dims=5, reduce_dim=i) for i in xrange(5)] + for i in xrange(5): + self._testReduceJoin(input_array, truths[i], reduction_indices=i) + + def testNegative(self): + input_array = _input_array(num_dims=5) + truths = [_joined_array(num_dims=5, reduce_dim=i) for i in xrange(5)] + for i in xrange(5): + self._testReduceJoin(input_array, truths[i], reduction_indices=i - 5) + + def testSingletonDimension(self): + input_arrays = [_input_array(num_dims=5) + .reshape([2] * i + [1] + [2] * (5 - i)) + for i in xrange(6)] + truth = _input_array(num_dims=5) + for i in xrange(6): + self._testReduceJoin(input_arrays[i], truth, reduction_indices=i) + + def testSeparator(self): + input_array = [["this", "is", "a", "test"], + ["please", "do", "not", "panic"]] + truth_dim_zero = ["this please", "is do", "a not", "test panic"] + truth_dim_one = ["this is a test", "please do not panic"] + self._testReduceJoin(input_array, truth_dim_zero, reduction_indices=0, + separator=" ") + self._testReduceJoin(input_array, truth_dim_one, reduction_indices=1, + separator=" ") + + def testUnknownShape(self): + input_array = [["a"], ["b"]] + truth = ["ab"] + with self.test_session(): + placeholder = tf.placeholder(tf.string, name="placeholder") + reduced = tf.reduce_join(placeholder, reduction_indices=0) + output_array = reduced.eval(feed_dict={placeholder.name: input_array}) + self.assertAllEqualUnicode(truth, output_array) + + def testUnknownIndices(self): + input_array = [["this", "is", "a", "test"], + ["please", "do", "not", "panic"]] + truth_dim_zero = ["thisplease", "isdo", "anot", "testpanic"] + truth_dim_one = ["thisisatest", "pleasedonotpanic"] + with self.test_session(): + placeholder = tf.placeholder(tf.int32, name="placeholder") + reduced = tf.reduce_join(input_array, reduction_indices=placeholder) + output_array_dim_zero = reduced.eval(feed_dict={placeholder.name: [0]}) + output_array_dim_one = reduced.eval(feed_dict={placeholder.name: [1]}) + self.assertAllEqualUnicode(truth_dim_zero, 
output_array_dim_zero) + self.assertAllEqualUnicode(truth_dim_one, output_array_dim_one) + + def testKeepDims(self): + input_array = [["this", "is", "a", "test"], + ["please", "do", "not", "panic"]] + truth_dim_zero = [["thisplease", "isdo", "anot", "testpanic"]] + truth_dim_one = [["thisisatest"], ["pleasedonotpanic"]] + self._testReduceJoin(input_array, truth_dim_zero, reduction_indices=0, + keep_dims=True) + self._testReduceJoin(input_array, truth_dim_one, reduction_indices=1, + keep_dims=True) + + def testMultiIndex(self): + num_dims = 3 + input_array = _input_array(num_dims=num_dims) + # Also tests []. + for i in xrange(num_dims + 1): + for permutation in itertools.permutations(xrange(num_dims), i): + self._testMultipleReduceJoin(input_array, + reduction_indices=permutation) + + def testInvalidReductionIndices(self): + with self.test_session(): + with self.assertRaisesRegexp(ValueError, "scalar"): + tf.reduce_join(inputs="", reduction_indices=0) + with self.assertRaisesRegexp(ValueError, + "Invalid reduction dimension -3"): + tf.reduce_join(inputs=[[""]], reduction_indices=-3) + with self.assertRaisesRegexp(ValueError, "Invalid reduction dimension 2"): + tf.reduce_join(inputs=[[""]], reduction_indices=2) + with self.assertRaisesRegexp(ValueError, + "Invalid reduction dimension -3"): + tf.reduce_join(inputs=[[""]], reduction_indices=[0, -3]) + with self.assertRaisesRegexp(ValueError, "Invalid reduction dimension 2"): + tf.reduce_join(inputs=[[""]], reduction_indices=[0, 2]) + with self.assertRaisesRegexp(ValueError, "Duplicate reduction index 0"): + tf.reduce_join(inputs=[[""]], reduction_indices=[0, 0]) + + def testZeroDims(self): + valid_truth_shape = [0] + with self.test_session(): + inputs = np.zeros([0, 1], dtype=str) + with self.assertRaisesRegexp(ValueError, "dimension 0 with size 0"): + tf.reduce_join(inputs=inputs, reduction_indices=0) + valid = tf.reduce_join(inputs=inputs, reduction_indices=1) + valid_array_shape = valid.eval().shape + self.assertAllEqualUnicode(valid_truth_shape, valid_array_shape) + + def testInvalidArgsUnknownShape(self): + with self.test_session(): + placeholder = tf.placeholder(tf.string, name="placeholder") + index_too_high = tf.reduce_join(placeholder, reduction_indices=1) + duplicate_index = tf.reduce_join(placeholder, reduction_indices=[-1, 1]) + with self.assertRaisesOpError("Invalid reduction dimension 1"): + index_too_high.eval(feed_dict={placeholder.name: [""]}) + with self.assertRaisesOpError("Duplicate reduction dimension 1"): + duplicate_index.eval(feed_dict={placeholder.name: [[""]]}) + + def testInvalidArgsUnknownIndices(self): + with self.test_session(): + placeholder = tf.placeholder(tf.int32, name="placeholder") + reduced = tf.reduce_join(["test", "test2"], + reduction_indices=placeholder) + + with self.assertRaisesOpError("reduction dimension -2"): + reduced.eval(feed_dict={placeholder.name: -2}) + with self.assertRaisesOpError("reduction dimension 2"): + reduced.eval(feed_dict={placeholder.name: 2}) + +if __name__ == "__main__": + tf.test.main() diff --git a/tensorflow/python/kernel_tests/rnn_test.py b/tensorflow/python/kernel_tests/rnn_test.py index 1837b02e2be..be989f50835 100644 --- a/tensorflow/python/kernel_tests/rnn_test.py +++ b/tensorflow/python/kernel_tests/rnn_test.py @@ -937,13 +937,14 @@ def graph_creation_static_vs_dynamic_rnn_benchmark(max_time): def _create_static_rnn(): with tf.Session(config=config, graph=tf.Graph()) as sess: - inputs_list_t = [tf.constant(x) for x in inputs_list] + inputs_list_t = [ + tf.Variable(x, 
trainable=False).value() for x in inputs_list] ops = _static_vs_dynamic_rnn_benchmark_static( inputs_list_t, sequence_length) def _create_dynamic_rnn(): with tf.Session(config=config, graph=tf.Graph()) as sess: - inputs_t = tf.constant(inputs) + inputs_t = tf.Variable(inputs, trainable=False).value() ops = _static_vs_dynamic_rnn_benchmark_dynamic( inputs_t, sequence_length) @@ -961,7 +962,7 @@ def _timer(sess, ops): sess.run(ops) # Timing run - runs = 10 + runs = 20 start = time.time() for _ in range(runs): sess.run(ops) @@ -983,13 +984,9 @@ def static_vs_dynamic_rnn_benchmark(batch_size, max_time, num_units, use_gpu): # Using rnn() with tf.Session(config=config, graph=tf.Graph()) as sess: - if not use_gpu: - with tf.device("/cpu:0"): - inputs_list_t = [tf.constant(x) for x in inputs_list] - ops = _static_vs_dynamic_rnn_benchmark_static( - inputs_list_t, sequence_length) - else: - inputs_list_t = [tf.constant(x) for x in inputs_list] + with tf.device("/cpu:0" if not use_gpu else None): + inputs_list_t = [ + tf.Variable(x, trainable=False).value() for x in inputs_list] ops = _static_vs_dynamic_rnn_benchmark_static( inputs_list_t, sequence_length) tf.initialize_all_variables().run() @@ -997,13 +994,8 @@ def static_vs_dynamic_rnn_benchmark(batch_size, max_time, num_units, use_gpu): # Using dynamic_rnn() with tf.Session(config=config, graph=tf.Graph()) as sess: - if not use_gpu: - with tf.device("/cpu:0"): - inputs_t = tf.Variable(inputs) - ops = _static_vs_dynamic_rnn_benchmark_dynamic( - inputs_t, sequence_length) - else: - inputs_t = tf.Variable(inputs) + with tf.device("/cpu:0" if not use_gpu else None): + inputs_t = tf.Variable(inputs, trainable=False).value() ops = _static_vs_dynamic_rnn_benchmark_dynamic( inputs_t, sequence_length) tf.initialize_all_variables().run() @@ -1016,6 +1008,59 @@ def static_vs_dynamic_rnn_benchmark(batch_size, max_time, num_units, use_gpu): return delta_static, delta_dynamic +def _half_seq_len_vs_unroll_half_rnn_benchmark(inputs_list_t, sequence_length): + (_, input_size) = inputs_list_t[0].get_shape().as_list() + initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=127) + cell = tf.nn.rnn_cell.LSTMCell( + num_units=input_size, input_size=input_size, use_peepholes=True, + initializer=initializer) + outputs, final_state = tf.nn.rnn( + cell, inputs_list_t, sequence_length=sequence_length, dtype=tf.float32) + + trainable_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) + gradients = tf.gradients(outputs + [final_state], trainable_variables) + + return tf.group(final_state, *(gradients + outputs)) + + +def half_seq_len_vs_unroll_half_rnn_benchmark( + batch_size, max_time, num_units, use_gpu): + config = tf.ConfigProto() + config.allow_soft_placement = True + + # Set up sequence lengths + np.random.seed([127]) + sequence_length = max_time * np.ones((batch_size,)) + inputs_list = [ + np.random.randn(batch_size, num_units).astype(np.float32) + for _ in range(max_time)] + + # Halve the sequence length, full static unroll + with tf.Session(config=config, graph=tf.Graph()) as sess: + with tf.device("/cpu:0" if not use_gpu else None): + inputs_list_t = [ + tf.Variable(x, trainable=False).value() for x in inputs_list] + ops = _half_seq_len_vs_unroll_half_rnn_benchmark( + inputs_list_t, sequence_length / 2) + tf.initialize_all_variables().run() + delta_half_seq_len = _timer(sess, ops) + + # Halve the unroll size, don't use sequence length + with tf.Session(config=config, graph=tf.Graph()) as sess: + with tf.device("/cpu:0" if not use_gpu else None): 
+ inputs_list_t = [ + tf.Variable(x, trainable=False).value() for x in inputs_list] + ops = _half_seq_len_vs_unroll_half_rnn_benchmark( + inputs_list_t[:(max_time // 2)], sequence_length / 2) + tf.initialize_all_variables().run() + delta_unroll_half = _timer(sess, ops) + print("%d \t %d \t\t %d \t %s \t %f \t\t %f \t\t %f" % + (batch_size, max_time, num_units, use_gpu, delta_half_seq_len, + delta_unroll_half, delta_half_seq_len/delta_unroll_half)) + + return delta_half_seq_len, delta_unroll_half + + def _dynamic_rnn_swap_memory_benchmark(inputs_t, sequence_length, swap_memory): (unused_0, unused_1, input_size) = inputs_t.get_shape().as_list() @@ -1047,7 +1092,7 @@ def dynamic_rnn_swap_memory_benchmark(batch_size, max_time, num_units): # No memory swap with tf.Session(config=config, graph=tf.Graph()) as sess: - inputs_t = tf.Variable(inputs) + inputs_t = tf.Variable(inputs, trainable=False).value() ops = _dynamic_rnn_swap_memory_benchmark( inputs_t, sequence_length, swap_memory=False) tf.initialize_all_variables().run() @@ -1055,7 +1100,7 @@ def dynamic_rnn_swap_memory_benchmark(batch_size, max_time, num_units): # Memory swap with tf.Session(config=config, graph=tf.Graph()) as sess: - inputs_t = tf.Variable(inputs) + inputs_t = tf.Variable(inputs, trainable=False).value() ops = _dynamic_rnn_swap_memory_benchmark( inputs_t, sequence_length, swap_memory=True) tf.initialize_all_variables().run() @@ -1082,14 +1127,15 @@ def rnn_long_sequence_benchmark(batch_size, seqlen, num_units, for _ in range(5): if dynamic: with tf.Session(config=config, graph=tf.Graph()) as sess: - inputs_t = tf.Variable(inputs) + inputs_t = tf.Variable(inputs, trainable=False).value() ops = _dynamic_rnn_swap_memory_benchmark( inputs_t, sequence_length, swap_memory=swap_memory) tf.initialize_all_variables().run() elapsed = _timer(sess, ops) else: with tf.Session(config=config, graph=tf.Graph()) as sess: - inputs_list_t = [tf.constant(x) for x in inputs_list] + inputs_list_t = [ + tf.Variable(x, trainable=False).value() for x in inputs_list] ops = _static_vs_dynamic_rnn_benchmark_static( inputs_list_t, sequence_length) tf.initialize_all_variables().run() @@ -1126,11 +1172,11 @@ class BenchmarkRNN(tf.test.Benchmark): self.report_benchmark( name="static_unroll_time_T%02d_B%03d_N%03d_gpu_%s" % (max_time, batch_size, num_units, use_gpu), - iters=10, wall_time=s_dt) + iters=20, wall_time=s_dt) self.report_benchmark( name="dynamic_unroll_time_T%02d_B%03d_N%03d_gpu_%s" % (max_time, batch_size, num_units, use_gpu), - iters=10, wall_time=d_dt) + iters=20, wall_time=d_dt) def benchmarkDynamicLSTMNoMemorySwapVsMemorySwap(self): print("Calculation: Dynamic LSTM No Memory Swap vs. Memory Swap") @@ -1143,11 +1189,31 @@ class BenchmarkRNN(tf.test.Benchmark): self.report_benchmark( name="dynamic_lstm_no_memory_swap_T%02d_B%03d_N%03d" % (max_time, batch_size, num_units), - iters=10, wall_time=no_swap) + iters=20, wall_time=no_swap) self.report_benchmark( name="dynamic_lstm_with_memory_swap_T%02d_B%03d_N%03d" % (max_time, batch_size, num_units), - iters=10, wall_time=swap) + iters=20, wall_time=swap) + + def benchmarkStaticUnrollHalfSequenceLengthVsHalfUnroll(self): + print("Calculation: Static Unroll with Halved Sequence Length " + "vs. 
Half Static Unroll") + print("batch \t full_t \t units \t gpu \t dt(half_seq_len) " + "\t dt(unroll_half) \t dt(half_seq_len)/dt(unroll_half)") + for batch_size in (128,): + for max_time in (50,): + for num_units in (256,): + for use_gpu in (False, True): + s_dt, d_dt = half_seq_len_vs_unroll_half_rnn_benchmark( + batch_size, max_time, num_units, use_gpu) + self.report_benchmark( + name="half_seq_len_time_T%02d_B%03d_N%03d_gpu_%s" + % (max_time, batch_size, num_units, use_gpu), + iters=20, wall_time=s_dt) + self.report_benchmark( + name="unroll_half_time_T%02d_B%03d_N%03d_gpu_%s" + % (max_time, batch_size, num_units, use_gpu), + iters=20, wall_time=d_dt) if __name__ == "__main__": diff --git a/tensorflow/python/kernel_tests/session_ops_test.py b/tensorflow/python/kernel_tests/session_ops_test.py new file mode 100644 index 00000000000..4f61055cbcd --- /dev/null +++ b/tensorflow/python/kernel_tests/session_ops_test.py @@ -0,0 +1,157 @@ +# Copyright 2015 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for tensorflow.ops.session_ops.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + + +class SessionOpsTest(tf.test.TestCase): + + def testHandleBasic(self): + with self.test_session() as sess: + # Return a handle. + a = tf.constant(10) + b = tf.constant(5) + c = tf.mul(a, b) + h = tf.get_session_handle(c) + h = sess.run(h) + + # Feed a tensor handle. + f, x = tf.get_session_tensor(tf.int32) + y = tf.mul(x, 10) + self.assertEqual(500, sess.run(y, feed_dict={f: h.handle})) + + def testHandleEval(self): + with self.test_session() as sess: + # Return a handle. + a = tf.constant(10) + b = tf.constant(5) + c = tf.mul(a, b) + h = tf.get_session_handle(c) + h = sess.run(h) + + # Get the tensor from its handle. + self.assertEqual(50, h.eval()) + + def testHandleAndValue(self): + with self.test_session() as sess: + # Return a handle and a value. + a = tf.constant(10) + b = tf.constant(5) + c = tf.mul(a, b) + h = tf.get_session_handle(c) + v = tf.mul(a, c) + h, v = sess.run([h, v]) + + self.assertEqual(50, h.eval()) + self.assertEqual(500, v) + + def testHandleCond(self): + with self.test_session() as sess: + # Return a handle and a value + a = tf.constant(10) + b = tf.constant(5) + p = tf.less(a, b) + c = tf.mul(a, b) + h = tf.get_session_handle(c) + p, h = sess.run([p, h]) + + # Run by feeding a tensor handle. + f, x = tf.get_session_tensor(tf.int32) + if p: + y = tf.mul(x, 10) + else: + y = tf.mul(x, 100) + result = sess.run(y, feed_dict={f: h.handle}) + + self.assertEqual(5000, result) + + def testHandleForLoop(self): + with self.test_session() as sess: + # Initialize a handle. + a = tf.constant(0) + h = tf.get_session_handle(a) + h = sess.run(h) + + # Do some computation. + f, x = tf.get_session_tensor(tf.int32) + # Must define the loop body outside the loop. 
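The comment just above is the crux of this test: rebuilding tf.add(x, 1) inside the Python loop would grow the graph on every iteration, so the body is constructed once and only handles move through sess.run. A condensed, hedged sketch of that round-trip (iteration count illustrative):

```python
import tensorflow as tf

with tf.Session() as sess:
  h = sess.run(tf.get_session_handle(tf.constant(0)))  # persistent tensor

  f, x = tf.get_session_tensor(tf.int32)      # feed target for a handle
  h_x = tf.get_session_handle(tf.add(x, 1))   # loop body, built once

  for _ in range(3):
    h = sess.run(h_x, feed_dict={f: h.handle})
  print(h.eval())  # 3
```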
+ h_x = tf.get_session_handle(tf.add(x, 1)) + for _ in range(100): + # This exercises garbage collection. + h = sess.run(h_x, feed_dict={f: h.handle}) + + self.assertEqual(100, h.eval()) + + def testHandleWhileLoop(self): + with self.test_session() as sess: + # Initialize a handle. + a = tf.constant(0) + h = tf.get_session_handle(a) + h = sess.run(h) + + # Do some computation. + f, x = tf.get_session_tensor(tf.int32) + b = tf.constant(100) + p = tf.less(x, b) + # Must define the loop body outside the loop. + h_x = tf.get_session_handle(tf.add(x, 1)) + while True: + rp, h = sess.run([p, h_x], feed_dict={f: h.handle}) + if not rp: + break + + self.assertEqual(101, h.eval()) + + def testHandleMover(self): + with self.test_session() as sess: + # Return a handle. + a = tf.constant(10) + b = tf.constant(5) + c = tf.mul(a, b) + h = tf.get_session_handle(c) + h = sess.run(h) + + # Feed a tensor handle. + f, x = tf.get_session_tensor(tf.int32) + y = tf.mul(x, 10) + self.assertEqual(500, sess.run(y, feed_dict={f: h.handle})) + + # Feed another tensor handle. + with tf.device("/gpu:0"): + a = tf.constant(10) + h = tf.get_session_handle(a) + h = sess.run(h) + self.assertEqual(100, sess.run(y, feed_dict={f: h.handle})) + + def testHandleDeleter(self): + with self.test_session() as sess: + # Return a handle. + a = tf.constant(10) + b = tf.constant(5) + c = tf.mul(a, b) + h = tf.get_session_handle(c) + h = sess.run(h) + + # Delete using a raw tensor handle. + h = h.get_raw_handle() + f, x = tf.delete_session_tensor() + sess.run(x, feed_dict={f: h}) + +if __name__ == "__main__": + tf.test.main() diff --git a/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_op_test.py b/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_op_test.py index ce7bbfa61f5..b3b25bf031c 100644 --- a/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_op_test.py +++ b/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_op_test.py @@ -24,7 +24,6 @@ import time import numpy as np import tensorflow as tf -from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import sparse_ops # pylint: enable=g-bad-import-order,unused-import @@ -131,7 +130,7 @@ def _sparse_tensor_dense_vs_dense_matmul_benchmark_dense( t0 = tf.constant(0) v0 = tf.constant(0.0) def _timeit(iterations, _): - (_, final) = control_flow_ops.While( + (_, final) = tf.while_loop( lambda t, _: t < iterations, body, (t0, v0), parallel_iterations=1, back_prop=False) return [final] @@ -151,7 +150,7 @@ def _sparse_tensor_dense_vs_dense_matmul_benchmark_sparse( t0 = tf.constant(0) v0 = tf.constant(0.0) def _timeit(iterations, _): - (_, final) = control_flow_ops.While( + (_, final) = tf.while_loop( lambda t, _: t < iterations, body, (t0, v0), parallel_iterations=1, back_prop=False) return [final] diff --git a/tensorflow/python/kernel_tests/stack_ops_test.py b/tensorflow/python/kernel_tests/stack_ops_test.py index 5270a13de23..02bf02858bf 100644 --- a/tensorflow/python/kernel_tests/stack_ops_test.py +++ b/tensorflow/python/kernel_tests/stack_ops_test.py @@ -22,7 +22,6 @@ import numpy as np import tensorflow as tf from tensorflow.python.framework import errors -from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import gen_data_flow_ops @@ -67,7 +66,7 @@ class StackOpTest(tf.test.TestCase): v = gen_data_flow_ops._stack_push(h, a, swap_memory=True) with tf.control_dependencies([v]): return tf.add(x, 1) - r = control_flow_ops.While(c, b, [n]) + r = tf.while_loop(c, b, [n]) v = tf.constant(np.zeros(2000), 
dtype=tf.float32) def c1(x, y): @@ -76,7 +75,7 @@ class StackOpTest(tf.test.TestCase): nx = tf.sub(x, 1) ny = y + gen_data_flow_ops._stack_pop(h, tf.float32) return [nx, ny] - rx, ry = control_flow_ops.While(c1, b1, [r, v]) + rx, ry = tf.while_loop(c1, b1, [r, v]) self.assertAllClose(np.ones(2000) * 10.0, ry.eval()) def testStackWhileSwap(self): diff --git a/tensorflow/python/kernel_tests/tensor_array_ops_test.py b/tensorflow/python/kernel_tests/tensor_array_ops_test.py index 0c68fd2d8a7..3fa04322b39 100644 --- a/tensorflow/python/kernel_tests/tensor_array_ops_test.py +++ b/tensorflow/python/kernel_tests/tensor_array_ops_test.py @@ -23,7 +23,6 @@ import numpy as np import tensorflow as tf from tensorflow.python.framework import errors -from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import gen_data_flow_ops from tensorflow.python.ops import tensor_array_grad from tensorflow.python.ops import tensor_array_ops @@ -433,6 +432,44 @@ class TensorArrayCPUTest(tf.test.TestCase): r"dynamically resizeable"): ta.split([1.0], [1]).flow.eval() + def _testTensorArrayWriteGradientAddMultipleAdds(self, dtype): + with self.test_session(use_gpu=self._use_gpu): + ta = tensor_array_ops.TensorArray( + dtype=dtype, tensor_array_name="foo", size=3) + ta_grad = ta.grad("grad") + + c = lambda x: np.asarray(x, dtype=dtype.as_numpy_dtype) + + w0 = ta.write(2, c(3.0)) + w1 = w0.write(2, c(4.0)) + + w0_grad = ta_grad.write(2, c(3.0)) + w1_grad = w0_grad.write(2, c(4.0)) + w2_grad = w1_grad.write(2, c(5.0)) + + # Assert that aggregation works correctly + self.assertAllEqual(c(12.00), w2_grad.read(2).eval()) + + # Assert that if multiple_writes_aggregate is not enabled, + # multiple writes raise an exception. + with self.assertRaisesOpError( + r"TensorArray foo: Could not write to TensorArray index 2 because " + r"it has already been written to."): + w1.flow.eval() + + # Using differing shapes causes an exception + wb0_grad = ta_grad.write(1, c(1.0)) + wb1_grad = wb0_grad.write(1, c([1.0])) + + with self.assertRaisesOpError( + r"Could not aggregate to TensorArray index 1 because the " + r"existing shape is \[\] but the new input shape is \[1\]"): + wb1_grad.flow.eval() + + def testTensorArrayWriteGradientAddMultipleAdds(self): + for dtype in [tf.int32, tf.int64, tf.float32, tf.float64, tf.complex64]: + self._testTensorArrayWriteGradientAddMultipleAdds(dtype) + def testMultiTensorArray(self): with self.test_session(use_gpu=self._use_gpu): h1 = tensor_array_ops.TensorArray( @@ -473,6 +510,7 @@ class TensorArrayCPUTest(tf.test.TestCase): w1 = w0.write(1, value_1) r0 = w1.read(0) r1 = w1.read(1) + r0_2 = w1.read(0) # Test individual components' gradients grad_just_r0 = tf.gradients( @@ -480,6 +518,12 @@ class TensorArrayCPUTest(tf.test.TestCase): grad_just_r0_vals = session.run(grad_just_r0) self.assertAllEqual(c([[2.0, 3.0]]), grad_just_r0_vals[0]) + grad_r0_r0_2 = tf.gradients( + ys=[r0, r0_2], xs=[value_0], + grad_ys=[c([[2.0, 3.0]]), c([[1.0, -1.0]])]) + grad_r0_r0_2_vals = session.run(grad_r0_r0_2) + self.assertAllEqual(c([[3.0, 2.0]]), grad_r0_r0_2_vals[0]) + grad_just_r1 = tf.gradients( ys=[r1], xs=[value_1], grad_ys=[c(-2.0)]) grad_just_r1_vals = session.run(grad_just_r1) @@ -487,35 +531,93 @@ class TensorArrayCPUTest(tf.test.TestCase): # Test combined gradients grad = tf.gradients( - ys=[r0, r1], xs=[value_0, value_1], - grad_ys=[c(-1.0), c([[2.0, 3.0]])]) + ys=[r0, r0_2, r1], xs=[value_0, value_1], + grad_ys=[c(-1.0), c(-2.0), c([[2.0, 3.0]])]) grad_vals = session.run(grad) 
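For the assertions that follow: reading index 0 twice means both upstream gradients flow into value_0 and add, which is why the expected gradient changes from -1.0 to -3.0 in this hunk. The arithmetic, spelled out as a sketch:

```python
# Two reads of the same value v behave like y = g1*v + g2*v, so
# dL/dv = g1 + g2; with grad_ys of -1.0 and -2.0 that is -3.0.
g_r0, g_r0_2 = -1.0, -2.0
assert g_r0 + g_r0_2 == -3.0
```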
self.assertEqual(len(grad_vals), 2) - self.assertAllEqual(c(-1.0), grad_vals[0]) + self.assertAllEqual(c(-3.0), grad_vals[0]) self.assertAllEqual(c([[2.0, 3.0]]), grad_vals[1]) def testTensorArrayGradientWriteRead(self): for dtype in (np.float32, np.float64, np.int32, np.int64, np.complex64): self._testTensorArrayGradientWriteReadType(dtype) + def testTensorArrayGradientWritePackConcatAndRead(self): + with self.test_session(use_gpu=self._use_gpu) as sess: + ta = tensor_array_ops.TensorArray( + dtype=tf.float32, tensor_array_name="foo", size=2, + clear_after_read=False) + + value_0 = tf.constant([-1.0, 1.0]) + value_1 = tf.constant([-10.0, 10.0]) + + w0 = ta.write(0, value_0) + w1 = w0.write(1, value_1) + p0 = w1.pack() + r0 = w1.read(0) + s0 = w1.concat() + + # Test gradient accumulation between read(0), pack(), and concat() + with tf.control_dependencies([p0, r0, s0]): + grad_r = tf.gradients( + ys=[p0, r0, s0], xs=[value_0, value_1], + grad_ys=[ + [[2.0, 3.0], [4.0, 5.0]], # pack gradient + [-0.5, 1.5], # read(0) gradient + [20.0, 30.0, 40.0, 50.0]]) # concat gradient + grad_vals = sess.run(grad_r) # 2 + 2 entries + + self.assertAllClose([2.0 - 0.5 + 20.0, 3.0 + 1.5 + 30.0], grad_vals[0]) + self.assertAllEqual([4.0 + 40.0, 5.0 + 50.0], grad_vals[1]) + + def testTensorArrayReadTwice(self): + with self.test_session(use_gpu=self._use_gpu): + value = tf.constant([[1.0, -1.0], [10.0, -10.0]]) + + ta_readonce = tensor_array_ops.TensorArray( + dtype=tf.float32, tensor_array_name="foo", size=2) + + w_readonce = ta_readonce.unpack(value) + r0_readonce = w_readonce.read(0) + with tf.control_dependencies([r0_readonce]): + r1_readonce = w_readonce.read(0) + + with self.assertRaisesOpError( + r"Could not read index 0 twice because it was cleared after a " + r"previous read \(perhaps try setting clear_after_read = false\?\)"): + r1_readonce.eval() + + ta_readtwice = tensor_array_ops.TensorArray( + dtype=tf.float32, tensor_array_name="foo", size=2, + clear_after_read=False) + w_readtwice = ta_readtwice.unpack(value) + r0_readtwice = w_readtwice.read(0) + with tf.control_dependencies([r0_readtwice]): + r1_readtwice = w_readtwice.read(0) + + self.assertAllEqual([1.0, -1.0], r1_readtwice.eval()) + def testTensorArrayGradientUnpackRead(self): with self.test_session(use_gpu=self._use_gpu) as session: ta = tensor_array_ops.TensorArray( - dtype=tf.float32, tensor_array_name="foo", size=2) + dtype=tf.float32, tensor_array_name="foo", size=2, + clear_after_read=False) value = tf.constant([[1.0, -1.0], [10.0, -10.0]]) w = ta.unpack(value) r0 = w.read(0) + r0_1 = w.read(0) r1 = w.read(1) # Test combined gradients + aggregation of read(0) grad = tf.gradients( - ys=[r0, r1], xs=[value], grad_ys=[[2.0, 3.0], [4.0, 5.0]]) + ys=[r0, r0_1, r1], xs=[value], + grad_ys=[[2.0, 3.0], [-1.5, 1.5], [4.0, 5.0]]) grad_vals = session.run(grad) self.assertEqual(len(grad_vals), 1) - self.assertAllEqual([[2.0, 3.0], [4.0, 5.0]], grad_vals[0]) + self.assertAllEqual([[2.0 - 1.5, 3.0 + 1.5], [4.0, 5.0]], grad_vals[0]) def testTensorArrayGradientSplitConcat(self): with self.test_session(use_gpu=self._use_gpu) as session: @@ -602,7 +704,7 @@ class TensorArrayCPUTest(tf.test.TestCase): ta_t = ta_t.write(time, out) return (time+1, ta_t, state) - (unused_0, h_final, unused_2) = control_flow_ops.While( + (unused_0, h_final, unused_2) = tf.while_loop( cond=lambda time, unused_1, unused_2: time < 3, body=body, loop_vars=(time_0, ta, state0), @@ -726,9 +828,9 @@ class TensorArrayCPUTest(tf.test.TestCase): "foo/bar/gradients_0", 
self._grad_source_for_name("foo/bar/gradients_0/baz")) - def testGetGradSource_NestedUsesTompost(self): + def testGetGradSource_NestedUsesInnermost(self): self.assertEqual( - "foo/gradients", + "foo/gradients/bar/gradients_0", self._grad_source_for_name("foo/gradients/bar/gradients_0/baz")) diff --git a/tensorflow/python/ops/array_grad.py b/tensorflow/python/ops/array_grad.py index 5fbd69f1553..d7ae855865a 100644 --- a/tensorflow/python/ops/array_grad.py +++ b/tensorflow/python/ops/array_grad.py @@ -165,6 +165,17 @@ def _DiagGrad(_, grad): def _DiagPartGrad(_, grad): return array_ops.diag(grad) + +@ops.RegisterGradient("BatchMatrixDiag") +def _BatchMatrixDiagGrad(_, grad): + return array_ops.batch_matrix_diag_part(grad) + + +@ops.RegisterGradient("BatchMatrixDiagPart") +def _BatchMatrixDiagPartGrad(_, grad): + return array_ops.batch_matrix_diag(grad) + + # Edit Distance has no gradient (but can be used to eval seq2seq or CTC). ops.NoGradient("EditDistance") diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index 0cb751553c4..7c3e78cf8e6 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -918,6 +918,22 @@ def _UniqueWithCountsShape(op): return [tensor_shape.vector(None), input_shape, tensor_shape.vector(None)] +@ops.RegisterShape("BatchMatrixDiag") +def _BatchMatrixDiagShape(op): + """Shape function for array_ops.batch_matrix_diag.""" + diag_shape = op.inputs[0].get_shape().with_rank_at_least(1) + return [diag_shape.concatenate(diag_shape[-1])] + + +@ops.RegisterShape("BatchMatrixDiagPart") +def _BatchMatrixDiagPartShape(op): + """Shape function for array_ops.batch_matrix_diag_part.""" + input_shape = op.inputs[0].get_shape().with_rank_at_least(2) + # Last two dims must match + input_shape[-1].assert_is_compatible_with(input_shape[-2]) + return [input_shape[:-1]] + + @ops.RegisterShape("Diag") def _DiagShape(op): """Shape function for array_ops.diag. diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py index 54584443371..a60174be2ae 100644 --- a/tensorflow/python/ops/control_flow_ops.py +++ b/tensorflow/python/ops/control_flow_ops.py @@ -25,6 +25,7 @@ the execution of operations and add conditional dependencies to your graph. @@count_up_to @@cond @@case +@@while_loop ## Logical Operators @@ -257,6 +258,7 @@ def merge(inputs, name=None): else: dense_shape = None return ops.IndexedSlices(values, indices, dense_shape), chosen_index +# pylint: enable=protected-access def _SwitchRefOrTensor(data, pred, name="Switch"): @@ -970,9 +972,8 @@ class ControlFlowContext(object): """ while_ctxt = self.GetWhileContext() if while_ctxt is not None: - # pylint: disable=protected-access op._add_control_input(while_ctxt.GetControlPivot().op) - # pylint: enable=protected-access + # pylint: enable=protected-access class CondContext(ControlFlowContext): @@ -1123,7 +1124,7 @@ def cond(pred, fn1, fn2, name=None): y = tf.constant(5) def f1(): return tf.mul(x, 17) def f2(): return tf.add(y, 23) - r = cond(math_ops.less(x, y), f1, f2) + r = cond(tf.less(x, y), f1, f2) # r is set to f1(). # Operations in f2 (e.g., tf.add) are not executed. 
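# Concretely, only the taken branch runs: when pred is true the graph
# evaluates f1 alone, so r is x * 17 and f2's y + 23 is never computed.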
``` @@ -1528,8 +1529,8 @@ class WhileContext(ControlFlowContext): else exit_vars_with_tensor_arrays) -def While(cond, body, loop_vars, parallel_iterations=10, back_prop=True, - swap_memory=False, name=None): +def while_loop(cond, body, loop_vars, parallel_iterations=10, back_prop=True, + swap_memory=False, name=None): """Repeat `body` while the condition `cond` is true. `cond` is a callable taking a list of tensors and returning a boolean scalar @@ -1560,14 +1561,16 @@ def While(cond, body, loop_vars, parallel_iterations=10, back_prop=True, ValueError: if `loop_var` is empty. Example: + ```python - i = constant(0) - c = lambda i: math_ops.less(i, 10) - b = lambda i: math_ops.add(i, 1) - r = While(c, b, [i]) + i = tf.constant(0) + c = lambda i: tf.less(i, 10) + b = lambda i: tf.add(i, 1) + r = tf.while_loop(c, b, [i]) ``` + """ - with ops.op_scope(loop_vars, name, "While") as name: + with ops.op_scope(loop_vars, name, "while") as name: if not loop_vars: raise ValueError("No loop variables provided") if not callable(cond): @@ -1582,6 +1585,14 @@ def While(cond, body, loop_vars, parallel_iterations=10, back_prop=True, return result +def While(cond, body, loop_vars, parallel_iterations=10, back_prop=True, + swap_memory=False, name=None): + """DEPRECATED: Use `while_loop`.""" + return while_loop(cond=cond, body=body, loop_vars=loop_vars, + parallel_iterations=parallel_iterations, + back_prop=back_prop, swap_memory=swap_memory, name=name) + + def _AsTensorList(x, p): """Return x as a list of Tensors or IndexedSlices. diff --git a/tensorflow/python/ops/data_flow_grad.py b/tensorflow/python/ops/data_flow_grad.py index 84cb9a39b15..dedecaa3752 100644 --- a/tensorflow/python/ops/data_flow_grad.py +++ b/tensorflow/python/ops/data_flow_grad.py @@ -76,3 +76,7 @@ ops.NoGradient("Stack") ops.NoGradient("StackPush") ops.NoGradient("StackPop") ops.NoGradient("StackClose") + +ops.NoGradient("GetSessionHandle") +ops.NoGradient("GetSessionTensor") +ops.NoGradient("DeleteSessionTensor") diff --git a/tensorflow/python/ops/data_flow_ops.py b/tensorflow/python/ops/data_flow_ops.py index 214b7bb29a9..3f72ccf5cdd 100644 --- a/tensorflow/python/ops/data_flow_ops.py +++ b/tensorflow/python/ops/data_flow_ops.py @@ -570,6 +570,11 @@ ops.RegisterShape("StackPush")(common_shapes.unknown_shape) ops.RegisterShape("StackPop")(common_shapes.unknown_shape) ops.RegisterShape("StackClose")(_ScalarToVoidShape) +# NOTE(yuanbyu): We probably can do better here. 
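The NOTE above concedes that these registrations are loose, but they match what is statically knowable: the handle returned by get_session_handle is itself a scalar string, while the tensor recovered through get_session_tensor can have any shape, known only once a handle is fed. A hedged illustration:

```python
import tensorflow as tf

h = tf.get_session_handle(tf.constant([1, 2, 3]))
print(h.get_shape())   # (): the handle is a scalar string tensor
f, x = tf.get_session_tensor(tf.int32)
print(x.get_shape())   # <unknown>: shape is only known at run time
```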
+ops.RegisterShape("GetSessionHandle")(common_shapes.scalar_shape) +ops.RegisterShape("GetSessionTensor")(common_shapes.unknown_shape) +ops.RegisterShape("DeleteSessionTensor")(_ScalarToVoidShape) + @ops.RegisterShape("DynamicPartition") def _DynamicPartitionShape(op): diff --git a/tensorflow/python/ops/functional_ops.py b/tensorflow/python/ops/functional_ops.py index 034c494dc45..a4a4a180ab7 100644 --- a/tensorflow/python/ops/functional_ops.py +++ b/tensorflow/python/ops/functional_ops.py @@ -110,10 +110,11 @@ def foldl(fn, elems, initializer=None, parallel_iterations=10, back_prop=True, def compute(i, a): a = fn(a, elems_ta.read(i)) return [i + 1, a] - _, r_a = control_flow_ops.While(lambda i, a: i < n, compute, [i, a], - parallel_iterations=parallel_iterations, - back_prop=back_prop, - swap_memory=swap_memory) + _, r_a = control_flow_ops.while_loop( + lambda i, a: i < n, compute, [i, a], + parallel_iterations=parallel_iterations, + back_prop=back_prop, + swap_memory=swap_memory) return r_a @@ -180,10 +181,11 @@ def foldr(fn, elems, initializer=None, parallel_iterations=10, back_prop=True, i -= 1 a = fn(a, elems_ta.read(i)) return [i, a] - _, r_a = control_flow_ops.While(lambda i, a: i > 0, compute, [i, a], - parallel_iterations=parallel_iterations, - back_prop=back_prop, - swap_memory=swap_memory) + _, r_a = control_flow_ops.while_loop( + lambda i, a: i > 0, compute, [i, a], + parallel_iterations=parallel_iterations, + back_prop=back_prop, + swap_memory=swap_memory) return r_a @@ -246,10 +248,11 @@ def map_fn(fn, elems, dtype=None, parallel_iterations=10, back_prop=True, def compute(i, ta): ta = ta.write(i, fn(elems_ta.read(i))) return [i + 1, ta] - _, r_a = control_flow_ops.While(lambda i, a: i < n, compute, [i, acc_ta], - parallel_iterations=parallel_iterations, - back_prop=back_prop, - swap_memory=swap_memory) + _, r_a = control_flow_ops.while_loop( + lambda i, a: i < n, compute, [i, acc_ta], + parallel_iterations=parallel_iterations, + back_prop=back_prop, + swap_memory=swap_memory) return r_a.pack() @@ -323,7 +326,7 @@ def scan(fn, elems, initializer=None, parallel_iterations=10, back_prop=True, a = fn(a, elems_ta.read(i)) ta = ta.write(i, a) return [i + 1, a, ta] - _, _, r_a = control_flow_ops.While( + _, _, r_a = control_flow_ops.while_loop( lambda i, a, ta: i < n, compute, [i, a, acc_ta], parallel_iterations=parallel_iterations, back_prop=back_prop, swap_memory=swap_memory) diff --git a/tensorflow/python/ops/histogram_ops.py b/tensorflow/python/ops/histogram_ops.py index 6da0065ab1e..384245e2085 100644 --- a/tensorflow/python/ops/histogram_ops.py +++ b/tensorflow/python/ops/histogram_ops.py @@ -53,6 +53,7 @@ def histogram_fixed_width(values, A 1-D `Tensor` holding histogram of values. Examples: + ```python # Bins will be: (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf) nbins = 5 diff --git a/tensorflow/python/ops/linalg_grad.py b/tensorflow/python/ops/linalg_grad.py index d4fc0df9695..e76244933bf 100644 --- a/tensorflow/python/ops/linalg_grad.py +++ b/tensorflow/python/ops/linalg_grad.py @@ -81,9 +81,7 @@ def _MatrixSolveGrad(op, grad): """Gradients for MatrixSolve.""" a = op.inputs[0] c = op.outputs[0] - # TODO(rmlarsen): Get rid of explicit transpose after adding - # adjoint_a attribute to solver. 
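The simplification in this hunk relies on the matrix-solve gradient: for C = solve(A, B), grad_B = solve(transpose(A), grad_C) and grad_A = -grad_B * transpose(C), so a single adjoint=True solve replaces the explicit transpose. A quick NumPy check of the identity (sketch, not part of the patch):

```python
import numpy as np

rng = np.random.RandomState(0)
a = rng.randn(3, 3) + 3.0 * np.eye(3)   # well-conditioned square matrix
b = rng.randn(3, 2)
c = np.linalg.solve(a, b)
g = rng.randn(3, 2)                     # upstream gradient dL/dC

grad_b = np.linalg.solve(a.T, g)        # what adjoint=True computes
grad_a = -grad_b.dot(c.T)

# Finite-difference check on one entry of A.
eps = 1e-6
a2 = a.copy()
a2[0, 1] += eps
numeric = ((np.linalg.solve(a2, b) - c) * g).sum() / eps
assert abs(numeric - grad_a[0, 1]) < 1e-4
```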
- grad_b = linalg_ops.matrix_solve(array_ops.transpose(a), grad) + grad_b = linalg_ops.matrix_solve(a, grad, adjoint=True) grad_a = -math_ops.matmul(grad_b, c, transpose_b=True) return (grad_a, grad_b) @@ -93,10 +91,6 @@ def _BatchMatrixSolveGrad(op, grad): """Gradient for BatchMatrixSolve.""" a = op.inputs[0] c = op.outputs[0] - # TODO(rmlarsen): Replace the following two lines with - # a single call to batch_matrix_solve after adding - # in an option to solve for A^T X = Y. - ainv = linalg_ops.batch_matrix_inverse(a) - grad_b = math_ops.batch_matmul(ainv, grad, adj_x=True) + grad_b = linalg_ops.batch_matrix_solve(a, grad, adjoint=True) grad_a = -math_ops.batch_matmul(grad_b, c, adj_y=True) return (grad_a, grad_b) diff --git a/tensorflow/python/ops/linalg_ops.py b/tensorflow/python/ops/linalg_ops.py index f1e4f1b8419..58bddb0b672 100644 --- a/tensorflow/python/ops/linalg_ops.py +++ b/tensorflow/python/ops/linalg_ops.py @@ -28,7 +28,8 @@ from tensorflow.python.ops.gen_linalg_ops import * @ops.RegisterShape("Cholesky") -def _CholeskyShape(op): +@ops.RegisterShape("MatrixInverse") +def _UnchangedSquare(op): input_shape = op.inputs[0].get_shape().with_rank(2) # The matrix must be square. input_shape[0].assert_is_compatible_with(input_shape[1]) @@ -36,7 +37,8 @@ def _CholeskyShape(op): @ops.RegisterShape("BatchCholesky") -def _BatchCholeskyShape(op): +@ops.RegisterShape("BatchMatrixInverse") +def _BatchUnchangedSquare(op): input_shape = op.inputs[0].get_shape().with_rank_at_least(3) # The matrices in the batch must be square. input_shape[-1].assert_is_compatible_with(input_shape[-2]) @@ -68,22 +70,6 @@ def _BatchMatrixDeterminantShape(op): return [tensor_shape.unknown_shape()] -@ops.RegisterShape("MatrixInverse") -def _MatrixInverseShape(op): - input_shape = op.inputs[0].get_shape().with_rank(2) - # The matrix must be square. - input_shape[0].assert_is_compatible_with(input_shape[1]) - return [input_shape] - - -@ops.RegisterShape("BatchMatrixInverse") -def _BatchMatrixInverseShape(op): - input_shape = op.inputs[0].get_shape().with_rank_at_least(3) - # The matrices in the batch must be square. - input_shape[-1].assert_is_compatible_with(input_shape[-2]) - return [input_shape] - - @ops.RegisterShape("SelfAdjointEig") def _SelfAdjointEigShape(op): input_shape = op.inputs[0].get_shape().with_rank(2) @@ -106,18 +92,20 @@ def _BatchSelfAdjointEigShape(op): @ops.RegisterShape("MatrixSolve") -def _MatrixSolveShape(op): +@ops.RegisterShape("MatrixTriangularSolve") +def _SquareMatrixSolveShape(op): lhs_shape = op.inputs[0].get_shape().with_rank(2) rhs_shape = op.inputs[1].get_shape().with_rank_at_least(2) # The matrix must be square. lhs_shape[0].assert_is_compatible_with(lhs_shape[1]) # The matrix and right-hand side must have the same number of rows. lhs_shape[0].assert_is_compatible_with(rhs_shape[0]) - return [[lhs_shape[1], rhs_shape[1]]] + return [rhs_shape] @ops.RegisterShape("BatchMatrixSolve") -def _BatchMatrixSolveShape(op): +@ops.RegisterShape("BatchMatrixTriangularSolve") +def _BatchSquareMatrixSolveShape(op): lhs_shape = op.inputs[0].get_shape().with_rank_at_least(3) rhs_shape = op.inputs[1].get_shape().with_rank_at_least(3) # The matrices must be square. @@ -125,29 +113,6 @@ def _BatchMatrixSolveShape(op): # The matrices and right-hand sides in the batch must have the same number of # rows. 
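The consolidation below works because a square solve always returns something shaped like its right-hand side, which lets one shape function serve both the plain and the triangular variants. A one-line check of that rule (plain NumPy, illustrative):

```python
import numpy as np

# For square A (n x n) and b of shape (n, k), solve(A, b) has b's shape.
a = np.eye(4)
b = np.zeros((4, 7))
assert np.linalg.solve(a, b).shape == b.shape
```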
lhs_shape[-2].assert_is_compatible_with(rhs_shape[-2]) - return [lhs_shape[:-1].concatenate(rhs_shape[-1])] - - -@ops.RegisterShape("MatrixTriangularSolve") -def _MatrixTriangularSolveShape(op): - lhs_shape = op.inputs[0].get_shape().with_rank(2) - rhs_shape = op.inputs[1].get_shape().with_rank_at_least(2) - # The matrix must be square. - lhs_shape[0].assert_is_compatible_with(lhs_shape[1]) - # The matrix and righ-hand side must have the same number of rows. - lhs_shape[0].assert_is_compatible_with(rhs_shape[0]) - return [rhs_shape] - - -@ops.RegisterShape("BatchMatrixTriangularSolve") -def _BatchMatrixTriangularSolveShape(op): - lhs_shape = op.inputs[0].get_shape().with_rank_at_least(3) - rhs_shape = op.inputs[1].get_shape().with_rank_at_least(3) - # The matrices must be square. - lhs_shape[-1].assert_is_compatible_with(lhs_shape[-2]) - # The matrices and righ-hand sides in the batch must have the same number of - # rows. - lhs_shape[-2].assert_is_compatible_with(rhs_shape[-2]) return [rhs_shape] diff --git a/tensorflow/python/ops/logging_ops.py b/tensorflow/python/ops/logging_ops.py index 74c50360ae0..45611138b5a 100644 --- a/tensorflow/python/ops/logging_ops.py +++ b/tensorflow/python/ops/logging_ops.py @@ -94,7 +94,7 @@ def histogram_summary(tag, values, collections=None, name=None): [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto) has one summary value containing a histogram for `values`. - This op reports an `OutOfRange` error if any value is not finite. + This op reports an `InvalidArgument` error if any value is not finite. Args: tag: A `string` `Tensor`. 0-D. Tag to use for the summary value. diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index 99cbe3949fe..229324a2930 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -64,6 +64,9 @@ mathematical functions to your graph. TensorFlow provides several operations that you can use to add basic mathematical functions for matrices to your graph. +@@batch_matrix_diag +@@batch_matrix_diag_part + @@diag @@diag_part @@trace diff --git a/tensorflow/python/ops/partitioned_variables.py b/tensorflow/python/ops/partitioned_variables.py index fb2f51dcde5..9d4d19668af 100644 --- a/tensorflow/python/ops/partitioned_variables.py +++ b/tensorflow/python/ops/partitioned_variables.py @@ -95,7 +95,7 @@ def _compute_slice_dim_and_shape(full_shape, slicing): def create_partitioned_variables( shape, slicing, initializer, dtype=dtypes.float32, - trainable=True, collections=None, name=None): + trainable=True, collections=None, name=None, reuse=None): """Create a list of partitioned variables according to the given `slicing`. Currently only one dimension of the full variable can be sliced, and the @@ -127,6 +127,9 @@ def create_partitioned_variables( Defaults to `[GraphKeys.VARIABLES]`. name: Optional name for the full variable. Defaults to `"PartitionedVariable"` and gets uniquified automatically. + reuse: Boolean or `None`; if `True` and name is set, it would reuse + previously created variables. if `False` it will create new variables. + if `None`, it would inherit the parent scope reuse. Returns: A list of Variables corresponding to the slicing. 
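A usage sketch of the new reuse flag, following the docstring above (scope name, shapes, and slicing are illustrative, not taken from the patch):

```python
import tensorflow as tf
from tensorflow.python.ops import partitioned_variables

init = tf.random_uniform_initializer(-1.0, 1.0)
# First call creates two [4, 4] slices under the scope "embed".
vs1 = partitioned_variables.create_partitioned_variables(
    [4, 8], [1, 2], init, name="embed")
# With reuse=True and the same name, the existing slices are
# looked up instead of new variables being created.
vs2 = partitioned_variables.create_partitioned_variables(
    [4, 8], [1, 2], init, name="embed", reuse=True)
```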
@@ -152,7 +155,8 @@ def create_partitioned_variables( num_slices_with_excess = full_shape[slice_dim] % num_slices with variable_scope.variable_op_scope([], name, - "PartitionedVariable") as scope: + "PartitionedVariable", + reuse=reuse) as scope: full_name = scope.name slice_offset = [0] * len(full_shape) for i in xrange(num_slices): diff --git a/tensorflow/python/ops/rnn.py b/tensorflow/python/ops/rnn.py index 6b50a30205c..ffc621d2942 100644 --- a/tensorflow/python/ops/rnn.py +++ b/tensorflow/python/ops/rnn.py @@ -226,44 +226,47 @@ def _rnn_step( final_output is a `Tensor` matrix of shape [batch_size, output_size] final_state is a `Tensor` matrix of shape [batch_size, state_size] """ - # Step 1: determine whether we need to call_cell or not - empty_update = lambda: (zero_output, state) state_shape = state.get_shape() - if skip_conditionals: - # Skip using conditionals: calculate the RNN step at all time - # steps. This is faster for dynamic_rnn, where the time steps - # should cap out at max_sequence_length anyway. - output, new_state = call_cell() - else: - output, new_state = control_flow_ops.cond( - time < max_sequence_length, call_cell, empty_update) - - # Step 2: determine whether we need to copy through state and/or outputs - existing_output_state = lambda: (output, new_state) - - def copy_through(): + def _copy_some_through(new_output, new_state): # Use broadcasting select to determine which values should get # the previous state & zero output, and which values should get # a calculated state & output. copy_cond = (time >= sequence_length) - return (math_ops.select(copy_cond, zero_output, output), + return (math_ops.select(copy_cond, zero_output, new_output), math_ops.select(copy_cond, state, new_state)) + def _maybe_copy_some_through(): + """Run RNN step. Pass through either no or some past state.""" + new_output, new_state = call_cell() + + return control_flow_ops.cond( + # if t < min_seq_len: calculate and return everything + time < min_sequence_length, lambda: (new_output, new_state), + # else copy some of it through + lambda: _copy_some_through(new_output, new_state)) + # TODO(ebrevdo): skipping these conditionals may cause a slowdown, # but benefits from removing cond() and its gradient. We should # profile with and without this switch here. if skip_conditionals: - # Skip using conditionals: perform the selective copy at all time - # steps. This is usually faster. - (output, state) = copy_through() + # Instead of using conditionals, perform the selective copy at all time + # steps. This is faster when max_seq_len is equal to the number of unrolls + # (which is typical for dynamic_rnn). 
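The hunk continues just below with call_cell(); the selective copy it feeds rests on a broadcasting select, where rows whose sequence has already ended emit zeros and carry the old state forward. A standalone sketch of that selection (shapes illustrative):

```python
import tensorflow as tf

time = tf.constant(5)
sequence_length = tf.constant([3, 7])   # two sequences in the batch
copy_cond = time >= sequence_length     # [True, False]

zero_output = tf.zeros([2, 4])
new_output = tf.ones([2, 4])
# Row 0 is past its sequence end, so it gets zeros; row 1 keeps
# its freshly computed output.
output = tf.select(copy_cond, zero_output, new_output)
```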
+ new_output, new_state = call_cell() + (final_output, final_state) = _copy_some_through(new_output, new_state) else: - (output, state) = control_flow_ops.cond( - time < min_sequence_length, existing_output_state, copy_through) + empty_update = lambda: (zero_output, state) - output.set_shape(zero_output.get_shape()) - state.set_shape(state_shape) - return (output, state) + (final_output, final_state) = control_flow_ops.cond( + # if t >= max_seq_len: copy all state through, output zeros + time >= max_sequence_length, empty_update, + # otherwise calculation is required: copy some or all of it through + _maybe_copy_some_through) + + final_output.set_shape(zero_output.get_shape()) + final_state.set_shape(state_shape) + return (final_output, final_state) def _reverse_seq(input_seq, lengths): @@ -584,7 +587,7 @@ def _dynamic_rnn_loop( return (time + 1, new_state, output_ta_t) - (unused_final_time, final_state, output_final_ta) = control_flow_ops.While( + (_, final_state, output_final_ta) = control_flow_ops.while_loop( cond=lambda time, _1, _2: time < time_steps, body=_time_step, loop_vars=(time, state, output_ta), diff --git a/tensorflow/python/ops/rnn_cell.py b/tensorflow/python/ops/rnn_cell.py index e33e2964516..6650d3b53b8 100644 --- a/tensorflow/python/ops/rnn_cell.py +++ b/tensorflow/python/ops/rnn_cell.py @@ -242,7 +242,7 @@ def _get_sharded_variable(name, shape, dtype, num_shards): current_size = unit_shard_size if i < remaining_rows: current_size += 1 - shards.append(vs.get_variable(name + "_%d" % i, [current_size, shape[1]], + shards.append(vs.get_variable(name + "_%d" % i, [current_size] + shape[1:], dtype=dtype)) return shards diff --git a/tensorflow/python/ops/session_ops.py b/tensorflow/python/ops/session_ops.py new file mode 100644 index 00000000000..8a1ba164d67 --- /dev/null +++ b/tensorflow/python/ops/session_ops.py @@ -0,0 +1,255 @@ +# Copyright 2015 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""## Tensor Handle Operations. + +TensorFlow provides several operators that allow the user to keep tensors +"in-place" across run calls. + +@@get_session_handle +@@get_session_tensor +@@delete_session_tensor +""" + +# pylint: disable=g-bad-name +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.framework import device as pydev +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import gen_data_flow_ops +from tensorflow.python.util import compat + + +class TensorHandle(object): + """Represents a handle for a live tensor in a session.""" + + def __init__(self, handle, dtype, session): + """Constructs a new tensor handle. + + A tensor handle for a persistent tensor is a Python string + that has the form of "tensor_name;unique_id;device_name". + + Args: + handle: A tensor handle.
+ dtype: The data type of the tensor represented by `handle`. + session: The session in which the tensor is produced. + """ + self._handle = compat.as_str_any(handle) + self._dtype = dtype + self._session = session + self._auto_gc_enabled = True + + def __del__(self): + if self._auto_gc_enabled: + self._session._register_dead_handle(self.handle) + + def __str__(self): + return self._handle + + @property + def handle(self): + return self._handle + + def eval(self): + """Return the value of the tensor represented by this handle.""" + holder, reader = _get_handle_reader(self._session.graph, self._handle, + self._dtype) + return self._session.run(reader, feed_dict={holder: self._handle}) + + def get_raw_handle(self): + """Return the raw handle of the tensor. + + Note that the method disables the automatic garbage collection of this + persistent tensor. The caller is now responsible for managing the + lifetime of the tensor. + """ + self._auto_gc_enabled = False + return self._handle + + @staticmethod + def _get_device_name(handle): + """The device name encoded in the handle.""" + handle_str = compat.as_str_any(handle) + return pydev.canonical_name(handle_str.split(';')[-1]) + + @staticmethod + def _get_reader_key(handle): + """The graph key for reader.""" + handle_parts = str(handle).split(';') + return handle_parts[0] + ';' + handle_parts[-1] + + @staticmethod + def _get_deleter_key(handle): + """The graph key for deleter.""" + return str(handle).split(';')[-1] + + @staticmethod + def _get_mover_key(feeder, handle): + """The graph key for mover.""" + return feeder.op.name + ';' + TensorHandle._get_reader_key(handle) + + +def get_session_handle(data, name=None): + """Return the handle of `data`. + + This is EXPERIMENTAL and subject to change. + + Keep `data` "in-place" in the runtime and create a handle that can be + used to retrieve `data` in a subsequent run(). + + Combined with `get_session_tensor`, we can keep a tensor produced in + one run call in place, and use it as the input in a future run call. + Below is a simple example: + + ```python + c = tf.mul(a, b) + h = tf.get_session_handle(c) + h = sess.run(h) + + p, a = tf.get_session_tensor(tf.float32) + b = tf.mul(a, 10) + c = sess.run(b, feed_dict={p: h.handle}) + ``` + + Args: + data: A tensor to be stored in the session. + name: Optional name prefix for the return tensor. + + Returns: + A scalar string tensor representing a unique handle for `data`. + + Raises: + TypeError: if `data` is not a Tensor. + """ + if not isinstance(data, ops.Tensor): + raise TypeError('`data` must be of type Tensor.') + + # Colocate this operation with data. + with ops.colocate_with(data): + return gen_data_flow_ops._get_session_handle(data, name=name) + + +def get_session_tensor(dtype, name=None): + """Get the tensor of type `dtype` by feeding a tensor handle. + + This is EXPERIMENTAL and subject to change. + + Get the value of the tensor from a tensor handle. The tensor + is produced in a previous run() and stored in the state of the + session. + + Args: + dtype: The type of the output tensor. + name: Optional name prefix for the return tensor. + + Returns: + A pair of tensors. The first is a placeholder for feeding a + tensor handle and the second is the tensor in the session state + keyed by the tensor handle. + """ + with ops.device(None): + # Commit the device when it is used the first time.
+ holder = array_ops.placeholder(dtypes.string) + _register_handle_feeder(holder.graph, holder, dtype) + tensor = gen_data_flow_ops._get_session_tensor(holder, dtype, name=name) + return (holder, tensor) + + +def delete_session_tensor(name=None): + """Delete the tensor by feeding a tensor handle. + + This is EXPERIMENTAL and subject to change. + + Delete the tensor of a given tensor handle. The tensor is produced + in a previous run() and stored in the state of the session. + + Args: + name: Optional name prefix for the return tensor. + + Returns: + A pair of graph elements. The first is a placeholder for feeding a + tensor handle and the second is a deletion operation. + """ + with ops.device(None): + # We will commit the device at the time it is used. + holder = array_ops.placeholder(dtypes.string) + deleter = gen_data_flow_ops._delete_session_tensor(holder, name=name) + return (holder, deleter) + + +def _register_handle_feeder(graph, feeder, dtype): + graph._handle_feeders[feeder.op.name] = dtype + + +def _get_handle_feeder(graph, feeder): + return graph._handle_feeders.get(feeder.op.name) + + +def _get_handle_reader(graph, handle, dtype): + """Return a read subgraph for this handle.""" + graph_key = TensorHandle._get_reader_key(handle) + result = graph._handle_readers.get(graph_key) + if result is None: + # Create reader if we haven't done it. + handle_device = TensorHandle._get_device_name(handle) + with ops.device(handle_device): + holder = array_ops.placeholder(dtypes.string) + _register_handle_feeder(holder.graph, holder, dtype) + reader = gen_data_flow_ops._get_session_tensor(holder, dtype) + result = (holder, reader) + graph._handle_readers[graph_key] = result + return result + + +def _get_handle_mover(graph, feeder, handle): + """Return a move subgraph for this pair of feeder and handle.""" + dtype = _get_handle_feeder(graph, feeder) + if dtype is None: + return None + handle_device = TensorHandle._get_device_name(handle) + if not feeder.op.device: + feeder.op._set_device(handle_device) + return None + if feeder.op.device == handle_device: + return None + # Now we know we have to move the tensor. + graph_key = TensorHandle._get_mover_key(feeder, handle) + result = graph._handle_movers.get(graph_key) + if result is None: + # Create mover if we haven't done it. + holder, reader = _get_handle_reader(graph, handle, dtype) + with ops.device(feeder.op.device): + mover = gen_data_flow_ops._get_session_handle(reader) + result = (holder, mover) + graph._handle_movers[graph_key] = result + return result + + +def _get_handle_deleter(graph, handle): + """Return a deletion subgraph for this handle.""" + graph_key = TensorHandle._get_deleter_key(handle) + result = graph._handle_deleters.get(graph_key) + if result is None: + # Create deleter if we haven't done it. 
+ handle_device = TensorHandle._get_device_name(handle) + with ops.device(handle_device): + holder = array_ops.placeholder(dtypes.string) + deleter = gen_data_flow_ops._delete_session_tensor(holder) + result = (holder, deleter) + graph._handle_deleters[graph_key] = result + return result diff --git a/tensorflow/python/ops/standard_ops.py b/tensorflow/python/ops/standard_ops.py index ace9cd2c5ca..2d61190b962 100644 --- a/tensorflow/python/ops/standard_ops.py +++ b/tensorflow/python/ops/standard_ops.py @@ -38,6 +38,7 @@ from tensorflow.python.ops.control_flow_ops import no_op from tensorflow.python.ops.control_flow_ops import tuple from tensorflow.python.ops.control_flow_ops import cond from tensorflow.python.ops.control_flow_ops import case +from tensorflow.python.ops.control_flow_ops import while_loop from tensorflow.python.ops.data_flow_ops import * from tensorflow.python.ops.functional_ops import * from tensorflow.python.ops.gradients import * @@ -52,6 +53,7 @@ from tensorflow.python.ops.parsing_ops import * from tensorflow.python.ops.partitioned_variables import * from tensorflow.python.ops.random_ops import * from tensorflow.python.ops.script_ops import py_func +from tensorflow.python.ops.session_ops import * from tensorflow.python.ops.sparse_ops import * from tensorflow.python.ops.state_ops import assign from tensorflow.python.ops.state_ops import assign_add diff --git a/tensorflow/python/ops/string_ops.py b/tensorflow/python/ops/string_ops.py index 2af7a3d6c4f..d082377bd48 100644 --- a/tensorflow/python/ops/string_ops.py +++ b/tensorflow/python/ops/string_ops.py @@ -13,21 +13,85 @@ # limitations under the License. # ============================================================================== -"""String Ops.""" +"""## Hashing + +String hashing ops take a string input tensor and map each element to an +integer. + +@@string_to_hash_bucket + +## Joining + +String joining ops concatenate elements of input string tensors to produce a new +string tensor. 
+ +@@reduce_join +""" from __future__ import absolute_import from __future__ import division from __future__ import print_function +import numpy as np + from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import tensor_util from tensorflow.python.ops import common_shapes +# pylint: disable=unused-import from tensorflow.python.ops import gen_string_ops +# pylint: enable=unused-import # go/tf-wildcard-import # pylint: disable=wildcard-import from tensorflow.python.ops.gen_string_ops import * # pylint: enable=wildcard-import ops.NoGradient("StringToHashBucket") +ops.NoGradient("ReduceJoin") ops.RegisterShape("StringToHashBucket")(common_shapes.unchanged_shape) + + +@ops.RegisterShape("ReduceJoin") +def _ReduceJoinShape(op): + """Shape function for the ReduceJoin op.""" + input_shape = op.inputs[0].get_shape() + reduction_indices = np.ravel(tensor_util.constant_value(op.inputs[1])) + keep_dims = op.get_attr("keep_dims") + + if input_shape.ndims is None: + return [tensor_shape.unknown_shape()] + + if input_shape.ndims == 0: + raise ValueError("Input string tensor cannot be a scalar.") + + true_indices = set() + for reduction_index in reduction_indices: + if reduction_index is None: + return [tensor_shape.unknown_shape()] + + if (reduction_index < -input_shape.ndims or + reduction_index >= input_shape.ndims): + raise ValueError("Invalid reduction dimension %d for input with %d " + "dimensions" % (reduction_index, input_shape.ndims)) + + true_index = reduction_index % input_shape.ndims + if true_index in true_indices: + raise ValueError("Duplicate reduction index %d." % reduction_index) + + if input_shape.dims[true_index] == 0: + raise ValueError("Cannot reduce dimension %d with size 0." % + reduction_index) + + true_indices.add(true_index) + + returned_dims = [] + for i, dim in enumerate(input_shape.dims): + if i in true_indices: + if keep_dims: + returned_dims.append(1) + else: + returned_dims.append(dim) + + return [tensor_shape.TensorShape(returned_dims)] + diff --git a/tensorflow/python/ops/tensor_array_grad.py b/tensorflow/python/ops/tensor_array_grad.py index a2a8b592671..a47898e20a0 100644 --- a/tensorflow/python/ops/tensor_array_grad.py +++ b/tensorflow/python/ops/tensor_array_grad.py @@ -62,7 +62,7 @@ def _GetGradSource(op_or_tensor): raise ValueError( "Expected op/tensor name to start with gradients (excluding scope)" ", got: %s" % op_or_tensor.name) - return "/".join(name_tokens[:grad_pos[0] + 1]) + return "/".join(name_tokens[:grad_pos[-1] + 1]) @ops.RegisterGradient("TensorArrayRead") diff --git a/tensorflow/python/ops/tensor_array_ops.py b/tensorflow/python/ops/tensor_array_ops.py index 35f63db23f2..b5bc6e2570a 100644 --- a/tensorflow/python/ops/tensor_array_ops.py +++ b/tensorflow/python/ops/tensor_array_ops.py @@ -23,6 +23,7 @@ from __future__ import print_function from tensorflow.python.framework import dtypes as _dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape +from tensorflow.python.ops import array_ops from tensorflow.python.ops import constant_op from tensorflow.python.ops import gen_data_flow_ops from tensorflow.python.ops import math_ops @@ -53,7 +54,7 @@ class TensorArray(object): """ def __init__(self, dtype, size=None, dynamic_size=None, - tensor_array_name=None, + clear_after_read=None, tensor_array_name=None, handle=None, flow=None, name=None): """Construct a new TensorArray or wrap an existing TensorArray handle. 
@@ -63,6 +64,9 @@ class TensorArray(object): Required if handle is not provided. dynamic_size: (optional) Python bool: If true, writes to the TensorArray can grow the TensorArray past its initial size. Default: False. + clear_after_read: Boolean (optional, default: True). If True, clear + TensorArray values after reading them. This disables read-many + semantics, but allows early release of memory. tensor_array_name: (optional) Python string: the name of the TensorArray. This is used when creating the TensorArray handle. If this value is set, handle should be None. @@ -89,7 +93,12 @@ class TensorArray(object): if handle is not None and dynamic_size is not None: raise ValueError("Cannot provide both a handle and dynamic_size " "at the same time") + if handle is not None and clear_after_read is not None: + raise ValueError("Cannot provide both a handle and clear_after_read " + "at the same time") + if clear_after_read is None: + clear_after_read = True dynamic_size = dynamic_size or False self._dtype = dtype @@ -97,13 +106,22 @@ class TensorArray(object): if handle is not None: self._handle = handle else: - self._handle = gen_data_flow_ops._tensor_array( - dtype=dtype, size=size, dynamic_size=dynamic_size, - tensor_array_name=tensor_array_name, name=scope) - if flow is not None: - self._flow = flow - else: - self._flow = constant_op.constant(0, dtype=_dtypes.float32) + if flow is not None: + with ops.colocate_with(flow): + self._handle = gen_data_flow_ops._tensor_array( + dtype=dtype, size=size, dynamic_size=dynamic_size, + clear_after_read=clear_after_read, + tensor_array_name=tensor_array_name, name=scope) + else: + self._handle = gen_data_flow_ops._tensor_array( + dtype=dtype, size=size, dynamic_size=dynamic_size, + clear_after_read=clear_after_read, + tensor_array_name=tensor_array_name, name=scope) + if flow is not None: + self._flow = flow + else: + with ops.colocate_with(self._handle): + self._flow = constant_op.constant(0, dtype=_dtypes.float32) @property def flow(self): @@ -120,80 +138,148 @@ class TensorArray(object): """The reference to the TensorArray.""" return self._handle - def grad(self, source, flow=None): + def grad(self, source, flow=None, name=None): # tensor_array_grad requires a flow input when forward # TensorArrays are dynamically sized. This forces the creation # of the grad TensorArray only once the final forward array's size # is fixed. if flow is None: flow = self.flow - g_handle = gen_data_flow_ops._tensor_array_grad( - handle=self._handle, source=source, flow_in=flow) - g = TensorArray(dtype=self._dtype, handle=g_handle, flow=flow) - return g + with ops.op_scope([self._handle], name, "TensorArrayGrad"): + with ops.colocate_with(self._handle): + g_handle = gen_data_flow_ops._tensor_array_grad( + handle=self._handle, source=source, flow_in=flow, name=name) + with ops.control_dependencies([g_handle]): + flow = array_ops.identity(flow, name="gradient_flow") + g = TensorArray(dtype=self._dtype, handle=g_handle, flow=flow) + return g def read(self, index, name=None): - """Read the value at location `index` in the TensorArray.""" - value = gen_data_flow_ops._tensor_array_read( - handle=self._handle, index=index, flow_in=self._flow, dtype=self._dtype, - name=name) - return value + """Read the value at location `index` in the TensorArray. + + Args: + index: 0-D. int32 tensor with the index to read from. + name: A name for the operation (optional). + + Returns: + The tensor at index `index`. 
+ """ + with ops.colocate_with(self._handle): + value = gen_data_flow_ops._tensor_array_read( + handle=self._handle, index=index, flow_in=self._flow, + dtype=self._dtype, name=name) + return value def write(self, index, value, name=None): - """Write `value` into index `index` of the TensorArray.""" - flow_out = gen_data_flow_ops._tensor_array_write( - handle=self._handle, index=index, value=value, flow_in=self._flow, - name=name) - # Size below is ignored - ta = TensorArray(dtype=self._dtype, handle=self._handle) - ta._flow = flow_out - return ta + """Write `value` into index `index` of the TensorArray. + + Args: + index: 0-D. int32 scalar with the index to write to. + value: N-D. Tensor of type `dtype`. The Tensor to write to this index. + name: A name for the operation (optional). + + Returns: + A new TensorArray object with flow that ensures the write occurs. + Use this object all for subsequent operations. + """ + with ops.colocate_with(self._handle): + flow_out = gen_data_flow_ops._tensor_array_write( + handle=self._handle, index=index, value=value, flow_in=self._flow, + name=name) + ta = TensorArray(dtype=self._dtype, handle=self._handle) + ta._flow = flow_out + return ta def pack(self, name=None): - """Return the values in the TensorArray as a packed `Tensor`.""" - value = gen_data_flow_ops._tensor_array_pack( - handle=self._handle, flow_in=self._flow, dtype=self._dtype, - name=name) + """Return the values in the TensorArray as a packed `Tensor`. - return value + All of the values must have been written and their shapes must all match. + + Args: + name: A name for the operation (optional). + + Returns: + All the tensors in the TensorArray packed into one tensor. + """ + with ops.colocate_with(self._handle): + value = gen_data_flow_ops._tensor_array_pack( + handle=self._handle, flow_in=self._flow, dtype=self._dtype, + name=name) + + return value def concat(self, name=None): - """Return the values in the TensorArray as a concatenated `Tensor`.""" - value, _ = gen_data_flow_ops._tensor_array_concat( - handle=self._handle, flow_in=self._flow, dtype=self._dtype, - name=name) - return value + """Return the values in the TensorArray as a concatenated `Tensor`. + + All of the values must have been written, their ranks must match, and + and their shapes must all match for all dimensions except the first. + + Args: + name: A name for the operation (optional). + + Returns: + All the tensors in the TensorArray concatenated into one tensor. + """ + with ops.colocate_with(self._handle): + value, _ = gen_data_flow_ops._tensor_array_concat( + handle=self._handle, flow_in=self._flow, dtype=self._dtype, + name=name) + return value def unpack(self, value, name=None): - """Pack the values of a `Tensor` in the TensorArray.""" - flow_out = gen_data_flow_ops._tensor_array_unpack( - handle=self._handle, value=value, flow_in=self._flow, - name=name) - ta = TensorArray(dtype=self._dtype, handle=self._handle) - ta._flow = flow_out - return ta + """Pack the values of a `Tensor` in the TensorArray. + + Args: + value: (N+1)-D. Tensor of type `dtype`. The Tensor to unpack. + name: A name for the operation (optional). + + Returns: + A new TensorArray object with flow that ensures the unpack occurs. + Use this object all for subsequent operations. 
+ """ + with ops.colocate_with(self._handle): + flow_out = gen_data_flow_ops._tensor_array_unpack( + handle=self._handle, value=value, flow_in=self._flow, + name=name) + ta = TensorArray(dtype=self._dtype, handle=self._handle) + ta._flow = flow_out + return ta def split(self, value, lengths, name=None): - """Split the values of a `Tensor` into the TensorArray.""" - with ops.op_scope( - [self._handle, value, lengths], name, "TensorArraySplit"): - lengths = math_ops.to_int64(lengths) - flow_out = gen_data_flow_ops._tensor_array_split( - handle=self._handle, value=value, lengths=lengths, flow_in=self._flow, - name=name) - ta = TensorArray(dtype=self._dtype, handle=self._handle) - ta._flow = flow_out - return ta + """Split the values of a `Tensor` into the TensorArray. + + Args: + value: (N+1)-D. Tensor of type `dtype`. The Tensor to split. + lengths: 1-D. int32 vector with the lengths to use when splitting + `value` along its first dimension. + name: A name for the operation (optional). + + Returns: + A new TensorArray object with flow that ensures the split occurs. + Use this object all for subsequent operations. + """ + with ops.colocate_with(self._handle): + with ops.op_scope( + [self._handle, value, lengths], name, "TensorArraySplit"): + lengths = math_ops.to_int64(lengths) + flow_out = gen_data_flow_ops._tensor_array_split( + handle=self._handle, value=value, lengths=lengths, flow_in=self._flow, + name=name) + ta = TensorArray(dtype=self._dtype, handle=self._handle) + ta._flow = flow_out + return ta def size(self, name=None): """Return the size of the TensorArray.""" - return gen_data_flow_ops._tensor_array_size( - handle=self._handle, flow_in=self.flow, name=name) + with ops.colocate_with(self._handle): + return gen_data_flow_ops._tensor_array_size( + handle=self._handle, flow_in=self.flow, name=name) def close(self, name=None): """Close the current TensorArray.""" - return gen_data_flow_ops._tensor_array_close( - handle=self._handle, name=name) + with ops.colocate_with(self._handle): + return gen_data_flow_ops._tensor_array_close( + handle=self._handle, name=name) @ops.RegisterShape("TensorArray") diff --git a/tensorflow/python/platform/benchmark.py b/tensorflow/python/platform/benchmark.py index 89061bb1e84..65c8d100d29 100644 --- a/tensorflow/python/platform/benchmark.py +++ b/tensorflow/python/platform/benchmark.py @@ -245,27 +245,13 @@ class TensorFlowBenchmark(Benchmark): name=name) -def _run_specific_benchmark(benchmark_class): - benchmark = benchmark_class() - attrs = dir(benchmark) - # Only run methods of this class whose names start with "benchmark" - for attr in attrs: - if not attr.startswith("benchmark"): - continue - benchmark_fn = getattr(benchmark, attr) - if not callable(benchmark_fn): - continue - # Call this benchmark method - benchmark_fn() - - def _run_benchmarks(regex): """Run benchmarks that match regex `regex`. This function goes through the global benchmark registry, and matches - benchmark **classe names** of the form "module.name.BenchmarkClass" to - the given regex. If a class matches, all of its benchmark methods - are run. + benchmark class and method names of the form + `module.name.BenchmarkClass.benchmarkMethod` to the given regex. + If a method matches, it is run. Args: regex: The string regular expression to match Benchmark classes against. 
@@ -275,10 +261,24 @@ def _run_benchmarks(regex): # Match benchmarks in registry against regex for benchmark in registry: benchmark_name = "%s.%s" % (benchmark.__module__, benchmark.__name__) - if re.search(regex, benchmark_name): - # Found a match + attrs = dir(benchmark) + # Don't instantiate the benchmark class unless necessary + benchmark_instance = None - _run_specific_benchmark(benchmark) + for attr in attrs: + if not attr.startswith("benchmark"): + continue + candidate_benchmark_fn = getattr(benchmark, attr) + if not callable(candidate_benchmark_fn): + continue + full_benchmark_name = "%s.%s" % (benchmark_name, attr) + if regex == "all" or re.search(regex, full_benchmark_name): + # Instantiate the class if it hasn't been instantiated + benchmark_instance = benchmark_instance or benchmark() + # Get the method tied to the class + instance_benchmark_fn = getattr(benchmark_instance, attr) + # Call the instance method + instance_benchmark_fn() def benchmarks_main(true_main): diff --git a/tensorflow/python/summary/event_accumulator.py b/tensorflow/python/summary/event_accumulator.py index e36dc6e43f3..01fa5e7718f 100644 --- a/tensorflow/python/summary/event_accumulator.py +++ b/tensorflow/python/summary/event_accumulator.py @@ -18,17 +18,15 @@ from __future__ import division from __future__ import print_function import collections +import os.path import threading from tensorflow.core.framework import graph_pb2 from tensorflow.core.protobuf.config_pb2 import RunMetadata from tensorflow.core.util.event_pb2 import SessionLog -from tensorflow.python.platform import gfile from tensorflow.python.platform import logging from tensorflow.python.summary.impl import directory_watcher -from tensorflow.python.summary.impl import event_file_loader -from tensorflow.python.summary.impl import gcs -from tensorflow.python.summary.impl import gcs_file_loader +from tensorflow.python.summary.impl import io_wrapper from tensorflow.python.summary.impl import reservoir namedtuple = collections.namedtuple @@ -86,7 +84,7 @@ STORE_EVERYTHING_SIZE_GUIDANCE = { def IsTensorFlowEventsFile(path): """Check the path name to see if it is probably a TF Events file.""" - return 'tfevents' in path + return 'tfevents' in os.path.basename(path) class EventAccumulator(object): @@ -581,20 +579,14 @@ def _GetPurgeMessage(most_recent_step, most_recent_wall_time, event_step, def _GeneratorFromPath(path): """Create an event generator for file or directory at given path string.""" - if gcs.IsGCSPath(path): - provider = directory_watcher.SequentialGCSProvider( - path, - path_filter=IsTensorFlowEventsFile) - return directory_watcher.DirectoryWatcher(provider, - gcs_file_loader.GCSFileLoader) - elif gfile.IsDirectory(path): - provider = directory_watcher.SequentialGFileProvider( - path, - path_filter=IsTensorFlowEventsFile) - return directory_watcher.DirectoryWatcher(provider, - event_file_loader.EventFileLoader) + if IsTensorFlowEventsFile(path): + return io_wrapper.CreateFileLoader(path) else: - return event_file_loader.EventFileLoader(path) + provider = directory_watcher.SequentialFileProvider( + path, + path_filter=IsTensorFlowEventsFile) + return directory_watcher.DirectoryWatcher(provider, + io_wrapper.CreateFileLoader) def _ParseFileVersion(file_version): diff --git a/tensorflow/python/summary/event_multiplexer.py b/tensorflow/python/summary/event_multiplexer.py index 6ce30710eeb..79a78252620 100644 --- a/tensorflow/python/summary/event_multiplexer.py +++ b/tensorflow/python/summary/event_multiplexer.py @@ -23,10 +23,9 @@ import 
threading import six -from tensorflow.python.platform import gfile from tensorflow.python.platform import logging from tensorflow.python.summary import event_accumulator -from tensorflow.python.summary.impl import gcs +from tensorflow.python.summary.impl import io_wrapper class EventMultiplexer(object): @@ -172,24 +171,15 @@ class EventMultiplexer(object): Returns: The `EventMultiplexer`. """ - subdirs = [] - if gcs.IsGCSPath(path): - subdirs = [ - subdir - for (subdir, files) in gcs.ListRecursively(path) - if list(filter(event_accumulator.IsTensorFlowEventsFile, files)) - ] - else: - if not gfile.Exists(path): - return # Maybe it hasn't been created yet, fail silently to retry later - if not gfile.IsDirectory(path): - raise ValueError('AddRunsFromDirectory: path exists and is not a ' - 'directory, %s' % path) - subdirs = [ - subdir - for (subdir, _, files) in gfile.Walk(path) - if list(filter(event_accumulator.IsTensorFlowEventsFile, files)) - ] + if io_wrapper.Exists(path) and not io_wrapper.IsDirectory(path): + raise ValueError('AddRunsFromDirectory: path exists and is not a ' + 'directory, %s' % path) + # ListRecursively just yields nothing if the path doesn't exist. + subdirs = [ + subdir + for (subdir, files) in io_wrapper.ListRecursively(path) + if list(filter(event_accumulator.IsTensorFlowEventsFile, files)) + ] for subdir in subdirs: logging.info('Adding events from directory %s', subdir) diff --git a/tensorflow/python/summary/impl/directory_watcher.py b/tensorflow/python/summary/impl/directory_watcher.py index 7b6a7dca0f8..e8975517f5a 100644 --- a/tensorflow/python/summary/impl/directory_watcher.py +++ b/tensorflow/python/summary/impl/directory_watcher.py @@ -18,11 +18,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import os - -from tensorflow.python.platform import gfile from tensorflow.python.platform import logging -from tensorflow.python.summary.impl import gcs +from tensorflow.python.summary.impl import io_wrapper class DirectoryWatcher(object): @@ -134,31 +131,27 @@ class DirectoryWatcher(object): return self._path_provider(self._path) -def _SequentialProvider(path_source): - """A provider that iterates over the output of a function that produces paths. +def SequentialFileProvider(directory, path_filter=lambda x: True): + """Provides the files in a directory that match the given filter. - _SequentialProvider takes in a path_source, which is a function that returns a - list of all currently available paths. _SequentialProvider returns in a path - provider (see documentation for the |DirectoryWatcher| class for the - semantics) that will return the alphabetically next path after the current one - (or the earliest path if the current path is None). - - The provider will never return a path which is alphanumerically less than the - current path; as such, if the path source provides a high path (e.g. "c") and - later doubles back and provides a low path (e.g. "b"), once the current path - was set to "c" the _SequentialProvider will ignore the "b" and never return - it. + Each time the provider is called, it returns the next path (in alphabetical + ordering) in the directory that satisfies the filter. Args: - path_source: A function that returns an iterable of paths. + directory: The directory to look for paths under. + path_filter: If present, only paths that satisfy this filter are considered. Returns: - A path provider for use with DirectoryWatcher. 
- + A function that takes in a path (or None) and returns the next path to look + at (or None if there are no more paths). """ + def _Provider(current_path): + filtered_paths = (path + for path in io_wrapper.ListDirectoryAbsolute(directory) + if path_filter(path)) next_paths = list(path - for path in path_source() + for path in filtered_paths if current_path is None or path > current_path) if next_paths: return min(next_paths) @@ -166,21 +159,3 @@ return None return _Provider - - -def SequentialGFileProvider(directory, path_filter=lambda x: True): - """Provides the files in a directory that match the given filter.""" - def _Source(): - paths = (os.path.join(directory, path) - for path in gfile.ListDirectory(directory)) - return (path for path in paths if path_filter(path)) - - return _SequentialProvider(_Source) - - -def SequentialGCSProvider(directory, path_filter=lambda x: True): - """Provides the files in a GCS directory that match the given filter.""" - def _Source(): - return (path for path in gcs.ListDirectory(directory) if path_filter(path)) - - return _SequentialProvider(_Source) diff --git a/tensorflow/python/summary/impl/directory_watcher_test.py b/tensorflow/python/summary/impl/directory_watcher_test.py index 784d585dfad..2494662b5f9 100644 --- a/tensorflow/python/summary/impl/directory_watcher_test.py +++ b/tensorflow/python/summary/impl/directory_watcher_test.py @@ -52,7 +52,7 @@ class DirectoryWatcherTest(test_util.TensorFlowTestCase): self._directory = os.path.join(self.get_temp_dir(), 'monitor_dir') os.mkdir(self._directory) self._watcher = directory_watcher.DirectoryWatcher( - directory_watcher.SequentialGFileProvider(self._directory), _ByteLoader) + directory_watcher.SequentialFileProvider(self._directory), _ByteLoader) def tearDown(self): shutil.rmtree(self._directory) @@ -111,7 +111,7 @@ class DirectoryWatcherTest(test_util.TensorFlowTestCase): self.assertWatcherYields(['a', 'c']) def testPathFilter(self): - provider = directory_watcher.SequentialGFileProvider( + provider = directory_watcher.SequentialFileProvider( self._directory, path_filter=lambda path: 'do_not_watch_me' not in path) self._watcher = directory_watcher.DirectoryWatcher(provider, _ByteLoader) diff --git a/tensorflow/python/summary/impl/gcs.py b/tensorflow/python/summary/impl/gcs.py index 390899d6b90..293886255c2 100644 --- a/tensorflow/python/summary/impl/gcs.py +++ b/tensorflow/python/summary/impl/gcs.py @@ -17,7 +17,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import os.path +import os import subprocess from tensorflow.python.platform import logging @@ -86,6 +86,25 @@ def ListRecursively(top): return tuples +def IsDirectory(path): + """Returns true if path exists and is a directory.""" + path = path.rstrip('/') + ls = ListDirectory(path) + if not ls: + # Doesn't exist. + return False + elif len(ls) == 1: + # Either it's a file (which ls-es as itself) or it's a dir with one file. + return ls[0] != path + else: + return True + + +def Exists(path): + """Returns true if path exists.""" + return bool(ListDirectory(path)) + + def IsGCSPath(path): return path.startswith(PATH_PREFIX) diff --git a/tensorflow/python/summary/impl/io_wrapper.py b/tensorflow/python/summary/impl/io_wrapper.py new file mode 100644 index 00000000000..2f1f9323b16 --- /dev/null +++ b/tensorflow/python/summary/impl/io_wrapper.py @@ -0,0 +1,93 @@ +# Copyright 2015 Google Inc. All Rights Reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Functions that wrap both gfile and gcs. + +This module is *not* intended to be a general-purpose IO wrapper library; it +only implements the operations that are necessary for loading event files. The +functions either dispatch to the gcs library or to gfile, depending on whether +the path is a GCS 'pseudo-path' (i.e., it satisfies gcs.IsGCSPath) or not. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +from tensorflow.python.platform import gfile +from tensorflow.python.summary.impl import event_file_loader +from tensorflow.python.summary.impl import gcs +from tensorflow.python.summary.impl import gcs_file_loader + + +def CreateFileLoader(path): + """Creates a file loader for the given path. + + Args: + path: A string representing either a normal path or a GCS path. + Returns: + An object with a Load() method that yields event_pb2.Event protos. + """ + if gcs.IsGCSPath(path): + return gcs_file_loader.GCSFileLoader(path) + else: + return event_file_loader.EventFileLoader(path) + + +def ListDirectoryAbsolute(directory): + """Yields all files in the given directory. The paths are absolute.""" + if gcs.IsGCSPath(directory): + return gcs.ListDirectory(directory) + else: + return (os.path.join(directory, path) + for path in gfile.ListDirectory(directory)) + + +def ListRecursively(top): + """Walks a directory tree, yielding (dir_path, file_paths) tuples. + + For each of `top` and its subdirectories, yields a tuple containing the path + to the directory and the path to each of the contained files. Note that + unlike os.walk()/gfile.Walk(), this does not list subdirectories and the file + paths are all absolute. + + If the directory does not exist, this yields nothing. + + Args: + top: A path to a directory. + Yields: + (dir_path, file_paths) tuples.
+ """ + if gcs.IsGCSPath(top): + for x in gcs.ListRecursively(top): + yield x + else: + for dir_path, _, filenames in gfile.Walk(top): + yield (dir_path, (os.path.join(dir_path, filename) + for filename in filenames)) + + +def IsDirectory(path): + """Returns true if path exists and is a directory.""" + if gcs.IsGCSPath(path): + return gcs.IsDirectory(path) + else: + return gfile.IsDirectory(path) + + +def Exists(path): + if gcs.IsGCSPath(path): + return gcs.Exists(path) + else: + return gfile.Exists(path) diff --git a/tensorflow/python/training/server_lib.py b/tensorflow/python/training/server_lib.py index 58c39e9b8b5..63a72cbd768 100644 --- a/tensorflow/python/training/server_lib.py +++ b/tensorflow/python/training/server_lib.py @@ -22,6 +22,7 @@ import six # pylint: disable=unused-import from tensorflow.core.protobuf import tensorflow_server_pb2 from tensorflow.python import pywrap_tensorflow +from tensorflow.python.framework import errors from tensorflow.python.util import compat @@ -33,13 +34,15 @@ def _make_server_def(server_or_cluster_def, job_name, task_index, protocol): `tf.train.ClusterDef` protocol buffer, or a `tf.train.ClusterSpec` object, describing the server to be defined and/or the cluster of which it is a member. - job_name: (Optional.) If not specified in `server_or_cluster_def`, - specifies the name of the job of which the server is a member. - task_index: (Optional.) If not specified in `server_or_cluster_def`, - specifies the task index of the server in its job. - protocol: (Optional.) If not specified in `server_or_cluster_def`, - specifies the protocol to be used by the server. Acceptable - values include `"grpc"`. + job_name: (Optional.) Specifies the name of the job of which the server + is a member. Defaults to the value in `server_or_cluster_def`, if + specified. + task_index: (Optional.) Specifies the task index of the server in its job. + Defaults to the value in `server_or_cluster_def`, if specified. Otherwise + defaults to 0 if the server's job has only one task. + protocol: (Optional.) Specifies the protocol to be used by the server. + Acceptable values include `"grpc"`. Defaults to the value in + `server_or_cluster_def`, if specified. Otherwise defaults to `"grpc"`. Returns: A `tf.train.ServerDef`. @@ -109,26 +112,33 @@ class Server(object): """Creates a new server with the given definition. The `job_name`, `task_index`, and `protocol` arguments are optional, and - override any information also provided in `server_or_cluster_def`. + override any information provided in `server_or_cluster_def`. Args: server_or_cluster_def: A `tf.train.ServerDef` or `tf.train.ClusterDef` protocol buffer, or a `tf.train.ClusterSpec` object, describing the server to be created and/or the cluster of which it is a member. - job_name: (Optional.) If not specified in `server_or_cluster_def`, - specifies the name of the job of which this server is a member. - task_index: (Optional.) If not specified in `server_or_cluster_def`, - specifies the task index of this server in its job. - protocol: (Optional.) If not specified in `server_or_cluster_def`, - specifies the protocol to be used by this server. Acceptable - values include `"grpc"`. + job_name: (Optional.) Specifies the name of the job of which the server + is a member. Defaults to the value in `server_or_cluster_def`, if + specified. + task_index: (Optional.) Specifies the task index of the server in its + job. Defaults to the value in `server_or_cluster_def`, if specified. 
+ Otherwise defaults to 0 if the server's job has only one task. + protocol: (Optional.) Specifies the protocol to be used by the server. + Acceptable values include `"grpc"`. Defaults to the value in + `server_or_cluster_def`, if specified. Otherwise defaults to `"grpc"`. start: (Optional.) Boolean, indicating whether to start the server after creating it. Defaults to `True`. """ server_def = _make_server_def(server_or_cluster_def, job_name, task_index, protocol) - self._server = pywrap_tensorflow.NewServer(server_def.SerializeToString()) + try: + self._server = pywrap_tensorflow.NewServer(server_def.SerializeToString()) + except pywrap_tensorflow.StatusNotOK as e: + # pylint: disable=protected-access + raise errors._make_specific_exception(None, None, e.error_message, e.code) + # pylint: enable=protected-access if start: self.start() @@ -260,7 +270,7 @@ class ClusterSpec(object): Returns: A list of strings, corresponding to the network addresses of tasks in - the given job. + the given job, ordered by task index. Raises: ValueError: If `job_name` does not name a job in this cluster. @@ -296,3 +306,4 @@ class ClusterSpec(object): raise TypeError( "Task address %r must be bytes or unicode" % task_address) job_def.tasks[i] = task_address + diff --git a/tensorflow/python/training/server_lib_test.py b/tensorflow/python/training/server_lib_test.py index 3b9614472f8..c07f5220dc6 100644 --- a/tensorflow/python/training/server_lib_test.py +++ b/tensorflow/python/training/server_lib_test.py @@ -79,6 +79,12 @@ class GrpcServerTest(tf.test.TestCase): self.assertEqual(0.5, min_val) self.assertEqual(0.5, max_val) + def testInvalidHostname(self): + with self.assertRaisesRegexp(tf.errors.InvalidArgumentError, "port"): + _ = tf.train.Server({"local": ["localhost"]}, + job_name="local", + task_index=0) + class ServerDefTest(tf.test.TestCase): diff --git a/tensorflow/tensorboard/TAG b/tensorflow/tensorboard/TAG index 60d3b2f4a4c..b6a7d89c68e 100644 --- a/tensorflow/tensorboard/TAG +++ b/tensorflow/tensorboard/TAG @@ -1 +1 @@ -15 +16 diff --git a/tensorflow/tensorboard/bower.json b/tensorflow/tensorboard/bower.json index 39b50cc2aa3..503fd25e4f6 100644 --- a/tensorflow/tensorboard/bower.json +++ b/tensorflow/tensorboard/bower.json @@ -46,7 +46,7 @@ "iron-collapse": "PolymerElements/iron-collapse#1.0.6", "iron-dropdown": "PolymerElements/iron-dropdown#1.3.0", "iron-fit-behavior": "PolymerElements/iron-fit-behavior#1.0.6", - "iron-flex-layout": "PolymerElements/iron-flex-layout#1.2.3", + "iron-flex-layout": "PolymerElements/iron-flex-layout#1.3.0", "iron-form-element-behavior": "PolymerElements/iron-form-element-behavior#1.0.6", "iron-icon": "PolymerElements/iron-icon#1.0.8", "iron-icons": "PolymerElements/iron-icons#1.1.3", @@ -73,7 +73,7 @@ "paper-material": "PolymerElements/paper-material#1.0.6", "paper-menu": "PolymerElements/paper-menu#1.2.2", "paper-menu-button": "PolymerElements/paper-menu-button#1.0.4", - "paper-progress": "PolymerElements/paper-progress#1.0.8", + "paper-progress": "PolymerElements/paper-progress#1.0.9", "paper-radio-button": "PolymerElements/paper-radio-button#1.1.1", "paper-radio-group": "PolymerElements/paper-radio-group#1.0.9", "paper-ripple": "PolymerElements/paper-ripple#1.0.5", @@ -117,7 +117,7 @@ "iron-collapse": "1.0.6", "iron-dropdown": "1.3.0", "iron-fit-behavior": "1.0.6", - "iron-flex-layout": "1.2.3", + "iron-flex-layout": "1.3.0", "iron-form-element-behavior": "1.0.6", "iron-icon": "1.0.8", "iron-icons": "1.1.3", @@ -144,7 +144,7 @@ "paper-material": "1.0.6", 
"paper-menu": "1.2.2", "paper-menu-button": "1.0.4", - "paper-progress": "1.0.8", + "paper-progress": "1.0.9", "paper-radio-button": "1.1.1", "paper-radio-group": "1.0.9", "paper-ripple": "1.0.5", diff --git a/tensorflow/tensorboard/components/index.html b/tensorflow/tensorboard/components/index.html index 8b0f87d3504..934252c1fdf 100644 --- a/tensorflow/tensorboard/components/index.html +++ b/tensorflow/tensorboard/components/index.html @@ -11,7 +11,7 @@ } - + TensorBoard diff --git a/tensorflow/tensorboard/components/tf-dashboard-common/tensorboard-color.html b/tensorflow/tensorboard/components/tf-dashboard-common/tensorboard-color.html index 03f53a746f3..3cfba6114d0 100644 --- a/tensorflow/tensorboard/components/tf-dashboard-common/tensorboard-color.html +++ b/tensorflow/tensorboard/components/tf-dashboard-common/tensorboard-color.html @@ -2,8 +2,8 @@ + diff --git a/tensorflow/tensorboard/dist/tf-tensorboard.html b/tensorflow/tensorboard/dist/tf-tensorboard.html index 15ede2b7553..a9cc4e419e0 100644 --- a/tensorflow/tensorboard/dist/tf-tensorboard.html +++ b/tensorflow/tensorboard/dist/tf-tensorboard.html @@ -1,4 +1,22 @@ -// AUTOGENERATED FILE - DO NOT MODIFY + +